diff options
author | Pawel Jakub Dawidek <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
---|---|---|
committer | Pawel Jakub Dawidek <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
commit | 1ba4a712dde6e6c613fc411a96958b4ade67de4c (patch) | |
tree | 81b89fa4ac6467771d5aa291a97f4665981a6108 /sys | |
parent | 8fc061164d74a4c9775f39da3c0b5d02112866c8 (diff) | |
download | src-1ba4a712dde6e6c613fc411a96958b4ade67de4c.tar.gz src-1ba4a712dde6e6c613fc411a96958b4ade67de4c.zip |
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes:
- Delegated Administration
Allows regular users to perform ZFS operations, like file system
creation, snapshot creation, etc.
- L2ARC
Level 2 cache for ZFS - allows to use additional disks for cache.
Huge performance improvements mostly for random read of mostly
static content.
- slog
Allow to use additional disks for ZFS Intent Log to speed up
operations like fsync(2).
- vfs.zfs.super_owner
Allows regular users to perform privileged operations on files stored
on ZFS file systems owned by him. Very careful with this one.
- chflags(2)
Not all the flags are supported. This still needs work.
- ZFSBoot
Support to boot off of ZFS pool. Not finished, AFAIK.
Submitted by: dfr
- Snapshot properties
- New failure modes
Before if write requested failed, system paniced. Now one
can select from one of three failure modes:
- panic - panic on write error
- wait - wait for disk to reappear
- continue - serve read requests if possible, block write requests
- Refquota, refreservation properties
Just quota and reservation properties, but don't count space consumed
by children file systems, clones and snapshots.
- Sparse volumes
ZVOLs that don't reserve space in the pool.
- External attributes
Compatible with extattr(2).
- NFSv4-ACLs
Not sure about the status, might not be complete yet.
Submitted by: trasz
- Creation-time properties
- Regression tests for zpool(8) command.
Obtained from: OpenSolaris
Notes
Notes:
svn path=/head/; revision=185029
Diffstat (limited to 'sys')
228 files changed, 77058 insertions, 11097 deletions
diff --git a/sys/boot/Makefile b/sys/boot/Makefile index 1af14577652e..27cb7e34308a 100644 --- a/sys/boot/Makefile +++ b/sys/boot/Makefile @@ -26,6 +26,10 @@ SUBDIR+= ofw SUBDIR+= uboot .endif +.if defined(LOADER_ZFS_SUPPORT) +SUBDIR+= zfs +.endif + # Pick the machine-dependent subdir based on the target architecture. ADIR= ${MACHINE:S/amd64/i386/:S/sun4v/sparc64/} .if exists(${.CURDIR}/${ADIR}/.) diff --git a/sys/boot/common/bootstrap.h b/sys/boot/common/bootstrap.h index 57982d1e9853..5f0848089f43 100644 --- a/sys/boot/common/bootstrap.h +++ b/sys/boot/common/bootstrap.h @@ -43,6 +43,7 @@ struct devdesc #define DEVT_DISK 1 #define DEVT_NET 2 #define DEVT_CD 3 +#define DEVT_ZFS 4 int d_unit; }; diff --git a/sys/boot/i386/Makefile b/sys/boot/i386/Makefile index b89222d85b4d..6af86425e7f7 100644 --- a/sys/boot/i386/Makefile +++ b/sys/boot/i386/Makefile @@ -1,7 +1,7 @@ # $FreeBSD$ -SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot kgzldr \ - libi386 libfirewire loader +SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot zfsboot \ + kgzldr libi386 libfirewire loader # special boot programs, 'self-extracting boot2+loader' SUBDIR+= pxeldr diff --git a/sys/boot/i386/libi386/bootinfo32.c b/sys/boot/i386/libi386/bootinfo32.c index 6b517c5d3707..d43442783518 100644 --- a/sys/boot/i386/libi386/bootinfo32.c +++ b/sys/boot/i386/libi386/bootinfo32.c @@ -183,6 +183,7 @@ bi_load32(char *args, int *howtop, int *bootdevp, vm_offset_t *bip, vm_offset_t break; case DEVT_NET: + case DEVT_ZFS: break; default: diff --git a/sys/boot/i386/libi386/devicename.c b/sys/boot/i386/libi386/devicename.c index e1035aa33d05..79a562b62283 100644 --- a/sys/boot/i386/libi386/devicename.c +++ b/sys/boot/i386/libi386/devicename.c @@ -167,6 +167,7 @@ i386_parsedev(struct i386_devdesc **dev, const char *devspec, const char **path) case DEVT_CD: case DEVT_NET: + case DEVT_ZFS: unit = 0; if (*np && (*np != ':')) { @@ -238,6 +239,7 @@ i386_fmtdev(void *vdev) break; case DEVT_NET: + case DEVT_ZFS: sprintf(buf, "%s%d:", dev->d_dev->dv_name, dev->d_unit); break; } diff --git a/sys/boot/i386/loader/Makefile b/sys/boot/i386/loader/Makefile index df2ccc0f15ad..79aceca277c6 100644 --- a/sys/boot/i386/loader/Makefile +++ b/sys/boot/i386/loader/Makefile @@ -17,6 +17,12 @@ CFLAGS+= -DLOADER_FIREWIRE_SUPPORT LIBFIREWIRE= ${.OBJDIR}/../libfirewire/libfirewire.a .endif +# Put LOADER_ZFS_SUPPORT=yes in /etc/make.conf for ZFS support +.if defined(LOADER_ZFS_SUPPORT) +CFLAGS+= -DLOADER_ZFS_SUPPORT +LIBZFS= ${.OBJDIR}/../../zfs/libzfsboot.a +.endif + # Enable PXE TFTP or NFS support, not both. .if defined(LOADER_TFTP_SUPPORT) CFLAGS+= -DLOADER_TFTP_SUPPORT @@ -98,8 +104,8 @@ FILES+= loader.rc # XXX crt0.o needs to be first for pxeboot(8) to work OBJS= ${BTXCRT} -DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} ${LIBSTAND} -LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} -lstand +DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} ${LIBSTAND} +LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} -lstand .include <bsd.prog.mk> diff --git a/sys/boot/i386/loader/conf.c b/sys/boot/i386/loader/conf.c index 245f960ed210..05c9a9e95f3c 100644 --- a/sys/boot/i386/loader/conf.c +++ b/sys/boot/i386/loader/conf.c @@ -50,6 +50,10 @@ __FBSDID("$FreeBSD$"); extern struct devsw fwohci; #endif +#if defined(LOADER_ZFS_SUPPORT) +extern struct devsw zfs_dev; +#endif + /* Exported for libstand */ struct devsw *devsw[] = { &bioscd, @@ -60,15 +64,25 @@ struct devsw *devsw[] = { #if defined(LOADER_FIREWIRE_SUPPORT) &fwohci, #endif +#if defined(LOADER_ZFS_SUPPORT) + &zfs_dev, +#endif NULL }; +#if defined(LOADER_ZFS_SUPPORT) +extern struct fs_ops zfs_fsops; +#endif + struct fs_ops *file_system[] = { &ufs_fsops, &ext2fs_fsops, &dosfs_fsops, &cd9660_fsops, &splitfs_fsops, +#if defined(LOADER_ZFS_SUPPORT) + &zfs_fsops, +#endif #ifdef LOADER_GZIP_SUPPORT &gzipfs_fsops, #endif diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c index 5b23670d12d0..cac28aef9763 100644 --- a/sys/boot/i386/loader/main.c +++ b/sys/boot/i386/loader/main.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #define KARGS_FLAGS_CD 0x1 #define KARGS_FLAGS_PXE 0x2 +#define KARGS_FLAGS_ZFS 0x4 /* Arguments passed in from the boot1/boot2 loader */ static struct @@ -51,8 +52,13 @@ static struct u_int32_t howto; u_int32_t bootdev; u_int32_t bootflags; - u_int32_t pxeinfo; - u_int32_t res2; + union { + struct { + u_int32_t pxeinfo; + u_int32_t res2; + }; + uint64_t zfspool; + }; u_int32_t bootinfo; } *kargs; @@ -96,7 +102,7 @@ main(void) */ bios_getmem(); -#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT) +#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT) || defined(LOADER_ZFS_SUPPORT) heap_top = PTOV(memtop_copyin); memtop_copyin -= 0x300000; heap_bottom = PTOV(memtop_copyin); @@ -145,6 +151,14 @@ main(void) bc_add(initial_bootdev); } + archsw.arch_autoload = i386_autoload; + archsw.arch_getdev = i386_getdev; + archsw.arch_copyin = i386_copyin; + archsw.arch_copyout = i386_copyout; + archsw.arch_readin = i386_readin; + archsw.arch_isainb = isa_inb; + archsw.arch_isaoutb = isa_outb; + /* * March through the device switch probing for things. */ @@ -172,14 +186,6 @@ main(void) bios_getsmap(); - archsw.arch_autoload = i386_autoload; - archsw.arch_getdev = i386_getdev; - archsw.arch_copyin = i386_copyin; - archsw.arch_copyout = i386_copyout; - archsw.arch_readin = i386_readin; - archsw.arch_isainb = isa_inb; - archsw.arch_isaoutb = isa_outb; - interact(); /* doesn't return */ /* if we ever get here, it is an error */ @@ -252,6 +258,29 @@ extract_currdev(void) i386_setcurrdev, env_nounset); env_setenv("loaddev", EV_VOLATILE, i386_fmtdev(&new_currdev), env_noset, env_nounset); + +#ifdef LOADER_ZFS_SUPPORT + /* + * If we were started from a ZFS-aware boot2, we can work out + * which ZFS pool we are booting from. + */ + if (kargs->bootflags & KARGS_FLAGS_ZFS) { + /* + * Dig out the pool guid and convert it to a 'unit number' + */ + uint64_t guid; + int unit; + char devname[32]; + extern int zfs_guid_to_unit(uint64_t); + + guid = kargs->zfspool; + unit = zfs_guid_to_unit(guid); + if (unit >= 0) { + sprintf(devname, "zfs%d", unit); + setenv("currdev", devname, 1); + } + } +#endif } COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot); diff --git a/sys/boot/i386/zfsboot/Makefile b/sys/boot/i386/zfsboot/Makefile new file mode 100644 index 000000000000..41f1672c82ef --- /dev/null +++ b/sys/boot/i386/zfsboot/Makefile @@ -0,0 +1,108 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../boot2 + +FILES= zfsboot + +NM?= nm + +# A value of 0x80 enables LBA support. +BOOT_BOOT1_FLAGS?= 0x80 + +BOOT_COMCONSOLE_PORT?= 0x3f8 +BOOT_COMCONSOLE_SPEED?= 9600 +B2SIOFMT?= 0x3 + +REL1= 0x700 +ORG1= 0x7c00 +ORG2= 0x2000 + +CFLAGS= -Os -g \ + -fno-guess-branch-probability \ + -fomit-frame-pointer \ + -fno-unit-at-a-time \ + -mno-align-long-strings \ + -mrtd \ + -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 \ + -DBOOT2 \ + -DFLAGS=${BOOT_BOOT1_FLAGS} \ + -DSIOPRT=${BOOT_COMCONSOLE_PORT} \ + -DSIOFMT=${B2SIOFMT} \ + -DSIOSPD=${BOOT_COMCONSOLE_SPEED} \ + -I${.CURDIR}/../../zfs \ + -I${.CURDIR}/../../../cddl/boot/zfs \ + -I${.CURDIR}/../btx/lib -I. \ + -I${.CURDIR}/../boot2 \ + -Wall -Waggregate-return -Wbad-function-cast -Wcast-align \ + -Wmissing-declarations -Wmissing-prototypes -Wnested-externs \ + -Wpointer-arith -Wshadow -Wstrict-prototypes -Wwrite-strings \ + -Winline --param max-inline-insns-single=100 + +LDFLAGS=-static -N --gc-sections + +# Pick up ../Makefile.inc early. +.include <bsd.init.mk> + +CLEANFILES= zfsboot + +zfsboot: zfsboot1 zfsboot2 + cat zfsboot1 zfsboot2 > zfsboot + +CLEANFILES+= zfsboot1 zfsldr.out zfsldr.o + +zfsboot1: zfsldr.out + objcopy -S -O binary zfsldr.out ${.TARGET} + +zfsldr.out: zfsldr.o + ${LD} ${LDFLAGS} -e start -Ttext ${ORG1} -o ${.TARGET} zfsldr.o + +CLEANFILES+= zfsboot2 zfsboot.ld zfsboot.ldr zfsboot.bin zfsboot.out \ + zfsboot.o zfsboot.s zfsboot.s.tmp zfsboot.h sio.o + +# We currently allow 32768 bytes for zfsboot - in practice it could be +# any size up to 3.5Mb but keeping it fixed size simplifies zfsldr. +# +BOOT2SIZE= 32768 + +zfsboot2: zfsboot.ld + @set -- `ls -l zfsboot.ld`; x=$$((${BOOT2SIZE}-$$5)); \ + echo "$$x bytes available"; test $$x -ge 0 + dd if=zfsboot.ld of=${.TARGET} obs=${BOOT2SIZE} conv=osync + +zfsboot.ld: zfsboot.ldr zfsboot.bin ${BTXKERN} + btxld -v -E ${ORG2} -f bin -b ${BTXKERN} -l zfsboot.ldr \ + -o ${.TARGET} -P 1 zfsboot.bin + +zfsboot.ldr: + cp /dev/null ${.TARGET} + +zfsboot.bin: zfsboot.out + objcopy -S -O binary zfsboot.out ${.TARGET} + +zfsboot.out: ${BTXCRT} zfsboot.o sio.o + ${LD} ${LDFLAGS} -Ttext ${ORG2} -o ${.TARGET} ${.ALLSRC} + +zfsboot.o: zfsboot.s + +SRCS= zfsboot.c zfsboot.h + +zfsboot.s: zfsboot.c zfsboot.h ${.CURDIR}/../../zfs/zfsimpl.c + ${CC} ${CFLAGS} -S -o zfsboot.s.tmp ${.CURDIR}/zfsboot.c + sed -e '/align/d' -e '/nop/d' < zfsboot.s.tmp > zfsboot.s + rm -f zfsboot.s.tmp + +zfsboot.h: zfsldr.out + ${NM} -t d ${.ALLSRC} | awk '/([0-9])+ T xread/ \ + { x = $$1 - ORG1; \ + printf("#define XREADORG %#x\n", REL1 + x) }' \ + ORG1=`printf "%d" ${ORG1}` \ + REL1=`printf "%d" ${REL1}` > ${.TARGET} + +.if ${MACHINE_ARCH} == "amd64" +beforedepend zfsboot.s: machine +CLEANFILES+= machine +machine: + ln -sf ${.CURDIR}/../../../i386/include machine +.endif + +.include <bsd.prog.mk> diff --git a/sys/boot/i386/zfsboot/zfsboot.c b/sys/boot/i386/zfsboot/zfsboot.c new file mode 100644 index 000000000000..9b0a465cbac3 --- /dev/null +++ b/sys/boot/i386/zfsboot/zfsboot.c @@ -0,0 +1,944 @@ +/*- + * Copyright (c) 1998 Robert Nordier + * All rights reserved. + * + * Redistribution and use in source and binary forms are freely + * permitted provided that the above copyright notice and this + * paragraph and the following disclaimer are duplicated in all + * such forms. + * + * This software is provided "AS IS" and without any express or + * implied warranties, including, without limitation, the implied + * warranties of merchantability and fitness for a particular + * purpose. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/diskmbr.h> +#include <sys/reboot.h> +#include <sys/queue.h> + +#include <machine/bootinfo.h> +#include <machine/elf.h> + +#include <stdarg.h> +#include <stddef.h> + +#include <a.out.h> + +#include <btxv86.h> + +#include "zfsboot.h" +#include "lib.h" + +#define IO_KEYBOARD 1 +#define IO_SERIAL 2 + +#define SECOND 18 /* Circa that many ticks in a second. */ + +#define RBX_ASKNAME 0x0 /* -a */ +#define RBX_SINGLE 0x1 /* -s */ +/* 0x2 is reserved for log2(RB_NOSYNC). */ +/* 0x3 is reserved for log2(RB_HALT). */ +/* 0x4 is reserved for log2(RB_INITNAME). */ +#define RBX_DFLTROOT 0x5 /* -r */ +#define RBX_KDB 0x6 /* -d */ +/* 0x7 is reserved for log2(RB_RDONLY). */ +/* 0x8 is reserved for log2(RB_DUMP). */ +/* 0x9 is reserved for log2(RB_MINIROOT). */ +#define RBX_CONFIG 0xa /* -c */ +#define RBX_VERBOSE 0xb /* -v */ +#define RBX_SERIAL 0xc /* -h */ +#define RBX_CDROM 0xd /* -C */ +/* 0xe is reserved for log2(RB_POWEROFF). */ +#define RBX_GDB 0xf /* -g */ +#define RBX_MUTE 0x10 /* -m */ +/* 0x11 is reserved for log2(RB_SELFTEST). */ +/* 0x12 is reserved for boot programs. */ +/* 0x13 is reserved for boot programs. */ +#define RBX_PAUSE 0x14 /* -p */ +#define RBX_QUIET 0x15 /* -q */ +#define RBX_NOINTR 0x1c /* -n */ +/* 0x1d is reserved for log2(RB_MULTIPLE) and is just misnamed here. */ +#define RBX_DUAL 0x1d /* -D */ +/* 0x1f is reserved for log2(RB_BOOTINFO). */ + +/* pass: -a, -s, -r, -d, -c, -v, -h, -C, -g, -m, -p, -D */ +#define RBX_MASK (OPT_SET(RBX_ASKNAME) | OPT_SET(RBX_SINGLE) | \ + OPT_SET(RBX_DFLTROOT) | OPT_SET(RBX_KDB ) | \ + OPT_SET(RBX_CONFIG) | OPT_SET(RBX_VERBOSE) | \ + OPT_SET(RBX_SERIAL) | OPT_SET(RBX_CDROM) | \ + OPT_SET(RBX_GDB ) | OPT_SET(RBX_MUTE) | \ + OPT_SET(RBX_PAUSE) | OPT_SET(RBX_DUAL)) + +/* Hint to loader that we came from ZFS */ +#define KARGS_FLAGS_ZFS 0x4 + +#define PATH_CONFIG "/boot.config" +#define PATH_BOOT3 "/boot/loader" +#define PATH_KERNEL "/boot/kernel/kernel" + +#define ARGS 0x900 +#define NOPT 14 +#define NDEV 3 +#define MEM_BASE 0x12 +#define MEM_EXT 0x15 +#define V86_CY(x) ((x) & 1) +#define V86_ZR(x) ((x) & 0x40) + +#define DRV_HARD 0x80 +#define DRV_MASK 0x7f + +#define TYPE_AD 0 +#define TYPE_DA 1 +#define TYPE_MAXHARD TYPE_DA +#define TYPE_FD 2 + +#define OPT_SET(opt) (1 << (opt)) +#define OPT_CHECK(opt) ((opts) & OPT_SET(opt)) + +extern uint32_t _end; + +static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */ +static const unsigned char flags[NOPT] = { + RBX_DUAL, + RBX_SERIAL, + RBX_ASKNAME, + RBX_CDROM, + RBX_CONFIG, + RBX_KDB, + RBX_GDB, + RBX_MUTE, + RBX_NOINTR, + RBX_PAUSE, + RBX_QUIET, + RBX_DFLTROOT, + RBX_SINGLE, + RBX_VERBOSE +}; + +static const char *const dev_nm[NDEV] = {"ad", "da", "fd"}; +static const unsigned char dev_maj[NDEV] = {30, 4, 2}; + +struct dsk { + unsigned drive; + unsigned type; + unsigned unit; + unsigned slice; + unsigned part; + unsigned start; + int init; +}; +static char cmd[512]; +static char kname[1024]; +static uint32_t opts; +static int comspeed = SIOSPD; +static struct bootinfo bootinfo; +static uint32_t bootdev; +static uint8_t ioctrl = IO_KEYBOARD; + +/* Buffers that must not span a 64k boundary. */ +#define READ_BUF_SIZE 8192 +struct dmadat { + char rdbuf[READ_BUF_SIZE]; /* for reading large things */ + char secbuf[READ_BUF_SIZE]; /* for MBR/disklabel */ +}; +static struct dmadat *dmadat; + +void exit(int); +static void load(void); +static int parse(void); +static void printf(const char *,...); +static void putchar(int); +static uint32_t memsize(void); +static int drvread(struct dsk *, void *, unsigned, unsigned); +static int keyhit(unsigned); +static int xputc(int); +static int xgetc(int); +static int getc(int); + +static void memcpy(void *, const void *, int); +static void +memcpy(void *dst, const void *src, int len) +{ + const char *s = src; + char *d = dst; + + while (len--) + *d++ = *s++; +} + +static void +strcpy(char *dst, const char *src) +{ + while (*src) + *dst++ = *src++; + *dst++ = 0; +} + +static void +strcat(char *dst, const char *src) +{ + while (*dst) + dst++; + while (*src) + *dst++ = *src++; + *dst++ = 0; +} + +static int +strcmp(const char *s1, const char *s2) +{ + for (; *s1 == *s2 && *s1; s1++, s2++); + return (unsigned char)*s1 - (unsigned char)*s2; +} + +static const char * +strchr(const char *s, char ch) +{ + for (; *s; s++) + if (*s == ch) + return s; + return 0; +} + +static int +memcmp(const void *p1, const void *p2, size_t n) +{ + const char *s1 = (const char *) p1; + const char *s2 = (const char *) p2; + for (; n > 0 && *s1 == *s2; s1++, s2++, n--); + if (n) + return (unsigned char)*s1 - (unsigned char)*s2; + else + return 0; +} + +static void +memset(void *p, char val, size_t n) +{ + char *s = (char *) p; + while (n--) + *s++ = val; +} + +static void * +malloc(size_t n) +{ + static char *heap_next; + static char *heap_end; + + if (!heap_next) { + heap_next = (char *) dmadat + sizeof(*dmadat); + heap_end = (char *) (640*1024); + } + + char *p = heap_next; + if (p + n > heap_end) { + printf("malloc failure\n"); + for (;;) + ; + return 0; + } + heap_next += n; + return p; +} + +static size_t +strlen(const char *s) +{ + size_t len = 0; + while (*s++) + len++; + return len; +} + +static char * +strdup(const char *s) +{ + char *p = malloc(strlen(s) + 1); + strcpy(p, s); + return p; +} + +#include "zfsimpl.c" + +/* + * Read from a dnode (which must be from a ZPL filesystem). + */ +static int +zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size) +{ + const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus; + size_t n; + int rc; + + n = size; + if (*offp + n > zp->zp_size) + n = zp->zp_size - *offp; + + rc = dnode_read(spa, dnode, *offp, start, n); + if (rc) + return (-1); + *offp += n; + + return (n); +} + +/* + * Current ZFS pool + */ +spa_t *spa; + +/* + * A wrapper for dskread that doesn't have to worry about whether the + * buffer pointer crosses a 64k boundary. + */ +static int +vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) +{ + char *p; + unsigned int lba, nb; + struct dsk *dsk = (struct dsk *) priv; + + if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1))) + return -1; + + p = buf; + lba = off / DEV_BSIZE; + while (bytes > 0) { + nb = bytes / DEV_BSIZE; + if (nb > READ_BUF_SIZE / DEV_BSIZE) + nb = READ_BUF_SIZE / DEV_BSIZE; + if (drvread(dsk, dmadat->rdbuf, lba, nb)) + return -1; + memcpy(p, dmadat->rdbuf, nb * DEV_BSIZE); + p += nb * DEV_BSIZE; + lba += nb; + bytes -= nb * DEV_BSIZE; + } + + return 0; +} + +static int +xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte) +{ + if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) { + printf("Invalid %s\n", "format"); + return -1; + } + return 0; +} + +static inline uint32_t +memsize(void) +{ + v86.addr = MEM_EXT; + v86.eax = 0x8800; + v86int(); + return v86.eax; +} + +static inline void +getstr(void) +{ + char *s; + int c; + + s = cmd; + for (;;) { + switch (c = xgetc(0)) { + case 0: + break; + case '\177': + case '\b': + if (s > cmd) { + s--; + printf("\b \b"); + } + break; + case '\n': + case '\r': + *s = 0; + return; + default: + if (s - cmd < sizeof(cmd) - 1) + *s++ = c; + putchar(c); + } + } +} + +static inline void +putc(int c) +{ + v86.addr = 0x10; + v86.eax = 0xe00 | (c & 0xff); + v86.ebx = 0x7; + v86int(); +} + +/* + * Try to detect a device supported by the legacy int13 BIOS + */ +static int +int13probe(int drive) +{ + v86.ctl = V86_FLAGS; + v86.addr = 0x13; + v86.eax = 0x800; + v86.edx = drive; + v86int(); + + if (!(v86.efl & 0x1) && /* carry clear */ + ((v86.edx & 0xff) != (drive & DRV_MASK))) { /* unit # OK */ + if ((v86.ecx & 0x3f) == 0) { /* absurd sector size */ + return(0); /* skip device */ + } + return (1); + } + return(0); +} + +static void +probe_drive(struct dsk *dsk, spa_t **spap) +{ + struct dos_partition *dp; + char *sec; + unsigned i; + + if (!int13probe(dsk->drive)) + return; + + /* + * If we find a vdev on the whole disk, stop here. Otherwise dig + * out the MBR and probe each slice in turn for a vdev. + */ + if (vdev_probe(vdev_read, dsk, spap) == 0) + return; + + sec = dmadat->secbuf; + dsk->start = 0; + if (drvread(dsk, sec, DOSBBSECTOR, 1)) + return; + dp = (void *)(sec + DOSPARTOFF); + + for (i = 0; i < NDOSPART; i++) { + if (!dp[i].dp_typ) + continue; + dsk->start = dp[i].dp_start; + if (vdev_probe(vdev_read, dsk, spap) == 0) { + /* + * We record the first pool we find (we will try to boot + * from that one. + */ + spap = 0; + + /* + * This slice had a vdev. We need a new dsk structure now + * sice the vdev now owns this one. + */ + struct dsk *newdsk; + newdsk = malloc(sizeof(struct dsk)); + *newdsk = *dsk; + dsk = newdsk; + } + } +} + +int +main(void) +{ + int autoboot, i; + dnode_phys_t dn; + off_t off; + struct dsk *dsk; + + dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base); + v86.ctl = V86_FLAGS; + + dsk = malloc(sizeof(struct dsk)); + dsk->drive = *(uint8_t *)PTOV(ARGS); + dsk->type = dsk->drive & DRV_HARD ? TYPE_AD : TYPE_FD; + dsk->unit = dsk->drive & DRV_MASK; + dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1; + dsk->part = 0; + dsk->start = 0; + dsk->init = 0; + + bootinfo.bi_version = BOOTINFO_VERSION; + bootinfo.bi_size = sizeof(bootinfo); + bootinfo.bi_basemem = 0; /* XXX will be filled by loader or kernel */ + bootinfo.bi_extmem = memsize(); + bootinfo.bi_memsizes_valid++; + bootinfo.bi_bios_dev = dsk->drive; + + bootdev = MAKEBOOTDEV(dev_maj[dsk->type], + dsk->slice, dsk->unit, dsk->part), + + /* Process configuration file */ + + autoboot = 1; + + zfs_init(); + + /* + * Probe the boot drive first - we will try to boot from whatever + * pool we find on that drive. + */ + probe_drive(dsk, &spa); + + /* + * Probe the rest of the drives that the bios knows about. This + * will find any other available pools and it may fill in missing + * vdevs for the boot pool. + */ + for (i = 0; i < 4; i++) { + if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS)) + continue; + + dsk = malloc(sizeof(struct dsk)); + dsk->drive = i | DRV_HARD; + dsk->type = dsk->drive & TYPE_AD; + dsk->unit = i; + dsk->slice = 0; + dsk->part = 0; + dsk->start = 0; + dsk->init = 0; + probe_drive(dsk, 0); + } + + /* + * If we didn't find a pool on the boot drive, default to the + * first pool we found, if any. + */ + if (!spa) { + spa = STAILQ_FIRST(&zfs_pools); + if (!spa) { + printf("No ZFS pools located, can't boot\n"); + for (;;) + ; + } + } + + zfs_mount_pool(spa); + + if (zfs_lookup(spa, PATH_CONFIG, &dn) == 0) { + off = 0; + xfsread(&dn, &off, cmd, sizeof(cmd)); + } + + if (*cmd) { + if (parse()) + autoboot = 0; + if (!OPT_CHECK(RBX_QUIET)) + printf("%s: %s", PATH_CONFIG, cmd); + /* Do not process this command twice */ + *cmd = 0; + } + + /* + * Try to exec stage 3 boot loader. If interrupted by a keypress, + * or in case of failure, try to load a kernel directly instead. + */ + + if (autoboot && !*kname) { + memcpy(kname, PATH_BOOT3, sizeof(PATH_BOOT3)); + if (!keyhit(3*SECOND)) { + load(); + memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL)); + } + } + + /* Present the user with the boot2 prompt. */ + + for (;;) { + if (!autoboot || !OPT_CHECK(RBX_QUIET)) + printf("\nFreeBSD/i386 boot\n" + "Default: %s:%s\n" + "boot: ", + spa->spa_name, kname); + if (ioctrl & IO_SERIAL) + sio_flush(); + if (!autoboot || keyhit(5*SECOND)) + getstr(); + else if (!autoboot || !OPT_CHECK(RBX_QUIET)) + putchar('\n'); + autoboot = 0; + if (parse()) + putchar('\a'); + else + load(); + } +} + +/* XXX - Needed for btxld to link the boot2 binary; do not remove. */ +void +exit(int x) +{ +} + +static void +load(void) +{ + union { + struct exec ex; + Elf32_Ehdr eh; + } hdr; + static Elf32_Phdr ep[2]; + static Elf32_Shdr es[2]; + caddr_t p; + dnode_phys_t dn; + off_t off; + uint32_t addr, x; + int fmt, i, j; + + if (zfs_lookup(spa, kname, &dn)) { + return; + } + off = 0; + if (xfsread(&dn, &off, &hdr, sizeof(hdr))) + return; + if (N_GETMAGIC(hdr.ex) == ZMAGIC) + fmt = 0; + else if (IS_ELF(hdr.eh)) + fmt = 1; + else { + printf("Invalid %s\n", "format"); + return; + } + if (fmt == 0) { + addr = hdr.ex.a_entry & 0xffffff; + p = PTOV(addr); + off = PAGE_SIZE; + if (xfsread(&dn, &off, p, hdr.ex.a_text)) + return; + p += roundup2(hdr.ex.a_text, PAGE_SIZE); + if (xfsread(&dn, &off, p, hdr.ex.a_data)) + return; + p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE); + bootinfo.bi_symtab = VTOP(p); + memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms)); + p += sizeof(hdr.ex.a_syms); + if (hdr.ex.a_syms) { + if (xfsread(&dn, &off, p, hdr.ex.a_syms)) + return; + p += hdr.ex.a_syms; + if (xfsread(&dn, &off, p, sizeof(int))) + return; + x = *(uint32_t *)p; + p += sizeof(int); + x -= sizeof(int); + if (xfsread(&dn, &off, p, x)) + return; + p += x; + } + } else { + off = hdr.eh.e_phoff; + for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) { + if (xfsread(&dn, &off, ep + j, sizeof(ep[0]))) + return; + if (ep[j].p_type == PT_LOAD) + j++; + } + for (i = 0; i < 2; i++) { + p = PTOV(ep[i].p_paddr & 0xffffff); + off = ep[i].p_offset; + if (xfsread(&dn, &off, p, ep[i].p_filesz)) + return; + } + p += roundup2(ep[1].p_memsz, PAGE_SIZE); + bootinfo.bi_symtab = VTOP(p); + if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) { + off = hdr.eh.e_shoff + sizeof(es[0]) * + (hdr.eh.e_shstrndx + 1); + if (xfsread(&dn, &off, &es, sizeof(es))) + return; + for (i = 0; i < 2; i++) { + memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size)); + p += sizeof(es[i].sh_size); + off = es[i].sh_offset; + if (xfsread(&dn, &off, p, es[i].sh_size)) + return; + p += es[i].sh_size; + } + } + addr = hdr.eh.e_entry & 0xffffff; + } + bootinfo.bi_esymtab = VTOP(p); + bootinfo.bi_kernelname = VTOP(kname); + __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK), + bootdev, + KARGS_FLAGS_ZFS, + (uint32_t) spa->spa_guid, + (uint32_t) (spa->spa_guid >> 32), + VTOP(&bootinfo)); +} + +static int +parse() +{ + char *arg = cmd; + char *ep, *p, *q; + const char *cp; + //unsigned int drv; + int c, i, j; + + while ((c = *arg++)) { + if (c == ' ' || c == '\t' || c == '\n') + continue; + for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++); + ep = p; + if (*p) + *p++ = 0; + if (c == '-') { + while ((c = *arg++)) { + if (c == 'P') { + if (*(uint8_t *)PTOV(0x496) & 0x10) { + cp = "yes"; + } else { + opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL); + cp = "no"; + } + printf("Keyboard: %s\n", cp); + continue; + } else if (c == 'S') { + j = 0; + while ((unsigned int)(i = *arg++ - '0') <= 9) + j = j * 10 + i; + if (j > 0 && i == -'0') { + comspeed = j; + break; + } + /* Fall through to error below ('S' not in optstr[]). */ + } + for (i = 0; c != optstr[i]; i++) + if (i == NOPT - 1) + return -1; + opts ^= OPT_SET(flags[i]); + } + ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) : + OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD; + if (ioctrl & IO_SERIAL) + sio_init(115200 / comspeed); + } if (c == '?') { + dnode_phys_t dn; + + if (zfs_lookup(spa, arg, &dn) == 0) { + zap_list(spa, &dn); + } + return -1; + } else { + arg--; + + /* + * Report pool status if the comment is 'status'. Lets + * hope no-one wants to load /status as a kernel. + */ + if (!strcmp(arg, "status")) { + spa_all_status(); + return -1; + } + + /* + * If there is a colon, switch pools. + */ + q = (char *) strchr(arg, ':'); + if (q) { + spa_t *newspa; + + *q++ = 0; + newspa = spa_find_by_name(arg); + if (newspa) { + spa = newspa; + zfs_mount_pool(spa); + } else { + printf("\nCan't find ZFS pool %s\n", arg); + return -1; + } + arg = q; + } + if ((i = ep - arg)) { + if ((size_t)i >= sizeof(kname)) + return -1; + memcpy(kname, arg, i + 1); + } + } + arg = p; + } + return 0; +} + +static void +printf(const char *fmt,...) +{ + va_list ap; + char buf[10]; + char *s; + unsigned u; + int c; + int minus; + int prec; + int len; + int pad; + + va_start(ap, fmt); + while ((c = *fmt++)) { + if (c == '%') { + minus = 0; + prec = 0; + nextfmt: + c = *fmt++; + switch (c) { + case '-': + minus = 1; + goto nextfmt; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + prec = 10 * prec + (c - '0'); + goto nextfmt; + case 'c': + putchar(va_arg(ap, int)); + continue; + case 's': + s = va_arg(ap, char *); + if (prec) { + len = strlen(s); + if (len < prec) + pad = prec - len; + else + pad = 0; + if (minus) + while (pad--) + putchar(' '); + for (; *s; s++) + putchar(*s); + if (!minus) + while (pad--) + putchar(' '); + } else { + for (; *s; s++) + putchar(*s); + } + continue; + case 'u': + u = va_arg(ap, unsigned); + s = buf; + do + *s++ = '0' + u % 10U; + while (u /= 10U); + while (--s >= buf) + putchar(*s); + continue; + } + } + putchar(c); + } + va_end(ap); + return; +} + +static void +putchar(int c) +{ + if (c == '\n') + xputc('\r'); + xputc(c); +} + +static int +drvread(struct dsk *dsk, void *buf, unsigned lba, unsigned nblk) +{ + static unsigned c = 0x2d5c7c2f; + + lba += dsk->start; + if (!OPT_CHECK(RBX_QUIET)) + printf("%c\b", c = c << 8 | c >> 24); + v86.ctl = V86_ADDR | V86_CALLF | V86_FLAGS; + v86.addr = XREADORG; /* call to xread in boot1 */ + v86.es = VTOPSEG(buf); + v86.eax = lba; + v86.ebx = VTOPOFF(buf); + v86.ecx = lba >> 16; + v86.edx = nblk << 8 | dsk->drive; + v86int(); + v86.ctl = V86_FLAGS; + if (V86_CY(v86.efl)) { + printf("error %u lba %u\n", v86.eax >> 8 & 0xff, lba); + return -1; + } + return 0; +} + +static int +keyhit(unsigned ticks) +{ + uint32_t t0, t1; + + if (OPT_CHECK(RBX_NOINTR)) + return 0; + t0 = 0; + for (;;) { + if (xgetc(1)) + return 1; + t1 = *(uint32_t *)PTOV(0x46c); + if (!t0) + t0 = t1; + if (t1 < t0 || t1 >= t0 + ticks) + return 0; + } +} + +static int +xputc(int c) +{ + if (ioctrl & IO_KEYBOARD) + putc(c); + if (ioctrl & IO_SERIAL) + sio_putc(c); + return c; +} + +static int +xgetc(int fn) +{ + if (OPT_CHECK(RBX_NOINTR)) + return 0; + for (;;) { + if (ioctrl & IO_KEYBOARD && getc(1)) + return fn ? 1 : getc(0); + if (ioctrl & IO_SERIAL && sio_ischar()) + return fn ? 1 : sio_getc(); + if (fn) + return 0; + } +} + +static int +getc(int fn) +{ + /* + * The extra comparison against zero is an attempt to work around + * what appears to be a bug in QEMU and Bochs. Both emulators + * sometimes report a key-press with scancode one and ascii zero + * when no such key is pressed in reality. As far as I can tell, + * this only happens shortly after a reboot. + */ + v86.addr = 0x16; + v86.eax = fn << 8; + v86int(); + return fn == 0 ? v86.eax & 0xff : (!V86_ZR(v86.efl) && (v86.eax & 0xff)); +} diff --git a/sys/boot/i386/zfsboot/zfsldr.S b/sys/boot/i386/zfsboot/zfsldr.S new file mode 100644 index 000000000000..a256d30276df --- /dev/null +++ b/sys/boot/i386/zfsboot/zfsldr.S @@ -0,0 +1,402 @@ +/* + * Copyright (c) 1998 Robert Nordier + * All rights reserved. + * + * Redistribution and use in source and binary forms are freely + * permitted provided that the above copyright notice and this + * paragraph and the following disclaimer are duplicated in all + * such forms. + * + * This software is provided "AS IS" and without any express or + * implied warranties, including, without limitation, the implied + * warranties of merchantability and fitness for a particular + * purpose. + * + * $FreeBSD$ + */ + +/* Memory Locations */ + .set MEM_REL,0x700 # Relocation address + .set MEM_ARG,0x900 # Arguments + .set MEM_ORG,0x7c00 # Origin + .set MEM_BUF,0x8000 # Load area + .set MEM_BTX,0x9000 # BTX start + .set MEM_JMP,0x9010 # BTX entry point + .set MEM_USR,0xa000 # Client start + .set BDA_BOOT,0x472 # Boot howto flag + +/* Partition Constants */ + .set PRT_OFF,0x1be # Partition offset + .set PRT_NUM,0x4 # Partitions + .set PRT_BSD,0xa5 # Partition type + +/* Flag Bits */ + .set FL_PACKET,0x80 # Packet mode + +/* Misc. Constants */ + .set SIZ_PAG,0x1000 # Page size + .set SIZ_SEC,0x200 # Sector size + + .set NSECT,0x40 + .globl start + .globl xread + .code16 + +start: jmp main # Start recognizably + +/* + * This is the start of a standard BIOS Parameter Block (BPB). Most bootable + * FAT disks have this at the start of their MBR. While normal BIOS's will + * work fine without this section, IBM's El Torito emulation "fixes" up the + * BPB by writing into the memory copy of the MBR. Rather than have data + * written into our xread routine, we'll define a BPB to work around it. + * The data marked with (T) indicates a field required for a ThinkPad to + * recognize the disk and (W) indicates fields written from IBM BIOS code. + * The use of the BPB is based on what OpenBSD and NetBSD implemented in + * their boot code but the required fields were determined by trial and error. + * + * Note: If additional space is needed in boot1, one solution would be to + * move the "prompt" message data (below) to replace the OEM ID. + */ + .org 0x03, 0x00 +oemid: .space 0x08, 0x00 # OEM ID + + .org 0x0b, 0x00 +bpb: .word 512 # sector size (T) + .byte 0 # sectors/clustor + .word 0 # reserved sectors + .byte 0 # number of FATs + .word 0 # root entries + .word 0 # small sectors + .byte 0 # media type (W) + .word 0 # sectors/fat + .word 18 # sectors per track (T) + .word 2 # number of heads (T) + .long 0 # hidden sectors (W) + .long 0 # large sectors + + .org 0x24, 0x00 +ebpb: .byte 0 # BIOS physical drive number (W) + + .org 0x25,0x90 +/* + * Trampoline used by boot2 to call read to read data from the disk via + * the BIOS. Call with: + * + * %cx:%ax - long - LBA to read in + * %es:(%bx) - caddr_t - buffer to read data into + * %dl - byte - drive to read from + * %dh - byte - num sectors to read + */ + +xread: push %ss # Address + pop %ds # data +/* + * Setup an EDD disk packet and pass it to read + */ +xread.1: # Starting + pushl $0x0 # absolute + push %cx # block + push %ax # number + push %es # Address of + push %bx # transfer buffer + xor %ax,%ax # Number of + movb %dh,%al # blocks to + push %ax # transfer + push $0x10 # Size of packet + mov %sp,%bp # Packet pointer + callw read # Read from disk + lea 0x10(%bp),%sp # Clear stack + lret # To far caller +/* + * Load the rest of boot2 and BTX up, copy the parts to the right locations, + * and start it all up. + */ + +/* + * Setup the segment registers to flat addressing (segment 0) and setup the + * stack to end just below the start of our code. + */ +main: cld # String ops inc + xor %cx,%cx # Zero + mov %cx,%es # Address + mov %cx,%ds # data + mov %cx,%ss # Set up + mov $start,%sp # stack +/* + * Relocate ourself to MEM_REL. Since %cx == 0, the inc %ch sets + * %cx == 0x100. + */ + mov %sp,%si # Source + mov $MEM_REL,%di # Destination + incb %ch # Word count + rep # Copy + movsw # code +/* + * If we are on a hard drive, then load the MBR and look for the first + * FreeBSD slice. We use the fake partition entry below that points to + * the MBR when we call nread. The first pass looks for the first active + * FreeBSD slice. The second pass looks for the first non-active FreeBSD + * slice if the first one fails. + */ + mov $part4,%si # Partition + cmpb $0x80,%dl # Hard drive? + jb main.4 # No + movb $0x1,%dh # Block count + callw nread # Read MBR + mov $0x1,%cx # Two passes +main.1: mov $MEM_BUF+PRT_OFF,%si # Partition table + movb $0x1,%dh # Partition +main.2: cmpb $PRT_BSD,0x4(%si) # Our partition type? + jne main.3 # No + jcxz main.5 # If second pass + testb $0x80,(%si) # Active? + jnz main.5 # Yes +main.3: add $0x10,%si # Next entry + incb %dh # Partition + cmpb $0x1+PRT_NUM,%dh # In table? + jb main.2 # Yes + dec %cx # Do two + jcxz main.1 # passes +/* + * If we get here, we didn't find any FreeBSD slices at all, so print an + * error message and die. + */ + mov $msg_part,%si # Message + jmp error # Error +/* + * Floppies use partition 0 of drive 0. + */ +main.4: xor %dx,%dx # Partition:drive + +/* + * Ok, we have a slice and drive in %dx now, so use that to locate and + * load boot2. %si references the start of the slice we are looking + * for, so go ahead and load up the 64 sectors starting at sector 1024 + * (i.e. after the two vdev labels). We don't have do anything fancy + * here to allow for an extra copy of boot1 and a partition table + * (compare to this section of the UFS bootstrap) so we just load it + * all at 0x8000. The first part of boot2 is BTX, which wants to run + * at 0x9000. The boot2.bin binary starts right after the end of BTX, + * so we have to figure out where the start of it is and then move the + * binary to 0xc000. After we have moved the client, we relocate BTX + * itself to 0x9000 - doing it in this order means that none of the + * memcpy regions overlap which would corrupt the copy. Normally, BTX + * clients start at MEM_USR, or 0xa000, but when we use btxld to + * create boot2, we use an entry point of 0x2000. That entry point is + * relative to MEM_USR; thus boot2.bin starts at 0xc000. + * + * The load area and the target area for the client overlap so we have + * to use a decrementing string move. We also play segment register + * games with the destination address for the move so that the client + * can be larger than 16k (which would overflow the zero segment since + * the client starts at 0xc000). Relocating BTX is easy since the load + * area and target area do not overlap. + */ +main.5: mov %dx,MEM_ARG # Save args + movb $NSECT,%dh # Sector count + movw $1024,%ax # Offset to boot2 + callw nread.1 # Read disk +main.6: mov $MEM_BUF,%si # BTX (before reloc) + mov 0xa(%si),%bx # Get BTX length and set + mov $NSECT*SIZ_SEC-1,%di # Size of load area (less one) + mov %di,%si # End of load + add $MEM_BUF,%si # area + sub %bx,%di # End of client, 0xc000 rel + mov %di,%cx # Size of + inc %cx # client + mov $(MEM_USR+2*SIZ_PAG)>>4,%dx # Segment + mov %dx,%es # addressing 0xc000 + std # Move with decrement + rep # Relocate + movsb # client + mov %ds,%dx # Back to + mov %dx,%es # zero segment + mov $MEM_BUF,%si # BTX (before reloc) + mov $MEM_BTX,%di # BTX + mov %bx,%cx # Get BTX length + cld # Increment this time + rep # Relocate + movsb # BTX + +/* + * Enable A20 so we can access memory above 1 meg. + * Use the zero-valued %cx as a timeout for embedded hardware which do not + * have a keyboard controller. + */ +seta20: cli # Disable interrupts +seta20.1: dec %cx # Timeout? + jz seta20.3 # Yes + inb $0x64,%al # Get status + testb $0x2,%al # Busy? + jnz seta20.1 # Yes + movb $0xd1,%al # Command: Write + outb %al,$0x64 # output port +seta20.2: inb $0x64,%al # Get status + testb $0x2,%al # Busy? + jnz seta20.2 # Yes + movb $0xdf,%al # Enable + outb %al,$0x60 # A20 +seta20.3: sti # Enable interrupts + + jmp start+MEM_JMP-MEM_ORG # Start BTX + + +/* + * Trampoline used to call read from within boot1. + */ +nread: xor %ax,%ax # Sector offset in partition +nread.1: mov $MEM_BUF,%bx # Transfer buffer + add 0x8(%si),%ax # Get + mov 0xa(%si),%cx # LBA + push %cs # Read from + callw xread.1 # disk + jnc return # If success, return + mov $msg_read,%si # Otherwise, set the error + # message and fall through to + # the error routine +/* + * Print out the error message pointed to by %ds:(%si) followed + * by a prompt, wait for a keypress, and then reboot the machine. + */ +error: callw putstr # Display message + mov $prompt,%si # Display + callw putstr # prompt + xorb %ah,%ah # BIOS: Get + int $0x16 # keypress + movw $0x1234, BDA_BOOT # Do a warm boot + ljmp $0xffff,$0x0 # reboot the machine +/* + * Display a null-terminated string using the BIOS output. + */ +putstr.0: mov $0x7,%bx # Page:attribute + movb $0xe,%ah # BIOS: Display + int $0x10 # character +putstr: lodsb # Get char + testb %al,%al # End of string? + jne putstr.0 # No + +/* + * Overused return code. ereturn is used to return an error from the + * read function. Since we assume putstr succeeds, we (ab)use the + * same code when we return from putstr. + */ +ereturn: movb $0x1,%ah # Invalid + stc # argument +return: retw # To caller +/* + * Reads sectors from the disk. If EDD is enabled, then check if it is + * installed and use it if it is. If it is not installed or not enabled, then + * fall back to using CHS. Since we use a LBA, if we are using CHS, we have to + * fetch the drive parameters from the BIOS and divide it out ourselves. + * Call with: + * + * %dl - byte - drive number + * stack - 10 bytes - EDD Packet + */ +read: testb $FL_PACKET,%cs:MEM_REL+flags-start # LBA support enabled? + jz read.1 # No, use CHS + cmpb $0x80,%dl # Hard drive? + jb read.1 # No, use CHS + mov $0x55aa,%bx # Magic + push %dx # Save + movb $0x41,%ah # BIOS: Check + int $0x13 # extensions present + pop %dx # Restore + jc read.1 # If error, use CHS + cmp $0xaa55,%bx # Magic? + jne read.1 # No, so use CHS + testb $0x1,%cl # Packet interface? + jz read.1 # No, so use CHS + mov %bp,%si # Disk packet + movb $0x42,%ah # BIOS: Extended + int $0x13 # read + retw # To caller +#if 0 +read.1: push %dx # Save + movb $0x8,%ah # BIOS: Get drive + int $0x13 # parameters + movb %dh,%ch # Max head number + pop %dx # Restore + jc return # If error + andb $0x3f,%cl # Sectors per track + jz ereturn # If zero + cli # Disable interrupts + mov 0x8(%bp),%eax # Get LBA + push %dx # Save + movzbl %cl,%ebx # Divide by + xor %edx,%edx # sectors + div %ebx # per track + movb %ch,%bl # Max head number + movb %dl,%ch # Sector number + inc %bx # Divide by + xorb %dl,%dl # number + div %ebx # of heads + movb %dl,%bh # Head number + pop %dx # Restore + cmpl $0x3ff,%eax # Cylinder number supportable? + sti # Enable interrupts + ja ereturn # No, return an error + xchgb %al,%ah # Set up cylinder + rorb $0x2,%al # number + orb %ch,%al # Merge + inc %ax # sector + xchg %ax,%cx # number + movb %bh,%dh # Head number + subb %ah,%al # Sectors this track + mov 0x2(%bp),%ah # Blocks to read + cmpb %ah,%al # To read + jb read.2 # this +#ifdef TRACK_AT_A_TIME + movb %ah,%al # track +#else + movb $1,%al # one sector +#endif +read.2: mov $0x5,%di # Try count +read.3: les 0x4(%bp),%bx # Transfer buffer + push %ax # Save + movb $0x2,%ah # BIOS: Read + int $0x13 # from disk + pop %bx # Restore + jnc read.4 # If success + dec %di # Retry? + jz read.6 # No + xorb %ah,%ah # BIOS: Reset + int $0x13 # disk system + xchg %bx,%ax # Block count + jmp read.3 # Continue +read.4: movzbw %bl,%ax # Sectors read + add %ax,0x8(%bp) # Adjust + jnc read.5 # LBA, + incw 0xa(%bp) # transfer +read.5: shlb %bl # buffer + add %bl,0x5(%bp) # pointer, + sub %al,0x2(%bp) # block count + ja read.1 # If not done +read.6: retw # To caller +#else +read.1: mov $msg_chs,%si + jmp error +msg_chs: .asciz "CHS not supported" +#endif + +/* Messages */ + +msg_read: .asciz "Read" +msg_part: .asciz "Boot" + +prompt: .asciz " error\r\n" + +flags: .byte FLAGS # Flags + + .org PRT_OFF,0x90 + +/* Partition table */ + + .fill 0x30,0x1,0x0 +part4: .byte 0x80, 0x00, 0x01, 0x00 + .byte 0xa5, 0xfe, 0xff, 0xff + .byte 0x00, 0x00, 0x00, 0x00 + .byte 0x50, 0xc3, 0x00, 0x00 # 50000 sectors long, bleh + + .word 0xaa55 # Magic number diff --git a/sys/boot/zfs/Makefile b/sys/boot/zfs/Makefile new file mode 100644 index 000000000000..723233ce4ebb --- /dev/null +++ b/sys/boot/zfs/Makefile @@ -0,0 +1,29 @@ +# $FreeBSD$ + +LIB= zfsboot +INTERNALLIB= + +SRCS+= zfs.c + +CFLAGS+= -I${.CURDIR}/../common -I${.CURDIR}/../.. -I. +CFLAGS+= -I${.CURDIR}/../../../lib/libstand +CFLAGS+= -I${.CURDIR}/../../cddl/boot/zfs + +# XXX need arch-specific bootstrap CFLAGS here +# +CFLAGS+= -ffreestanding -mpreferred-stack-boundary=2 \ + -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 + +CFLAGS+= -Wformat -Wall + +.if ${MACHINE_ARCH} == "amd64" +CLEANFILES+= machine +machine: + ln -sf ${.CURDIR}/../../../i386/include machine +.endif + +.include <bsd.lib.mk> + +.if ${MACHINE_ARCH} == "amd64" +beforedepend ${OBJS}: machine +.endif diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c new file mode 100644 index 000000000000..cf0bb9c7f656 --- /dev/null +++ b/sys/boot/zfs/zfs.c @@ -0,0 +1,514 @@ +/*- + * Copyright (c) 2007 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Stand-alone file reading package. + */ + +#include <sys/param.h> +#include <sys/disklabel.h> +#include <sys/time.h> +#include <sys/queue.h> +#include <stddef.h> +#include <stdarg.h> +#include <string.h> +#include <stand.h> +#include <bootstrap.h> + +#include "zfsimpl.c" + +static int zfs_open(const char *path, struct open_file *f); +static int zfs_write(struct open_file *f, void *buf, size_t size, size_t *resid); +static int zfs_close(struct open_file *f); +static int zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid); +static off_t zfs_seek(struct open_file *f, off_t offset, int where); +static int zfs_stat(struct open_file *f, struct stat *sb); +static int zfs_readdir(struct open_file *f, struct dirent *d); + +struct devsw zfs_dev; + +struct fs_ops zfs_fsops = { + "zfs", + zfs_open, + zfs_close, + zfs_read, + zfs_write, + zfs_seek, + zfs_stat, + zfs_readdir +}; + +/* + * In-core open file. + */ +struct file { + off_t f_seekp; /* seek pointer */ + dnode_phys_t f_dnode; + uint64_t f_zap_type; /* zap type for readdir */ + uint64_t f_num_leafs; /* number of fzap leaf blocks */ + zap_leaf_phys_t *f_zap_leaf; /* zap leaf buffer */ +}; + +/* + * Open a file. + */ +static int +zfs_open(const char *upath, struct open_file *f) +{ + spa_t *spa = (spa_t *) f->f_devdata; + struct file *fp; + int rc; + + if (f->f_dev != &zfs_dev) + return (EINVAL); + + rc = zfs_mount_pool(spa); + if (rc) + return (rc); + + /* allocate file system specific data structure */ + fp = malloc(sizeof(struct file)); + bzero(fp, sizeof(struct file)); + f->f_fsdata = (void *)fp; + + if (spa->spa_root_objset.os_type != DMU_OST_ZFS) { + printf("Unexpected object set type %lld\n", + spa->spa_root_objset.os_type); + rc = EIO; + goto out; + } + + rc = zfs_lookup(spa, upath, &fp->f_dnode); + if (rc) + goto out; + + fp->f_seekp = 0; +out: + if (rc) { + f->f_fsdata = NULL; + free(fp); + } + return (rc); +} + +static int +zfs_close(struct open_file *f) +{ + struct file *fp = (struct file *)f->f_fsdata; + + dnode_cache_obj = 0; + f->f_fsdata = (void *)0; + if (fp == (struct file *)0) + return (0); + + free(fp); + return (0); +} + +/* + * Copy a portion of a file into kernel memory. + * Cross block boundaries when necessary. + */ +static int +zfs_read(struct open_file *f, void *start, size_t size, size_t *resid /* out */) +{ + spa_t *spa = (spa_t *) f->f_devdata; + struct file *fp = (struct file *)f->f_fsdata; + const znode_phys_t *zp = (const znode_phys_t *) fp->f_dnode.dn_bonus; + size_t n; + int rc; + + n = size; + if (fp->f_seekp + n > zp->zp_size) + n = zp->zp_size - fp->f_seekp; + + rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n); + if (rc) + return (rc); + + if (0) { + int i; + for (i = 0; i < n; i++) + putchar(((char*) start)[i]); + } + fp->f_seekp += n; + if (resid) + *resid = size - n; + + return (0); +} + +/* + * Don't be silly - the bootstrap has no business writing anything. + */ +static int +zfs_write(struct open_file *f, void *start, size_t size, size_t *resid /* out */) +{ + + return (EROFS); +} + +static off_t +zfs_seek(struct open_file *f, off_t offset, int where) +{ + struct file *fp = (struct file *)f->f_fsdata; + znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus; + + switch (where) { + case SEEK_SET: + fp->f_seekp = offset; + break; + case SEEK_CUR: + fp->f_seekp += offset; + break; + case SEEK_END: + fp->f_seekp = zp->zp_size - offset; + break; + default: + errno = EINVAL; + return (-1); + } + return (fp->f_seekp); +} + +static int +zfs_stat(struct open_file *f, struct stat *sb) +{ + struct file *fp = (struct file *)f->f_fsdata; + znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus; + + /* only important stuff */ + sb->st_mode = zp->zp_mode; + sb->st_uid = zp->zp_uid; + sb->st_gid = zp->zp_gid; + sb->st_size = zp->zp_size; + + return (0); +} + +static int +zfs_readdir(struct open_file *f, struct dirent *d) +{ + spa_t *spa = (spa_t *) f->f_devdata; + struct file *fp = (struct file *)f->f_fsdata; + znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus; + mzap_ent_phys_t mze; + size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT; + int rc; + + if ((zp->zp_mode >> 12) != 0x4) { + return (ENOTDIR); + } + + /* + * If this is the first read, get the zap type. + */ + if (fp->f_seekp == 0) { + rc = dnode_read(spa, &fp->f_dnode, + 0, &fp->f_zap_type, sizeof(fp->f_zap_type)); + if (rc) + return (rc); + + if (fp->f_zap_type == ZBT_MICRO) { + fp->f_seekp = offsetof(mzap_phys_t, mz_chunk); + } else { + rc = dnode_read(spa, &fp->f_dnode, + offsetof(zap_phys_t, zap_num_leafs), + &fp->f_num_leafs, + sizeof(fp->f_num_leafs)); + if (rc) + return (rc); + + fp->f_seekp = bsize; + fp->f_zap_leaf = (zap_leaf_phys_t *)malloc(bsize); + rc = dnode_read(spa, &fp->f_dnode, + fp->f_seekp, + fp->f_zap_leaf, + bsize); + if (rc) + return (rc); + } + } + + if (fp->f_zap_type == ZBT_MICRO) { + mzap_next: + if (fp->f_seekp >= bsize) + return (ENOENT); + + rc = dnode_read(spa, &fp->f_dnode, + fp->f_seekp, &mze, sizeof(mze)); + fp->f_seekp += sizeof(mze); + + if (!mze.mze_name[0]) + goto mzap_next; + + d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value); + d->d_type = ZFS_DIRENT_TYPE(mze.mze_value); + strcpy(d->d_name, mze.mze_name); + d->d_namlen = strlen(d->d_name); + return (0); + } else { + zap_leaf_t zl; + zap_leaf_chunk_t *zc, *nc; + int chunk; + size_t namelen; + char *p; + uint64_t value; + + /* + * Initialise this so we can use the ZAP size + * calculating macros. + */ + zl.l_bs = ilog2(bsize); + zl.l_phys = fp->f_zap_leaf; + + /* + * Figure out which chunk we are currently looking at + * and consider seeking to the next leaf. We use the + * low bits of f_seekp as a simple chunk index. + */ + fzap_next: + chunk = fp->f_seekp & (bsize - 1); + if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) { + fp->f_seekp = (fp->f_seekp & ~(bsize - 1)) + bsize; + chunk = 0; + + /* + * Check for EOF and read the new leaf. + */ + if (fp->f_seekp >= bsize * fp->f_num_leafs) + return (ENOENT); + + rc = dnode_read(spa, &fp->f_dnode, + fp->f_seekp, + fp->f_zap_leaf, + bsize); + if (rc) + return (rc); + } + + zc = &ZAP_LEAF_CHUNK(&zl, chunk); + fp->f_seekp++; + if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) + goto fzap_next; + + namelen = zc->l_entry.le_name_length; + if (namelen > sizeof(d->d_name)) + namelen = sizeof(d->d_name); + + /* + * Paste the name back together. + */ + nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); + p = d->d_name; + while (namelen > 0) { + int len; + len = namelen; + if (len > ZAP_LEAF_ARRAY_BYTES) + len = ZAP_LEAF_ARRAY_BYTES; + memcpy(p, nc->l_array.la_array, len); + p += len; + namelen -= len; + nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); + } + d->d_name[sizeof(d->d_name) - 1] = 0; + + /* + * Assume the first eight bytes of the value are + * a uint64_t. + */ + value = fzap_leaf_value(&zl, zc); + + d->d_fileno = ZFS_DIRENT_OBJ(value); + d->d_type = ZFS_DIRENT_TYPE(value); + d->d_namlen = strlen(d->d_name); + + return (0); + } +} + +static int +vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size) +{ + int fd; + + fd = (uintptr_t) priv; + lseek(fd, offset, SEEK_SET); + if (read(fd, buf, size) == size) { + return 0; + } else { + return (EIO); + } +} + +/* + * Convert a pool guid to a 'unit number' suitable for use with zfs_dev_open. + */ +int +zfs_guid_to_unit(uint64_t guid) +{ + spa_t *spa; + int unit; + + unit = 0; + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + if (spa->spa_guid == guid) + return unit; + unit++; + } + return (-1); +} + +static int +zfs_dev_init(void) +{ + char devname[512]; + int unit, slice; + int fd; + + /* + * Open all the disks we can find and see if we can reconstruct + * ZFS pools from them. Bogusly assumes that the disks are named + * diskN or diskNsM. + */ + zfs_init(); + for (unit = 0; unit < 32 /* XXX */; unit++) { + sprintf(devname, "disk%d:", unit); + fd = open(devname, O_RDONLY); + if (fd == -1) + continue; + + /* + * If we find a vdev, the zfs code will eat the fd, otherwise + * we close it. + */ + if (vdev_probe(vdev_read, (void*) (uintptr_t) fd, 0)) + close(fd); + + for (slice = 1; slice <= 4; slice++) { + sprintf(devname, "disk%ds%d:", unit, slice); + fd = open(devname, O_RDONLY); + if (fd == -1) + continue; + if (vdev_probe(vdev_read, (void*) (uintptr_t) fd, 0)) + close(fd); + } + } + + return (0); +} + +/* + * Print information about ZFS pools + */ +static void +zfs_dev_print(int verbose) +{ + spa_t *spa; + char line[80]; + int unit; + + if (verbose) { + spa_all_status(); + return; + } + unit = 0; + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + sprintf(line, " zfs%d: %s\n", unit, spa->spa_name); + pager_output(line); + unit++; + } +} + +/* + * Attempt to open the pool described by (dev) for use by (f). + */ +static int +zfs_dev_open(struct open_file *f, ...) +{ + va_list args; + struct devdesc *dev; + int unit, i; + spa_t *spa; + + va_start(args, f); + dev = va_arg(args, struct devdesc*); + va_end(args); + + /* + * We mostly ignore the stuff that devopen sends us. For now, + * use the unit to find a pool - later we will override the + * devname parsing so that we can name a pool and a fs within + * the pool. + */ + unit = dev->d_unit; + free(dev); + + i = 0; + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + if (i == unit) + break; + i++; + } + if (!spa) { + return (ENXIO); + } + + f->f_devdata = spa; + return (0); +} + +static int +zfs_dev_close(struct open_file *f) +{ + + f->f_devdata = NULL; + return (0); +} + +static int +zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize) +{ + + return (ENOSYS); +} + +struct devsw zfs_dev = { + .dv_name = "zfs", + .dv_type = DEVT_ZFS, + .dv_init = zfs_dev_init, + .dv_strategy = zfs_dev_strategy, + .dv_open = zfs_dev_open, + .dv_close = zfs_dev_close, + .dv_ioctl = noioctl, + .dv_print = zfs_dev_print, + .dv_cleanup = NULL +}; diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c new file mode 100644 index 000000000000..5bbc351f8408 --- /dev/null +++ b/sys/boot/zfs/zfsimpl.c @@ -0,0 +1,1443 @@ +/*- + * Copyright (c) 2007 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Stand-alone ZFS file reader. + */ + +#include "zfsimpl.h" +#include "zfssubr.c" + +/* + * List of all vdevs, chained through v_alllink. + */ +static vdev_list_t zfs_vdevs; + +/* + * List of all pools, chained through spa_link. + */ +static spa_list_t zfs_pools; + +static uint64_t zfs_crc64_table[256]; +static char *zfs_decomp_buf; +static const dnode_phys_t *dnode_cache_obj = 0; +static uint64_t dnode_cache_bn; +static char *dnode_cache_buf; +static char *zap_scratch; + +/* + * Forward declarations. + */ +static int zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset); + +static void +zfs_init(void) +{ + STAILQ_INIT(&zfs_vdevs); + STAILQ_INIT(&zfs_pools); + + zfs_decomp_buf = malloc(128*1024); + dnode_cache_buf = malloc(128*1024); + zap_scratch = malloc(128*1024); + + zfs_init_crc(); +} + +static int +xdr_int(const unsigned char **xdr, int *ip) +{ + *ip = ((*xdr)[0] << 24) + | ((*xdr)[1] << 16) + | ((*xdr)[2] << 8) + | ((*xdr)[3] << 0); + (*xdr) += 4; + return (0); +} + +static int +xdr_u_int(const unsigned char **xdr, u_int *ip) +{ + *ip = ((*xdr)[0] << 24) + | ((*xdr)[1] << 16) + | ((*xdr)[2] << 8) + | ((*xdr)[3] << 0); + (*xdr) += 4; + return (0); +} + +static int +xdr_uint64_t(const unsigned char **xdr, uint64_t *lp) +{ + u_int hi, lo; + + xdr_u_int(xdr, &hi); + xdr_u_int(xdr, &lo); + *lp = (((uint64_t) hi) << 32) | lo; + return (0); +} + +static int +nvlist_find(const unsigned char *nvlist, const char *name, int type, + int* elementsp, void *valuep) +{ + const unsigned char *p, *pair; + int junk; + int encoded_size, decoded_size; + + p = nvlist; + xdr_int(&p, &junk); + xdr_int(&p, &junk); + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + while (encoded_size && decoded_size) { + int namelen, pairtype, elements; + const char *pairname; + + xdr_int(&p, &namelen); + pairname = (const char*) p; + p += roundup(namelen, 4); + xdr_int(&p, &pairtype); + + if (!memcmp(name, pairname, namelen) && type == pairtype) { + xdr_int(&p, &elements); + if (elementsp) + *elementsp = elements; + if (type == DATA_TYPE_UINT64) { + xdr_uint64_t(&p, (uint64_t *) valuep); + return (0); + } else if (type == DATA_TYPE_STRING) { + int len; + xdr_int(&p, &len); + (*(const char**) valuep) = (const char*) p; + return (0); + } else if (type == DATA_TYPE_NVLIST + || type == DATA_TYPE_NVLIST_ARRAY) { + (*(const unsigned char**) valuep) = + (const unsigned char*) p; + return (0); + } else { + return (EIO); + } + } else { + /* + * Not the pair we are looking for, skip to the next one. + */ + p = pair + encoded_size; + } + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + } + + return (EIO); +} + +/* + * Return the next nvlist in an nvlist array. + */ +static const unsigned char * +nvlist_next(const unsigned char *nvlist) +{ + const unsigned char *p, *pair; + int junk; + int encoded_size, decoded_size; + + p = nvlist; + xdr_int(&p, &junk); + xdr_int(&p, &junk); + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + while (encoded_size && decoded_size) { + p = pair + encoded_size; + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + } + + return p; +} + +#ifdef TEST + +static const unsigned char * +nvlist_print(const unsigned char *nvlist, unsigned int indent) +{ + static const char* typenames[] = { + "DATA_TYPE_UNKNOWN", + "DATA_TYPE_BOOLEAN", + "DATA_TYPE_BYTE", + "DATA_TYPE_INT16", + "DATA_TYPE_UINT16", + "DATA_TYPE_INT32", + "DATA_TYPE_UINT32", + "DATA_TYPE_INT64", + "DATA_TYPE_UINT64", + "DATA_TYPE_STRING", + "DATA_TYPE_BYTE_ARRAY", + "DATA_TYPE_INT16_ARRAY", + "DATA_TYPE_UINT16_ARRAY", + "DATA_TYPE_INT32_ARRAY", + "DATA_TYPE_UINT32_ARRAY", + "DATA_TYPE_INT64_ARRAY", + "DATA_TYPE_UINT64_ARRAY", + "DATA_TYPE_STRING_ARRAY", + "DATA_TYPE_HRTIME", + "DATA_TYPE_NVLIST", + "DATA_TYPE_NVLIST_ARRAY", + "DATA_TYPE_BOOLEAN_VALUE", + "DATA_TYPE_INT8", + "DATA_TYPE_UINT8", + "DATA_TYPE_BOOLEAN_ARRAY", + "DATA_TYPE_INT8_ARRAY", + "DATA_TYPE_UINT8_ARRAY" + }; + + unsigned int i, j; + const unsigned char *p, *pair; + int junk; + int encoded_size, decoded_size; + + p = nvlist; + xdr_int(&p, &junk); + xdr_int(&p, &junk); + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + while (encoded_size && decoded_size) { + int namelen, pairtype, elements; + const char *pairname; + + xdr_int(&p, &namelen); + pairname = (const char*) p; + p += roundup(namelen, 4); + xdr_int(&p, &pairtype); + + for (i = 0; i < indent; i++) + printf(" "); + printf("%s %s", typenames[pairtype], pairname); + + xdr_int(&p, &elements); + switch (pairtype) { + case DATA_TYPE_UINT64: { + uint64_t val; + xdr_uint64_t(&p, &val); + printf(" = 0x%llx\n", val); + break; + } + + case DATA_TYPE_STRING: { + int len; + xdr_int(&p, &len); + printf(" = \"%s\"\n", p); + break; + } + + case DATA_TYPE_NVLIST: + printf("\n"); + nvlist_print(p, indent + 1); + break; + + case DATA_TYPE_NVLIST_ARRAY: + for (j = 0; j < elements; j++) { + printf("[%d]\n", j); + p = nvlist_print(p, indent + 1); + if (j != elements - 1) { + for (i = 0; i < indent; i++) + printf(" "); + printf("%s %s", typenames[pairtype], pairname); + } + } + break; + + default: + printf("\n"); + } + + p = pair + encoded_size; + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + } + + return p; +} + +#endif + +static int +vdev_mirror_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size) +{ + vdev_t *kid; + int rc; + + rc = EIO; + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + if (kid->v_state != VDEV_STATE_HEALTHY) + continue; + rc = kid->v_read(kid, kid->v_read_priv, offset, buf, size); + if (!rc) + return (0); + } + + return (rc); +} + +static vdev_t * +vdev_find(uint64_t guid) +{ + vdev_t *vdev; + + STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink) + if (vdev->v_guid == guid) + return (vdev); + + return (0); +} + +static vdev_t * +vdev_create(uint64_t guid, vdev_read_t *read, void *read_priv) +{ + vdev_t *vdev; + + vdev = malloc(sizeof(vdev_t)); + memset(vdev, 0, sizeof(vdev_t)); + STAILQ_INIT(&vdev->v_children); + vdev->v_guid = guid; + vdev->v_state = VDEV_STATE_OFFLINE; + vdev->v_read = read; + vdev->v_read_priv = read_priv; + STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); + + return (vdev); +} + +static int +vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp) +{ + int rc; + uint64_t guid, id; + const char *type; + const char *path; + vdev_t *vdev, *kid; + const unsigned char *kids; + int nkids, i; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, 0, &guid) + || nvlist_find(nvlist, ZPOOL_CONFIG_ID, + DATA_TYPE_UINT64, 0, &id) + || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, + DATA_TYPE_STRING, 0, &type)) { + printf("ZFS: can't find vdev details\n"); + return (ENOENT); + } + + /* + * Assume that if we've seen this vdev tree before, this one + * will be identical. + */ + vdev = vdev_find(guid); + if (vdev) { + if (vdevp) + *vdevp = vdev; + return (0); + } + + if (strcmp(type, VDEV_TYPE_MIRROR) + && strcmp(type, VDEV_TYPE_DISK)) { + printf("ZFS: can only boot from disk or mirror vdevs\n"); + return (EIO); + } + + if (!strcmp(type, VDEV_TYPE_MIRROR)) + vdev = vdev_create(guid, vdev_mirror_read, 0); + else + vdev = vdev_create(guid, 0, 0); + + + if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, + DATA_TYPE_STRING, 0, &path) == 0) { + if (strlen(path) > 5 + && path[0] == '/' + && path[1] == 'd' + && path[2] == 'e' + && path[3] == 'v' + && path[4] == '/') + path += 5; + vdev->v_name = strdup(path); + } else { + vdev->v_name = strdup(type); + } + vdev->v_id = id; + rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, + DATA_TYPE_NVLIST_ARRAY, &nkids, &kids); + /* + * Its ok if we don't have any kids. + */ + if (rc == 0) { + for (i = 0; i < nkids; i++) { + rc = vdev_init_from_nvlist(kids, &kid); + if (rc) + return (rc); + STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink); + kids = nvlist_next(kids); + } + } + + if (vdevp) + *vdevp = vdev; + return (0); +} + +static void +vdev_set_state(vdev_t *vdev) +{ + vdev_t *kid; + int good_kids; + int bad_kids; + + /* + * We assume that if we have kids, we are a mirror. A mirror + * is healthy if all its kids are healthy. Its degraded (but + * working) if at least one kid is healty. + */ + + if (STAILQ_FIRST(&vdev->v_children)) { + good_kids = 0; + bad_kids = 0; + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + if (kid->v_state == VDEV_STATE_HEALTHY) + good_kids++; + else + bad_kids++; + } + if (good_kids) { + if (!bad_kids && good_kids) + vdev->v_state = VDEV_STATE_HEALTHY; + else + vdev->v_state = VDEV_STATE_DEGRADED; + } else { + vdev->v_state = VDEV_STATE_OFFLINE; + } + } +} + +static spa_t * +spa_find_by_guid(uint64_t guid) +{ + spa_t *spa; + + STAILQ_FOREACH(spa, &zfs_pools, spa_link) + if (spa->spa_guid == guid) + return (spa); + + return (0); +} + +#ifdef BOOT2 + +static spa_t * +spa_find_by_name(const char *name) +{ + spa_t *spa; + + STAILQ_FOREACH(spa, &zfs_pools, spa_link) + if (!strcmp(spa->spa_name, name)) + return (spa); + + return (0); +} + +#endif + +static spa_t * +spa_create(uint64_t guid) +{ + spa_t *spa; + + spa = malloc(sizeof(spa_t)); + memset(spa, 0, sizeof(spa_t)); + STAILQ_INIT(&spa->spa_vdevs); + spa->spa_guid = guid; + STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); + + return (spa); +} + +static const char * +state_name(vdev_state_t state) +{ + static const char* names[] = { + "UNKNOWN", + "CLOSED", + "OFFLINE", + "CANT_OPEN", + "DEGRADED", + "ONLINE" + }; + return names[state]; +} + +#ifdef BOOT2 + +#define pager_printf printf + +#else + +static void +pager_printf(const char *fmt, ...) +{ + char line[80]; + va_list args; + + va_start(args, fmt); + vsprintf(line, fmt, args); + va_end(args); + pager_output(line); +} + +#endif + +#define STATUS_FORMAT " %-16s %-10s\n" + +static void +print_state(int indent, const char *name, vdev_state_t state) +{ + int i; + char buf[512]; + + buf[0] = 0; + for (i = 0; i < indent; i++) + strcat(buf, " "); + strcat(buf, name); + pager_printf(STATUS_FORMAT, buf, state_name(state)); + +} + +static void +vdev_status(vdev_t *vdev, int indent) +{ + vdev_t *kid; + print_state(indent, vdev->v_name, vdev->v_state); + + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + vdev_status(kid, indent + 1); + } +} + +static void +spa_status(spa_t *spa) +{ + vdev_t *vdev; + int good_kids, bad_kids, degraded_kids; + vdev_state_t state; + + pager_printf(" pool: %s\n", spa->spa_name); + pager_printf("config:\n\n"); + pager_printf(STATUS_FORMAT, "NAME", "STATE"); + + good_kids = 0; + degraded_kids = 0; + bad_kids = 0; + STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + if (vdev->v_state == VDEV_STATE_HEALTHY) + good_kids++; + else if (vdev->v_state == VDEV_STATE_DEGRADED) + degraded_kids++; + else + bad_kids++; + } + + state = VDEV_STATE_CLOSED; + if (good_kids > 0 && (degraded_kids + bad_kids) == 0) + state = VDEV_STATE_HEALTHY; + else if ((good_kids + degraded_kids) > 0) + state = VDEV_STATE_DEGRADED; + + print_state(0, spa->spa_name, state); + STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + vdev_status(vdev, 1); + } +} + +static void +spa_all_status(void) +{ + spa_t *spa; + int first = 1; + + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + if (!first) + pager_printf("\n"); + first = 0; + spa_status(spa); + } +} + +static int +vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap) +{ + vdev_t vtmp; + vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; + spa_t *spa; + vdev_t *vdev, *top_vdev, *pool_vdev; + off_t off; + blkptr_t bp; + const unsigned char *nvlist; + uint64_t val; + uint64_t guid; + uint64_t pool_txg, pool_guid; + const char *pool_name; + const unsigned char *vdevs; + int i; + char upbuf[1024]; + const struct uberblock *up; + + /* + * Load the vdev label and figure out which + * uberblock is most current. + */ + memset(&vtmp, 0, sizeof(vtmp)); + vtmp.v_read = read; + vtmp.v_read_priv = read_priv; + off = offsetof(vdev_label_t, vl_vdev_phys); + BP_ZERO(&bp); + BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); + BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + if (zio_read_phys(&vtmp, &bp, vdev_label, off)) + return (EIO); + + if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) { + return (EIO); + } + + nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_VERSION, + DATA_TYPE_UINT64, 0, &val)) { + return (EIO); + } + + if (val != ZFS_VERSION) { + printf("ZFS: unsupported ZFS version %d\n", (int) val); + return (EIO); + } + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_STATE, + DATA_TYPE_UINT64, 0, &val)) { + return (EIO); + } + + if (val != POOL_STATE_ACTIVE) { + /* + * Don't print a message here. If we happen to reboot + * while where is an exported pool around, we don't + * need a cascade of confusing messages during boot. + */ + /*printf("ZFS: pool is not active\n");*/ + return (EIO); + } + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_TXG, + DATA_TYPE_UINT64, 0, &pool_txg) + || nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_GUID, + DATA_TYPE_UINT64, 0, &pool_guid) + || nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_NAME, + DATA_TYPE_STRING, 0, &pool_name)) { + printf("ZFS: can't find pool details\n"); + return (EIO); + } + + /* + * Create the pool if this is the first time we've seen it. + */ + spa = spa_find_by_guid(pool_guid); + if (!spa) { + spa = spa_create(pool_guid); + spa->spa_name = strdup(pool_name); + } + if (pool_txg > spa->spa_txg) + spa->spa_txg = pool_txg; + + /* + * Get the vdev tree and create our in-core copy of it. + * If we already have a healthy vdev with this guid, this must + * be some kind of alias (overlapping slices, dangerously dedicated + * disks etc). + */ + if (nvlist_find(nvlist, + ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, 0, &guid)) { + return (EIO); + } + vdev = vdev_find(guid); + if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) { + return (EIO); + } + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_VDEV_TREE, + DATA_TYPE_NVLIST, 0, &vdevs)) { + return (EIO); + } + vdev_init_from_nvlist(vdevs, &top_vdev); + + /* + * Add the toplevel vdev to the pool if its not already there. + */ + STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) + if (top_vdev == pool_vdev) + break; + if (!pool_vdev && top_vdev) + STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); + + /* + * We should already have created an incomplete vdev for this + * vdev. Find it and initialise it with our read proc. + */ + vdev = vdev_find(guid); + if (vdev) { + vdev->v_read = read; + vdev->v_read_priv = read_priv; + vdev->v_state = VDEV_STATE_HEALTHY; + } else { + printf("ZFS: inconsistent nvlist contents\n"); + return (EIO); + } + + /* + * Re-evaluate top-level vdev state. + */ + vdev_set_state(top_vdev); + + /* + * Ok, we are happy with the pool so far. Lets find + * the best uberblock and then we can actually access + * the contents of the pool. + */ + for (i = 0; + i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT; + i++) { + off = offsetof(vdev_label_t, vl_uberblock); + off += i << UBERBLOCK_SHIFT; + BP_ZERO(&bp); + DVA_SET_OFFSET(&bp.blk_dva[0], off); + BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT); + BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + if (zio_read_phys(vdev, &bp, upbuf, off)) + continue; + + up = (const struct uberblock *) upbuf; + if (up->ub_magic != UBERBLOCK_MAGIC) + continue; + if (up->ub_txg < spa->spa_txg) + continue; + if (up->ub_txg > spa->spa_uberblock.ub_txg) { + spa->spa_uberblock = *up; + } else if (up->ub_txg == spa->spa_uberblock.ub_txg) { + if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp) + spa->spa_uberblock = *up; + } + } + + if (spap) + *spap = spa; + return (0); +} + +static int +ilog2(int n) +{ + int v; + + for (v = 0; v < 32; v++) + if (n == (1 << v)) + return v; + return -1; +} + +static int +zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset) +{ + int cpfunc = BP_GET_COMPRESS(bp); + size_t lsize = BP_GET_LSIZE(bp); + size_t psize = BP_GET_PSIZE(bp); + int rc; + + /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/ + if (cpfunc != ZIO_COMPRESS_OFF) { + rc = vdev->v_read(vdev, vdev->v_read_priv, offset, zfs_decomp_buf, psize); + if (rc) + return (rc); + if (zio_checksum_error(bp, zfs_decomp_buf)) + return (EIO); + if (zio_decompress_data(cpfunc, zfs_decomp_buf, psize, + buf, lsize)) + return (EIO); + } else { + rc = vdev->v_read(vdev, vdev->v_read_priv, offset, buf, psize); + if (rc) + return (rc); + + if (zio_checksum_error(bp, buf)) + return (EIO); + } + return (0); +} + +static int +zio_read(spa_t *spa, const blkptr_t *bp, void *buf) +{ + int i; + + for (i = 0; i < SPA_DVAS_PER_BP; i++) { + const dva_t *dva = &bp->blk_dva[i]; + vdev_t *vdev; + int vdevid; + off_t offset; + + if (!dva->dva_word[0] && !dva->dva_word[1]) + continue; + + vdevid = DVA_GET_VDEV(dva); + offset = DVA_GET_OFFSET(dva) + VDEV_LABEL_START_SIZE; + STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) + if (vdev->v_id == vdevid) + break; + if (!vdev || !vdev->v_read) + continue; + if (zio_read_phys(vdev, bp, buf, offset)) + continue; + + return (0); + } + printf("ZFS: i/o error - all block copies unavailable\n"); + + return (EIO); +} + +static int +dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen) +{ + int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; + int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; + int nlevels = dnode->dn_nlevels; + int i, rc; + + /* + * We truncate the offset to 32bits, mainly so that I don't + * have to find a copy of __divdi3 to put into the bootstrap. + * I don't think the bootstrap needs to access anything bigger + * than 2G anyway. Note that block addresses are still 64bit + * so it doesn't affect the possible size of the media. + * We still use 64bit block numbers so that the bitshifts + * work correctly. Note: bsize may not be a power of two here. + */ + while (buflen > 0) { + uint64_t bn = ((int) offset) / bsize; + int boff = ((int) offset) % bsize; + int ibn; + const blkptr_t *indbp; + blkptr_t bp; + + if (bn > dnode->dn_maxblkid) + return (EIO); + + if (dnode == dnode_cache_obj && bn == dnode_cache_bn) + goto cached; + + indbp = dnode->dn_blkptr; + for (i = 0; i < nlevels; i++) { + /* + * Copy the bp from the indirect array so that + * we can re-use the scratch buffer for multi-level + * objects. + */ + ibn = bn >> ((nlevels - i - 1) * ibshift); + ibn &= ((1 << ibshift) - 1); + bp = indbp[ibn]; + rc = zio_read(spa, &bp, dnode_cache_buf); + if (rc) + return (rc); + indbp = (const blkptr_t *) dnode_cache_buf; + } + dnode_cache_obj = dnode; + dnode_cache_bn = bn; + cached: + + /* + * The buffer contains our data block. Copy what we + * need from it and loop. + */ + i = bsize - boff; + if (i > buflen) i = buflen; + memcpy(buf, &dnode_cache_buf[boff], i); + buf = ((char*) buf) + i; + offset += i; + buflen -= i; + } + + return (0); +} + +/* + * Lookup a value in a microzap directory. Assumes that the zap + * scratch buffer contains the directory contents. + */ +static int +mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) +{ + const mzap_phys_t *mz; + const mzap_ent_phys_t *mze; + size_t size; + int chunks, i; + + /* + * Microzap objects use exactly one block. Read the whole + * thing. + */ + size = dnode->dn_datablkszsec * 512; + + mz = (const mzap_phys_t *) zap_scratch; + chunks = size / MZAP_ENT_LEN - 1; + + for (i = 0; i < chunks; i++) { + mze = &mz->mz_chunk[i]; + if (!strcmp(mze->mze_name, name)) { + *value = mze->mze_value; + return (0); + } + } + + return (ENOENT); +} + +/* + * Compare a name with a zap leaf entry. Return non-zero if the name + * matches. + */ +static int +fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name) +{ + size_t namelen; + const zap_leaf_chunk_t *nc; + const char *p; + + namelen = zc->l_entry.le_name_length; + + nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); + p = name; + while (namelen > 0) { + size_t len; + len = namelen; + if (len > ZAP_LEAF_ARRAY_BYTES) + len = ZAP_LEAF_ARRAY_BYTES; + if (memcmp(p, nc->l_array.la_array, len)) + return (0); + p += len; + namelen -= len; + nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); + } + + return 1; +} + +/* + * Extract a uint64_t value from a zap leaf entry. + */ +static uint64_t +fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) +{ + const zap_leaf_chunk_t *vc; + int i; + uint64_t value; + const uint8_t *p; + + vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); + for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { + value = (value << 8) | p[i]; + } + + return value; +} + +/* + * Lookup a value in a fatzap directory. Assumes that the zap scratch + * buffer contains the directory header. + */ +static int +fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) +{ + int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; + zap_phys_t zh = *(zap_phys_t *) zap_scratch; + fat_zap_t z; + uint64_t *ptrtbl; + uint64_t hash; + int rc; + + if (zh.zap_magic != ZAP_MAGIC) + return (EIO); + + z.zap_block_shift = ilog2(bsize); + z.zap_phys = (zap_phys_t *) zap_scratch; + + /* + * Figure out where the pointer table is and read it in if necessary. + */ + if (zh.zap_ptrtbl.zt_blk) { + rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize, + zap_scratch, bsize); + if (rc) + return (rc); + ptrtbl = (uint64_t *) zap_scratch; + } else { + ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0); + } + + hash = zap_hash(zh.zap_salt, name); + + zap_leaf_t zl; + zl.l_bs = z.zap_block_shift; + + off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs; + zap_leaf_chunk_t *zc; + + rc = dnode_read(spa, dnode, off, zap_scratch, bsize); + if (rc) + return (rc); + + zl.l_phys = (zap_leaf_phys_t *) zap_scratch; + + /* + * Make sure this chunk matches our hash. + */ + if (zl.l_phys->l_hdr.lh_prefix_len > 0 + && zl.l_phys->l_hdr.lh_prefix + != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len)) + return (ENOENT); + + /* + * Hash within the chunk to find our entry. + */ + int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len); + int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1); + h = zl.l_phys->l_hash[h]; + if (h == 0xffff) + return (ENOENT); + zc = &ZAP_LEAF_CHUNK(&zl, h); + while (zc->l_entry.le_hash != hash) { + if (zc->l_entry.le_next == 0xffff) { + zc = 0; + break; + } + zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next); + } + if (fzap_name_equal(&zl, zc, name)) { + *value = fzap_leaf_value(&zl, zc); + return (0); + } + + return (ENOENT); +} + +/* + * Lookup a name in a zap object and return its value as a uint64_t. + */ +static int +zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) +{ + int rc; + uint64_t zap_type; + size_t size = dnode->dn_datablkszsec * 512; + + rc = dnode_read(spa, dnode, 0, zap_scratch, size); + if (rc) + return (rc); + + zap_type = *(uint64_t *) zap_scratch; + if (zap_type == ZBT_MICRO) + return mzap_lookup(spa, dnode, name, value); + else + return fzap_lookup(spa, dnode, name, value); +} + +#ifdef BOOT2 + +/* + * List a microzap directory. Assumes that the zap scratch buffer contains + * the directory contents. + */ +static int +mzap_list(spa_t *spa, const dnode_phys_t *dnode) +{ + const mzap_phys_t *mz; + const mzap_ent_phys_t *mze; + size_t size; + int chunks, i; + + /* + * Microzap objects use exactly one block. Read the whole + * thing. + */ + size = dnode->dn_datablkszsec * 512; + mz = (const mzap_phys_t *) zap_scratch; + chunks = size / MZAP_ENT_LEN - 1; + + for (i = 0; i < chunks; i++) { + mze = &mz->mz_chunk[i]; + if (mze->mze_name[0]) + //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value); + printf("%s\n", mze->mze_name); + } + + return (0); +} + +/* + * List a fatzap directory. Assumes that the zap scratch buffer contains + * the directory header. + */ +static int +fzap_list(spa_t *spa, const dnode_phys_t *dnode) +{ + int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; + zap_phys_t zh = *(zap_phys_t *) zap_scratch; + fat_zap_t z; + int i, j; + + if (zh.zap_magic != ZAP_MAGIC) + return (EIO); + + z.zap_block_shift = ilog2(bsize); + z.zap_phys = (zap_phys_t *) zap_scratch; + + /* + * This assumes that the leaf blocks start at block 1. The + * documentation isn't exactly clear on this. + */ + zap_leaf_t zl; + zl.l_bs = z.zap_block_shift; + for (i = 0; i < zh.zap_num_leafs; i++) { + off_t off = (i + 1) << zl.l_bs; + char name[256], *p; + uint64_t value; + + if (dnode_read(spa, dnode, off, zap_scratch, bsize)) + return (EIO); + + zl.l_phys = (zap_leaf_phys_t *) zap_scratch; + + for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { + zap_leaf_chunk_t *zc, *nc; + int namelen; + + zc = &ZAP_LEAF_CHUNK(&zl, j); + if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) + continue; + namelen = zc->l_entry.le_name_length; + if (namelen > sizeof(name)) + namelen = sizeof(name); + + /* + * Paste the name back together. + */ + nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); + p = name; + while (namelen > 0) { + int len; + len = namelen; + if (len > ZAP_LEAF_ARRAY_BYTES) + len = ZAP_LEAF_ARRAY_BYTES; + memcpy(p, nc->l_array.la_array, len); + p += len; + namelen -= len; + nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); + } + + /* + * Assume the first eight bytes of the value are + * a uint64_t. + */ + value = fzap_leaf_value(&zl, zc); + + printf("%-32s 0x%llx\n", name, value); + } + } + + return (0); +} + +/* + * List a zap directory. + */ +static int +zap_list(spa_t *spa, const dnode_phys_t *dnode) +{ + uint64_t zap_type; + size_t size = dnode->dn_datablkszsec * 512; + + if (dnode_read(spa, dnode, 0, zap_scratch, size)) + return (EIO); + + zap_type = *(uint64_t *) zap_scratch; + if (zap_type == ZBT_MICRO) + return mzap_list(spa, dnode); + else + return fzap_list(spa, dnode); +} + +#endif + +static int +objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode) +{ + off_t offset; + + offset = objnum * sizeof(dnode_phys_t); + return dnode_read(spa, &os->os_meta_dnode, offset, + dnode, sizeof(dnode_phys_t)); +} + +/* + * Find the object set given the object number of its dataset object + * and return its details in *objset + */ +static int +zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset) +{ + dnode_phys_t dataset; + dsl_dataset_phys_t *ds; + + if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { + printf("ZFS: can't find dataset %lld\n", objnum); + return (EIO); + } + + ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; + if (zio_read(spa, &ds->ds_bp, objset)) { + printf("ZFS: can't read object set for dataset %lld\n", objnum); + return (EIO); + } + + return (0); +} + +/* + * Find the object set pointed to by the BOOTFS property or the root + * dataset if there is none and return its details in *objset + */ +static int +zfs_mount_root(spa_t *spa, objset_phys_t *objset) +{ + dnode_phys_t dir, propdir; + uint64_t props, bootfs, root; + + /* + * Start with the MOS directory object. + */ + if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { + printf("ZFS: can't read MOS object directory\n"); + return (EIO); + } + + /* + * Lookup the pool_props and see if we can find a bootfs. + */ + if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0 + && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 + && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0) + return zfs_mount_dataset(spa, bootfs, objset); + + /* + * Lookup the root dataset directory + */ + if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root) + || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) { + printf("ZFS: can't find root dsl_dir\n"); + return (EIO); + } + + /* + * Use the information from the dataset directory's bonus buffer + * to find the dataset object and from that the object set itself. + */ + dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus; + return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset); +} + +static int +zfs_mount_pool(spa_t *spa) +{ + /* + * Find the MOS and work our way in from there. + */ + if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) { + printf("ZFS: can't read MOS\n"); + return (EIO); + } + + /* + * Find the root object set + */ + if (zfs_mount_root(spa, &spa->spa_root_objset)) { + printf("Can't find root filesystem - giving up\n"); + return (EIO); + } + + return (0); +} + +/* + * Lookup a file and return its dnode. + */ +static int +zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode) +{ + int rc; + uint64_t objnum, rootnum, parentnum; + dnode_phys_t dn; + const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus; + const char *p, *q; + char element[256]; + char path[1024]; + int symlinks_followed = 0; + + if (spa->spa_root_objset.os_type != DMU_OST_ZFS) { + printf("ZFS: unexpected object set type %lld\n", + spa->spa_root_objset.os_type); + return (EIO); + } + + /* + * Get the root directory dnode. + */ + rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn); + if (rc) + return (rc); + + rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum); + if (rc) + return (rc); + + rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn); + if (rc) + return (rc); + + objnum = rootnum; + p = upath; + while (p && *p) { + while (*p == '/') + p++; + if (!*p) + break; + q = strchr(p, '/'); + if (q) { + memcpy(element, p, q - p); + element[q - p] = 0; + p = q; + } else { + strcpy(element, p); + p = 0; + } + + if ((zp->zp_mode >> 12) != 0x4) { + return (ENOTDIR); + } + + parentnum = objnum; + rc = zap_lookup(spa, &dn, element, &objnum); + if (rc) + return (rc); + objnum = ZFS_DIRENT_OBJ(objnum); + + rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn); + if (rc) + return (rc); + + /* + * Check for symlink. + */ + if ((zp->zp_mode >> 12) == 0xa) { + if (symlinks_followed > 10) + return (EMLINK); + symlinks_followed++; + + /* + * Read the link value and copy the tail of our + * current path onto the end. + */ + if (p) + strcpy(&path[zp->zp_size], p); + else + path[zp->zp_size] = 0; + if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) { + memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)], + zp->zp_size); + } else { + rc = dnode_read(spa, &dn, 0, path, zp->zp_size); + if (rc) + return (rc); + } + + /* + * Restart with the new path, starting either at + * the root or at the parent depending whether or + * not the link is relative. + */ + p = path; + if (*p == '/') + objnum = rootnum; + else + objnum = parentnum; + objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn); + } + } + + *dnode = dn; + return (0); +} diff --git a/sys/cddl/boot/zfs/README b/sys/cddl/boot/zfs/README new file mode 100644 index 000000000000..4b6218165ad4 --- /dev/null +++ b/sys/cddl/boot/zfs/README @@ -0,0 +1,14 @@ +$FreeBSD$ + +This directory contains various files derived from CDDL sources that +are used by the ZFS bootstrap: + + fletcher.c checksum support + sha256.c checksum support + lzjb.c compression support + zfssubr.c mostly checksum and compression support + zfsimpl.h mostly describing the physical layout + +The files fletcher.c, lzjb.c and sha256.c are largely identical to the +ZFS base code (with write support removed) and could be shared but +that might complicate future imports from OpenSolaris. diff --git a/sys/cddl/boot/zfs/fletcher.c b/sys/cddl/boot/zfs/fletcher.c new file mode 100644 index 000000000000..2b9728d70484 --- /dev/null +++ b/sys/cddl/boot/zfs/fletcher.c @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/*#pragma ident "%Z%%M% %I% %E% SMI"*/ + +static void +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +static void +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/sys/cddl/boot/zfs/lzjb.c b/sys/cddl/boot/zfs/lzjb.c new file mode 100644 index 000000000000..1283a6c31a4b --- /dev/null +++ b/sys/cddl/boot/zfs/lzjb.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/*#pragma ident "%Z%%M% %I% %E% SMI"*/ + +/* + * We keep our own copy of this algorithm for 2 main reasons: + * 1. If we didn't, anyone modifying common/os/compress.c would + * directly break our on disk format + * 2. Our version of lzjb does not have a number of checks that the + * common/os version needs and uses + * In particular, we are adding the "feature" that compress() can + * take a destination buffer size and return -1 if the data will not + * compress to d_len or less. + */ + +#define MATCH_BITS 6 +#define MATCH_MIN 3 +#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) +#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) +#define LEMPEL_SIZE 256 + +/*ARGSUSED*/ +static int +lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + unsigned char *src = s_start; + unsigned char *dst = d_start; + unsigned char *d_end = (unsigned char *)d_start + d_len; + unsigned char *cpy, copymap = 0; + int copymask = 1 << (NBBY - 1); + + while (dst < d_end) { + if ((copymask <<= 1) == (1 << NBBY)) { + copymask = 1; + copymap = *src++; + } + if (copymap & copymask) { + int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; + int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; + src += 2; + if ((cpy = dst - offset) < (unsigned char *)d_start) + return (-1); + while (--mlen >= 0 && dst < d_end) + *dst++ = *cpy++; + } else { + *dst++ = *src++; + } + } + return (0); +} diff --git a/sys/cddl/boot/zfs/sha256.c b/sys/cddl/boot/zfs/sha256.c new file mode 100644 index 000000000000..f0d83acf557c --- /dev/null +++ b/sys/cddl/boot/zfs/sha256.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/*#pragma ident "%Z%%M% %I% %E% SMI"*/ + +/* + * SHA-256 checksum, as specified in FIPS 180-2, available at: + * http://csrc.nist.gov/cryptval + * + * This is a very compact implementation of SHA-256. + * It is designed to be simple and portable, not to be fast. + */ + +/* + * The literal definitions according to FIPS180-2 would be: + * + * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) + * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) + * + * We use logical equivalents which require one less op. + */ +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) +#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) +#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) +#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) +#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) +#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) + +static const uint32_t SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +static void +SHA256Transform(uint32_t *H, const uint8_t *cp) +{ + uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; + + for (t = 0; t < 16; t++, cp += 4) + W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; + + for (t = 16; t < 64; t++) + W[t] = sigma1(W[t - 2]) + W[t - 7] + + sigma0(W[t - 15]) + W[t - 16]; + + a = H[0]; b = H[1]; c = H[2]; d = H[3]; + e = H[4]; f = H[5]; g = H[6]; h = H[7]; + + for (t = 0; t < 64; t++) { + T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; + T2 = SIGMA0(a) + Maj(a, b, c); + h = g; g = f; f = e; e = d + T1; + d = c; c = b; b = a; a = T1 + T2; + } + + H[0] += a; H[1] += b; H[2] += c; H[3] += d; + H[4] += e; H[5] += f; H[6] += g; H[7] += h; +} + +static void +zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + uint8_t pad[128]; + int padsize = size & 63; + int i; + + for (i = 0; i < size - padsize; i += 64) + SHA256Transform(H, (uint8_t *)buf + i); + + for (i = 0; i < padsize; i++) + pad[i] = ((uint8_t *)buf)[i]; + + for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) + pad[padsize] = 0; + + for (i = 0; i < 8; i++) + pad[padsize++] = (size << 3) >> (56 - 8 * i); + + for (i = 0; i < padsize; i += 64) + SHA256Transform(H, pad + i); + + ZIO_SET_CHECKSUM(zcp, + (uint64_t)H[0] << 32 | H[1], + (uint64_t)H[2] << 32 | H[3], + (uint64_t)H[4] << 32 | H[5], + (uint64_t)H[6] << 32 | H[7]); +} diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h new file mode 100644 index 000000000000..3d178b493664 --- /dev/null +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -0,0 +1,1151 @@ +/*- + * Copyright (c) 2002 McAfee, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and McAfee Research,, the Security Research Division of + * McAfee, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as + * part of the DARPA CHATS research program + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* CRC64 table */ +#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ + +/* + * Macros for various sorts of alignment and rounding when the alignment + * is known to be a power of 2. + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2END(x, align) (-(~(x) & -(align))) +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) \ + ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len)) +#define BF64_SET(x, low, len, val) \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +#define BF32_SET_SB(x, low, len, shift, bias, val) \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) +#define BF64_SET_SB(x, low, len, shift, bias, val) \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) + +/* + * We currently support nine block sizes, from 512 bytes to 128K. + * We could go higher, but the benefits are near-zero and the cost + * of COWing a giant block to modify one byte would become excessive. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 17 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. + * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * E endianness + * type DMU object type + * lvl level of indirection + * birth txg transaction group in which the block was born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ +typedef struct blkptr { + dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[3]; /* Extra space for the future */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) +#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_HOLE(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_PSIZE(bp) \ + BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define BP_SET_PSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_GET_ASIZE(bp) \ + (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_GET_UCSIZE(bp) \ + ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + +#define BP_GET_NDVAS(bp) \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_COUNT_GANG(bp) \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + DVA_GET_GANG(&(bp)->blk_dva[2])) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) +#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) +#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_pad[2] = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ + +typedef struct zio_block_tail { + uint64_t zbt_magic; /* for validation, endianness */ + zio_cksum_t zbt_cksum; /* 256-bit checksum */ +} zio_block_tail_t; + +#define VDEV_SKIP_SIZE (8 << 10) +#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_PHYS_SIZE (112 << 10) +#define VDEV_UBERBLOCK_RING (128 << 10) + +#define VDEV_UBERBLOCK_SHIFT(vd) \ + MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_COUNT(vd) \ + (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) + +/* ZFS boot block */ +#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL +#define VDEV_BOOT_VERSION 1 /* version number */ + +typedef struct vdev_boot_header { + uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ + uint64_t vb_version; /* VDEV_BOOT_VERSION */ + uint64_t vb_offset; /* start offset (bytes) */ + uint64_t vb_size; /* size (bytes) */ + char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; +} vdev_boot_header_t; + +typedef struct vdev_phys { + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; + zio_block_tail_t vp_zbt; +} vdev_phys_t; + +typedef struct vdev_label { + char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ + vdev_boot_header_t vl_boot_header; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 112K */ + char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ +} vdev_label_t; /* 256K total */ + +/* + * vdev_dirty() flags + */ +#define VDD_METASLAB 0x01 +#define VDD_DTL 0x02 + +/* + * Size and offset of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * Size of label regions at the start and end of each leaf device. + */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 + +enum zio_checksum { + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_FUNCTIONS +}; + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_FUNCTIONS +}; + +#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +/* nvlist pack encoding */ +#define NV_ENCODE_NATIVE 0 +#define NV_ENCODE_XDR 1 + +typedef enum { + DATA_TYPE_UNKNOWN = 0, + DATA_TYPE_BOOLEAN, + DATA_TYPE_BYTE, + DATA_TYPE_INT16, + DATA_TYPE_UINT16, + DATA_TYPE_INT32, + DATA_TYPE_UINT32, + DATA_TYPE_INT64, + DATA_TYPE_UINT64, + DATA_TYPE_STRING, + DATA_TYPE_BYTE_ARRAY, + DATA_TYPE_INT16_ARRAY, + DATA_TYPE_UINT16_ARRAY, + DATA_TYPE_INT32_ARRAY, + DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_INT64_ARRAY, + DATA_TYPE_UINT64_ARRAY, + DATA_TYPE_STRING_ARRAY, + DATA_TYPE_HRTIME, + DATA_TYPE_NVLIST, + DATA_TYPE_NVLIST_ARRAY, + DATA_TYPE_BOOLEAN_VALUE, + DATA_TYPE_INT8, + DATA_TYPE_UINT8, + DATA_TYPE_BOOLEAN_ARRAY, + DATA_TYPE_INT8_ARRAY, + DATA_TYPE_UINT8_ARRAY +} data_type_t; + +/* + * On-disk version number. + */ +#define ZFS_VERSION_1 1ULL +#define ZFS_VERSION_2 2ULL +#define ZFS_VERSION_3 3ULL +#define ZFS_VERSION_4 4ULL +#define ZFS_VERSION_5 5ULL +#define ZFS_VERSION_6 6ULL +/* + * When bumping up ZFS_VERSION, make sure GRUB ZFS understand the on-disk + * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, + * and do the appropriate changes. + */ +#define ZFS_VERSION ZFS_VERSION_6 +#define ZFS_VERSION_STRING "6" + +/* + * Symbolic names for the changes that caused a ZFS_VERSION switch. + * Used in the code when checking for presence or absence of a feature. + * Feel free to define multiple symbolic names for each version if there + * were multiple changes to on-disk structures during that version. + * + * NOTE: When checking the current ZFS_VERSION in your code, be sure + * to use spa_version() since it reports the version of the + * last synced uberblock. Checking the in-flight version can + * be dangerous in some cases. + */ +#define ZFS_VERSION_INITIAL ZFS_VERSION_1 +#define ZFS_VERSION_DITTO_BLOCKS ZFS_VERSION_2 +#define ZFS_VERSION_SPARES ZFS_VERSION_3 +#define ZFS_VERSION_RAID6 ZFS_VERSION_3 +#define ZFS_VERSION_BPLIST_ACCOUNT ZFS_VERSION_3 +#define ZFS_VERSION_RAIDZ_DEFLATE ZFS_VERSION_3 +#define ZFS_VERSION_DNODE_BYTES ZFS_VERSION_3 +#define ZFS_VERSION_ZPOOL_HISTORY ZFS_VERSION_4 +#define ZFS_VERSION_GZIP_COMPRESSION ZFS_VERSION_5 +#define ZFS_VERSION_BOOTFS ZFS_VERSION_6 + +/* + * The following are configuration names used in the nvlist describing a pool's + * configuration. + */ +#define ZPOOL_CONFIG_VERSION "version" +#define ZPOOL_CONFIG_POOL_NAME "name" +#define ZPOOL_CONFIG_POOL_STATE "state" +#define ZPOOL_CONFIG_POOL_TXG "txg" +#define ZPOOL_CONFIG_POOL_GUID "pool_guid" +#define ZPOOL_CONFIG_CREATE_TXG "create_txg" +#define ZPOOL_CONFIG_TOP_GUID "top_guid" +#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" +#define ZPOOL_CONFIG_TYPE "type" +#define ZPOOL_CONFIG_CHILDREN "children" +#define ZPOOL_CONFIG_ID "id" +#define ZPOOL_CONFIG_GUID "guid" +#define ZPOOL_CONFIG_PATH "path" +#define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" +#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" +#define ZPOOL_CONFIG_ASHIFT "ashift" +#define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_DTL "DTL" +#define ZPOOL_CONFIG_STATS "stats" +#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" +#define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_ERRCOUNT "error_count" +#define ZPOOL_CONFIG_NOT_PRESENT "not_present" +#define ZPOOL_CONFIG_SPARES "spares" +#define ZPOOL_CONFIG_IS_SPARE "is_spare" +#define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_HOSTID "hostid" +#define ZPOOL_CONFIG_HOSTNAME "hostname" +#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ + +#define VDEV_TYPE_ROOT "root" +#define VDEV_TYPE_MIRROR "mirror" +#define VDEV_TYPE_REPLACING "replacing" +#define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DISK "disk" +#define VDEV_TYPE_FILE "file" +#define VDEV_TYPE_MISSING "missing" +#define VDEV_TYPE_SPARE "spare" + +/* + * This is needed in userland to report the minimum necessary device size. + */ +#define SPA_MINDEVSIZE (64ULL << 20) + +/* + * The location of the pool configuration repository, shared between kernel and + * userland. + */ +#define ZPOOL_CACHE_DIR "/boot/zfs" +#define ZPOOL_CACHE_FILE "zpool.cache" +#define ZPOOL_CACHE_TMP ".zpool.cache" + +#define ZPOOL_CACHE ZPOOL_CACHE_DIR "/" ZPOOL_CACHE_FILE + +/* + * vdev states are ordered from least to most healthy. + * A vdev that's CANT_OPEN or below is considered unusable. + */ +typedef enum vdev_state { + VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ + VDEV_STATE_CLOSED, /* Not currently open */ + VDEV_STATE_OFFLINE, /* Not allowed to open */ + VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ + VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ + VDEV_STATE_HEALTHY /* Presumed good */ +} vdev_state_t; + +/* + * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field + * of the vdev stats structure uses these constants to distinguish why. + */ +typedef enum vdev_aux { + VDEV_AUX_NONE, /* no error */ + VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ + VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ + VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ + VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ + VDEV_AUX_TOO_SMALL, /* vdev size is too small */ + VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ + VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ + VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_SPARED /* hot spare used in another pool */ +} vdev_aux_t; + +/* + * pool state. The following states are written to disk as part of the normal + * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are + * software abstractions used at various levels to communicate pool state. + */ +typedef enum pool_state { + POOL_STATE_ACTIVE = 0, /* In active use */ + POOL_STATE_EXPORTED, /* Explicitly exported */ + POOL_STATE_DESTROYED, /* Explicitly destroyed */ + POOL_STATE_SPARE, /* Reserved for hot spare use */ + POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ + POOL_STATE_UNAVAIL, /* Internal libzfs state */ + POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ +} pool_state_t; + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. + * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ + +struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* ZFS_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ +}; + +/* + * Flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 + +/* + * Fixed constants. + */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 10 /* 1k */ +#define DN_MAX_INDBLKSHIFT 14 /* 16k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * Derived constants. + */ +#define DNODE_SIZE (1 << DNODE_SHIFT) +#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) + +/* The +2 here is a cheesy way to round up */ +#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ + (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) + +#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ + (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ +#define DNODE_FLAG_USED_BYTES (1<<0) + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_flags; /* DNODE_FLAG_* */ + uint16_t dn_datablkszsec; /* data block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_pad2[4]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_used; /* bytes (or sectors) of disk space */ + + uint64_t dn_pad3[4]; + + blkptr_t dn_blkptr[1]; + uint8_t dn_bonus[DN_MAX_BONUSLEN]; +} dnode_phys_t; + +typedef enum dmu_object_type { + DMU_OT_NONE, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPLIST, /* UINT64 */ + DMU_OT_BPLIST_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DIR, /* UINT64 */ + DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_DATASET, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_ACL, /* ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_UNLINKED_SET, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP, /* ZAP */ + /* other; for testing only! */ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ + DMU_OT_SPA_HISTORY, /* UINT8 */ + DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ + DMU_OT_POOL_PROPS, /* ZAP */ + + DMU_OT_NUMTYPES +} dmu_object_type_t; + +typedef enum dmu_objset_type { + DMU_OST_NONE, + DMU_OST_META, + DMU_OST_ZFS, + DMU_OST_ZVOL, + DMU_OST_OTHER, /* For testing only! */ + DMU_OST_ANY, /* Be careful! */ + DMU_OST_NUMTYPES +} dmu_objset_type_t; + +/* + * Intent log header - this on disk structure holds fields to manage + * the log. All fields are 64 bit to easily handle cross architectures. + */ +typedef struct zil_header { + uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ + uint64_t zh_replay_seq; /* highest replayed sequence number */ + blkptr_t zh_log; /* log chain */ + uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_pad[5]; +} zil_header_t; + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - + sizeof (uint64_t)]; +} objset_phys_t; + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; /* not actually used */ + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_clone_parent_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; + uint64_t ds_prev_snap_obj; + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; + uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; + uint64_t ds_used_bytes; + uint64_t ds_compressed_bytes; + uint64_t ds_uncompressed_bytes; + uint64_t ds_unique_bytes; /* only relevant to snapshots */ + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. + */ + uint64_t ds_fsid_guid; + uint64_t ds_guid; + uint64_t ds_flags; + blkptr_t ds_bp; + uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */ +} dsl_dataset_phys_t; + +/* + * The names of zap entries in the DIRECTORY_OBJECT of the MOS. + */ +#define DMU_POOL_DIRECTORY_OBJECT 1 +#define DMU_POOL_CONFIG "config" +#define DMU_POOL_ROOT_DATASET "root_dataset" +#define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" +#define DMU_POOL_SPARES "spares" +#define DMU_POOL_DEFLATE "deflate" +#define DMU_POOL_HISTORY "history" +#define DMU_POOL_PROPS "pool_props" + +#define ZAP_MAGIC 0x2F52AB2ABULL + +#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_block_shift) + +#define ZAP_MAXCD (uint32_t)(-1) +#define ZAP_HASHBITS 28 +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT +#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) + +typedef struct mzap_ent_phys { + uint64_t mze_value; + uint32_t mze_cd; + uint16_t mze_pad; /* in case we want to chain them someday */ + char mze_name[MZAP_NAME_LEN]; +} mzap_ent_phys_t; + +typedef struct mzap_phys { + uint64_t mz_block_type; /* ZBT_MICRO */ + uint64_t mz_salt; + uint64_t mz_pad[6]; + mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ +} mzap_phys_t; + +/* + * The (fat) zap is stored in one object. It is an array of + * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: + * + * ptrtbl fits in first block: + * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... + * + * ptrtbl too big for first block: + * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... + * + */ + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) +/* any other values are ptrtbl blocks */ + +/* + * the embedded pointer table takes up half a block: + * block size / entry size (2^3) / 2 + */ +#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) + +/* + * The embedded pointer table starts half-way through the block. Since + * the pointer table itself is half the block, it starts at (64-bit) + * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)). + */ +#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \ + ((uint64_t *)(zap)->zap_phys) \ + [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. + */ +typedef struct zap_phys { + uint64_t zap_block_type; /* ZBT_HEADER */ + uint64_t zap_magic; /* ZAP_MAGIC */ + + struct zap_table_phys { + uint64_t zt_blk; /* starting block number */ + uint64_t zt_numblks; /* number of blocks */ + uint64_t zt_shift; /* bits to index it */ + uint64_t zt_nextblk; /* next (larger) copy start block */ + uint64_t zt_blks_copied; /* number source blocks copied */ + } zap_ptrtbl; + + uint64_t zap_freeblk; /* the next free block */ + uint64_t zap_num_leafs; /* number of leafs */ + uint64_t zap_num_entries; /* number of entries */ + uint64_t zap_salt; /* salt to stir into hash function */ + /* + * This structure is followed by padding, and then the embedded + * pointer table. The embedded pointer table takes up second + * half of the block. It is accessed using the + * ZAP_EMBEDDED_PTRTBL_ENT() macro. + */ +} zap_phys_t; + +typedef struct zap_table_phys zap_table_phys_t; + +typedef struct fat_zap { + int zap_block_shift; /* block size shift */ + zap_phys_t *zap_phys; +} fat_zap_t; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ +#define ZAP_LEAF_CHUNKSIZE 24 + +/* + * The amount of space available for chunks is: + * block size (1<<l->l_bs) - hash entry size (2) * number of hash + * entries - header space (2*chunksize) + */ +#define ZAP_LEAF_NUMCHUNKS(l) \ + (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ + ZAP_LEAF_CHUNKSIZE - 2) + +/* + * The amount of space within the chunk available for the array is: + * chunk size - space for type (1) - space for next pointer (2) + */ +#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) + +#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ + (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) + +/* + * Low water mark: when there are only this many chunks free, start + * growing the ptrtbl. Ideally, this should be larger than a + * "reasonably-sized" entry. 20 chunks is more than enough for the + * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), + * while still being only around 3% for 16k blocks. + */ +#define ZAP_LEAF_LOW_WATER (20) + +/* + * The leaf hash table has block size / 2^5 (32) number of entries, + * which should be more than enough for the maximum number of entries, + * which is less than block size / CHUNKSIZE (24) / minimum number of + * chunks per entry (3). + */ +#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) + +/* + * The chunks start immediately after the hash table. The end of the + * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a + * chunk_t. + */ +#define ZAP_LEAF_CHUNK(l, idx) \ + ((zap_leaf_chunk_t *) \ + ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] +#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) + +typedef enum zap_chunk_type { + ZAP_CHUNK_FREE = 253, + ZAP_CHUNK_ENTRY = 252, + ZAP_CHUNK_ARRAY = 251, + ZAP_CHUNK_TYPE_MAX = 250 +} zap_chunk_type_t; + +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. + */ +typedef struct zap_leaf_phys { + struct zap_leaf_header { + uint64_t lh_block_type; /* ZBT_LEAF */ + uint64_t lh_pad1; + uint64_t lh_prefix; /* hash prefix of this leaf */ + uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ + uint16_t lh_nfree; /* number free chunks */ + uint16_t lh_nentries; /* number of entries */ + uint16_t lh_prefix_len; /* num bits used to id this */ + +/* above is accessable to zap, below is zap_leaf private */ + + uint16_t lh_freelist; /* chunk head of free list */ + uint8_t lh_pad2[12]; + } l_hdr; /* 2 24-byte chunks */ + + /* + * The header is followed by a hash table with + * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is + * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) + * zap_leaf_chunk structures. These structures are accessed + * with the ZAP_LEAF_CHUNK() macro. + */ + + uint16_t l_hash[1]; +} zap_leaf_phys_t; + +typedef union zap_leaf_chunk { + struct zap_leaf_entry { + uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ + uint8_t le_int_size; /* size of ints */ + uint16_t le_next; /* next entry in hash chain */ + uint16_t le_name_chunk; /* first chunk of the name */ + uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_value_chunk; /* first chunk of the value */ + uint16_t le_value_length; /* value length in ints */ + uint32_t le_cd; /* collision differentiator */ + uint64_t le_hash; /* hash value of the name */ + } l_entry; + struct zap_leaf_array { + uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ + uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; + uint16_t la_next; /* next blk or CHAIN_END */ + } l_array; + struct zap_leaf_free { + uint8_t lf_type; /* always ZAP_CHUNK_FREE */ + uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t lf_next; /* next in free list, or CHAIN_END */ + } l_free; +} zap_leaf_chunk_t; + +typedef struct zap_leaf { + int l_bs; /* block size shift */ + zap_leaf_phys_t *l_phys; +} zap_leaf_t; + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ + +#define MASTER_NODE_OBJ 1 + +/* + * special attributes for master node. + */ + +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_OBJ "VERSION" +#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE" +#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS" + +#define ZFS_FLAG_BLOCKPERPAGE 0x1 +#define ZFS_FLAG_NOGROWBLOCKS 0x2 + +/* + * ZPL version - rev'd whenever an incompatible on-disk format change + * occurs. Independent of SPA/DMU/ZAP versioning. + */ + +#define ZPL_VERSION 1ULL + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) +#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) + +typedef struct ace { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ +} ace_t; + +#define ACE_SLOT_CNT 6 + +typedef struct zfs_znode_acl { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_znode_acl_t; + +/* + * This is the persistent portion of the znode. It is stored + * in the "bonus buffer" of the file. Short symbolic links + * are also stored in the bonus buffer. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_pad[4]; /* 144 - future */ + zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we only use this space to store symbolic links. + */ +} znode_phys_t; + +/* + * In-core vdev representation. + */ +struct vdev; +typedef int vdev_read_t(struct vdev *vdev, void *priv, off_t offset, void *buf, size_t bytes); + +typedef STAILQ_HEAD(vdev_list, vdev) vdev_list_t; + +typedef struct vdev { + STAILQ_ENTRY(vdev) v_childlink; /* link in parent's child list */ + STAILQ_ENTRY(vdev) v_alllink; /* link in global vdev list */ + vdev_list_t v_children; /* children of this vdev */ + char *v_name; /* vdev name */ + uint64_t v_guid; /* vdev guid */ + int v_id; /* index in parent */ + vdev_state_t v_state; /* current state */ + vdev_read_t *v_read; /* function to read from this vdev */ + void *v_read_priv; /* private data for read function */ +} vdev_t; + +/* + * In-core pool representation. + */ +typedef STAILQ_HEAD(spa_list, spa) spa_list_t; + +typedef struct spa { + STAILQ_ENTRY(spa) spa_link; /* link in global pool list */ + char *spa_name; /* pool name */ + uint64_t spa_guid; /* pool guid */ + uint64_t spa_txg; /* most recent transaction */ + struct uberblock spa_uberblock; /* best uberblock so far */ + vdev_list_t spa_vdevs; /* list of all toplevel vdevs */ + objset_phys_t spa_mos; /* MOS for this pool */ + objset_phys_t spa_root_objset; /* current mounted ZPL objset */ +} spa_t; diff --git a/sys/cddl/boot/zfs/zfssubr.c b/sys/cddl/boot/zfs/zfssubr.c new file mode 100644 index 000000000000..1c859c0f995b --- /dev/null +++ b/sys/cddl/boot/zfs/zfssubr.c @@ -0,0 +1,193 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +static uint64_t zfs_crc64_table[256]; + +static void +zfs_init_crc(void) +{ + int i, j; + uint64_t *ct; + + /* + * Calculate the crc64 table (used for the zap hash + * function). + */ + if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { + memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + } +} + +static void +zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +/* + * Signature for checksum functions. + */ +typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); + +/* + * Information about each checksum function. + */ +typedef struct zio_checksum_info { + zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ + int ci_correctable; /* number of correctable bits */ + int ci_zbt; /* uses zio block tail? */ + const char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + +#include "fletcher.c" +#include "sha256.c" + +static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { + {{NULL, NULL}, 0, 0, "inherit"}, + {{NULL, NULL}, 0, 0, "on"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, + {{zio_checksum_SHA256, NULL}, 1, 1, "label"}, + {{zio_checksum_SHA256, NULL}, 1, 1, "gang_header"}, + {{fletcher_2_native, NULL}, 0, 1, "zilog"}, + {{fletcher_2_native, NULL}, 0, 0, "fletcher2"}, + {{fletcher_4_native, NULL}, 1, 0, "fletcher4"}, + {{zio_checksum_SHA256, NULL}, 1, 0, "SHA256"}, +}; + +/* + * Common signature for all zio compress/decompress functions. + */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); + +/* + * Information about each compression function. + */ +typedef struct zio_compress_info { + zio_compress_func_t *ci_compress; /* compression function */ + zio_decompress_func_t *ci_decompress; /* decompression function */ + int ci_level; /* level parameter */ + const char *ci_name; /* algorithm name */ +} zio_compress_info_t; + +#include "lzjb.c" + +/* + * Compression vectors. + */ +static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { + {NULL, NULL, 0, "inherit"}, + {NULL, NULL, 0, "on"}, + {NULL, NULL, 0, "uncompressed"}, + {NULL, lzjb_decompress, 0, "lzjb"}, + {NULL, NULL, 0, "empty"}, + {NULL, NULL, 1, "gzip-1"}, + {NULL, NULL, 2, "gzip-2"}, + {NULL, NULL, 3, "gzip-3"}, + {NULL, NULL, 4, "gzip-4"}, + {NULL, NULL, 5, "gzip-5"}, + {NULL, NULL, 6, "gzip-6"}, + {NULL, NULL, 7, "gzip-7"}, + {NULL, NULL, 8, "gzip-8"}, + {NULL, NULL, 9, "gzip-9"}, +}; + +static int +zio_checksum_error(const blkptr_t *bp, void *data) +{ + zio_cksum_t zc = bp->blk_cksum; + unsigned int checksum = BP_GET_CHECKSUM(bp); + uint64_t size = BP_GET_PSIZE(bp); + zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t actual_cksum, expected_cksum; + + if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) + return (EINVAL); + + if (ci->ci_zbt) { + expected_cksum = zbt->zbt_cksum; + zbt->zbt_cksum = zc; + ci->ci_func[0](data, size, &actual_cksum); + zbt->zbt_cksum = expected_cksum; + zc = expected_cksum; + } else { + /* ASSERT(!BP_IS_GANG(bp)); */ + ci->ci_func[0](data, size, &actual_cksum); + } + + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) { + /*printf("ZFS: read checksum failed\n");*/ + return (EIO); + } + + return (0); +} + +static int +zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize) +{ + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + /* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */ + if (!ci->ci_decompress) { + printf("ZFS: unsupported compression algorithm %d\n", cpfunc); + return (EIO); + } + + return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); +} + +static uint64_t +zap_hash(uint64_t salt, const char *name) +{ + const uint8_t *cp; + uint8_t c; + uint64_t crc = salt; + + /*ASSERT(crc != 0);*/ + /*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/ + for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the onces that we first pay attention to when + * chosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + + return (crc); +} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c index 37de21f4f548..bdb8f02ac20c 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c @@ -61,6 +61,15 @@ atomic_add_64(volatile uint64_t *target, int64_t delta) *target += delta; mtx_unlock(&atomic_mtx); } + +void +atomic_dec_64(volatile uint64_t *target) +{ + + mtx_lock(&atomic_mtx); + *target -= 1; + mtx_unlock(&atomic_mtx); +} #endif uint64_t diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c index 139d018d27a7..a24ca83e4556 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c @@ -94,7 +94,7 @@ zfs_kmem_free(void *buf, size_t size __unused) { #ifdef KMEM_DEBUG if (buf == NULL) { - printf("%s: attempt to free NULL\n",__func__); + printf("%s: attempt to free NULL\n", __func__); return; } struct kmem_item *i; @@ -156,7 +156,7 @@ kmem_cache_create(char *name, size_t bufsize, size_t align, cache->kc_constructor = constructor; cache->kc_destructor = destructor; cache->kc_private = private; -#ifdef _KERNEL +#if defined(_KERNEL) && !defined(KMEM_DEBUG) cache->kc_zone = uma_zcreate(cache->kc_name, bufsize, constructor != NULL ? kmem_std_constructor : NULL, destructor != NULL ? kmem_std_destructor : NULL, @@ -171,23 +171,23 @@ kmem_cache_create(char *name, size_t bufsize, size_t align, void kmem_cache_destroy(kmem_cache_t *cache) { +#if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zdestroy(cache->kc_zone); +#endif kmem_free(cache, sizeof(*cache)); } void * kmem_cache_alloc(kmem_cache_t *cache, int flags) { -#ifdef _KERNEL +#if defined(_KERNEL) && !defined(KMEM_DEBUG) return (uma_zalloc_arg(cache->kc_zone, cache, flags)); #else void *p; p = kmem_alloc(cache->kc_size, flags); - if (p != NULL) { - kmem_std_constructor(p, cache->kc_size, cache->kc_private, - flags); - } + if (p != NULL && cache->kc_constructor != NULL) + kmem_std_constructor(p, cache->kc_size, cache, flags); return (p); #endif } @@ -195,10 +195,11 @@ kmem_cache_alloc(kmem_cache_t *cache, int flags) void kmem_cache_free(kmem_cache_t *cache, void *buf) { -#ifdef _KERNEL +#if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zfree_arg(cache->kc_zone, buf, cache); #else - kmem_std_destructor(buf, cache->kc_size, cache->kc_private); + if (cache->kc_destructor != NULL) + kmem_std_destructor(buf, cache->kc_size, cache); kmem_free(buf, cache->kc_size); #endif } @@ -207,7 +208,9 @@ kmem_cache_free(kmem_cache_t *cache, void *buf) void kmem_cache_reap_now(kmem_cache_t *cache) { +#ifndef KMEM_DEBUG zone_drain(cache->kc_zone); +#endif } void @@ -253,6 +256,8 @@ kmem_show(void *dummy __unused) printf("KMEM_DEBUG: Leaked elements:\n\n"); LIST_FOREACH(i, &kmem_items, next) { printf("address=%p\n", i); + stack_print_ddb(&i->stack); + printf("\n"); } } mtx_unlock(&kmem_items_mtx); diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c new file mode 100644 index 000000000000..47df799b20b2 --- /dev/null +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c @@ -0,0 +1,112 @@ +/*- + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vnode.h> + +int +lookupname(char *dirname, enum uio_seg seg, enum symfollow follow, + vnode_t **dirvpp, vnode_t **compvpp) +{ + + return (lookupnameat(dirname, seg, follow, dirvpp, compvpp, NULL)); +} + +int +lookupnameat(char *dirname, enum uio_seg seg, enum symfollow follow, + vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp) +{ + struct nameidata nd; + int error, ltype; + + ASSERT(dirvpp == NULL); + + vref(startvp); + ltype = VOP_ISLOCKED(startvp); + VOP_UNLOCK(startvp, 0); + NDINIT_ATVP(&nd, LOOKUP, LOCKLEAF | MPSAFE | follow, seg, dirname, + startvp, curthread); + error = namei(&nd); + *compvpp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + vn_lock(startvp, ltype | LK_RETRY); + return (error); +} + +int +traverse(vnode_t **cvpp, int lktype) +{ + kthread_t *td = curthread; + vnode_t *cvp; + vnode_t *tvp; + vfs_t *vfsp; + int error; + + cvp = *cvpp; + tvp = NULL; + + /* + * If this vnode is mounted on, then we transparently indirect + * to the vnode which is the root of the mounted file system. + * Before we do this we must check that an unmount is not in + * progress on this vnode. + */ + + for (;;) { + /* + * Reached the end of the mount chain? + */ + vfsp = vn_mountedvfs(cvp); + if (vfsp == NULL) + break; + /* + * tvp is NULL for *cvpp vnode, which we can't unlock. + */ + if (tvp != NULL) + vput(cvp); + else + vrele(cvp); + + /* + * The read lock must be held across the call to VFS_ROOT() to + * prevent a concurrent unmount from destroying the vfs. + */ + error = VFS_ROOT(vfsp, lktype, &tvp, td); + if (error != 0) + return (error); + cvp = tvp; + } + + *cvpp = cvp; + return (0); +} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c b/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c index a89d478d70e5..279ae4c042bb 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/libkern.h> +#include <sys/limits.h> #include <sys/misc.h> #include <sys/sunddi.h> @@ -40,17 +41,30 @@ struct opensolaris_utsname utsname = { }; int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + + *result = strtol(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == LONG_MIN || *result == LONG_MAX) + return (ERANGE); + return (0); +} + +int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) { - char *end; if (str == hw_serial) { *result = hostid; return (0); } - *result = strtoul(str, &end, base); + *result = strtoul(str, nptr, base); if (*result == 0) return (EINVAL); + else if (*result == ULONG_MAX) + return (ERANGE); return (0); } diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c b/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c index 272fe59e21a7..837d736d15af 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c @@ -30,9 +30,20 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/priv.h> #include <sys/vnode.h> +#include <sys/mntent.h> #include <sys/mount.h> #include <sys/stat.h> +#include <sys/jail.h> #include <sys/policy.h> +#include <sys/zfs_vfsops.h> + +int +secpolicy_nfs(struct ucred *cred) +{ + + /* TODO: Change PRIV_ROOT! */ + return (priv_check_cred(cred, PRIV_ROOT, 0)); +} int secpolicy_zfs(struct ucred *cred) @@ -62,16 +73,32 @@ secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp __unused) return (priv_check_cred(cred, PRIV_VFS_UNMOUNT, 0)); } +int +secpolicy_fs_owner(struct mount *mp, struct ucred *cred) +{ + + if (zfs_super_owner) { + if (cred->cr_uid == mp->mnt_cred->cr_uid && + (!jailed(cred) || + cred->cr_prison == mp->mnt_cred->cr_prison)) { + return (0); + } + } + return (priv_check_cred(cred, PRIV_VFS_MOUNT_OWNER, 0)); +} + /* * This check is done in kern_link(), so we could just return 0 here. */ extern int hardlink_check_uid; int -secpolicy_basic_link(struct ucred *cred) +secpolicy_basic_link(struct vnode *vp, struct ucred *cred) { if (!hardlink_check_uid) return (0); + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); return (priv_check_cred(cred, PRIV_VFS_LINK, 0)); } @@ -83,9 +110,11 @@ secpolicy_vnode_stky_modify(struct ucred *cred) } int -secpolicy_vnode_remove(struct ucred *cred) +secpolicy_vnode_remove(struct vnode *vp, struct ucred *cred) { + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); return (priv_check_cred(cred, PRIV_VFS_ADMIN, 0)); } @@ -94,9 +123,11 @@ secpolicy_vnode_access(struct ucred *cred, struct vnode *vp, uint64_t owner, accmode_t accmode) { - if ((accmode & VREAD) && priv_check_cred(cred, PRIV_VFS_READ, 0) != 0) { + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); + + if ((accmode & VREAD) && priv_check_cred(cred, PRIV_VFS_READ, 0) != 0) return (EACCES); - } if ((accmode & VWRITE) && priv_check_cred(cred, PRIV_VFS_WRITE, 0) != 0) { return (EACCES); @@ -116,11 +147,13 @@ secpolicy_vnode_access(struct ucred *cred, struct vnode *vp, uint64_t owner, } int -secpolicy_vnode_setdac(struct ucred *cred, uid_t owner) +secpolicy_vnode_setdac(struct vnode *vp, struct ucred *cred, uid_t owner) { if (owner == cred->cr_uid) return (0); + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); return (priv_check_cred(cred, PRIV_VFS_ADMIN, 0)); } @@ -148,7 +181,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap, * In the specific case of creating a set-uid root * file, we need even more permissions. */ - error = secpolicy_vnode_setdac(cred, ovap->va_uid); + error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid); if (error) return (error); error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cred); @@ -158,7 +191,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap, vap->va_mode = ovap->va_mode; } if (mask & (AT_UID | AT_GID)) { - error = secpolicy_vnode_setdac(cred, ovap->va_uid); + error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid); if (error) return (error); @@ -170,14 +203,16 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap, if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || ((mask & AT_GID) && vap->va_gid != ovap->va_gid && !groupmember(vap->va_gid, cred))) { - error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0); - if (error) - return (error); + if (secpolicy_fs_owner(vp->v_mount, cred) != 0) { + error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0); + if (error) + return (error); + } } if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) { - secpolicy_setid_clear(vap, cred); + secpolicy_setid_clear(vap, vp, cred); } } if (mask & (AT_ATIME | AT_MTIME)) { @@ -189,7 +224,7 @@ secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap, * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ - error = secpolicy_vnode_setdac(cred, ovap->va_uid); + error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid); if (error && (vap->va_vaflags & VA_UTIMES_NULL)) error = unlocked_access(node, VWRITE, cred); if (error) @@ -206,25 +241,33 @@ secpolicy_vnode_create_gid(struct ucred *cred) } int -secpolicy_vnode_setids_setgids(struct ucred *cred, gid_t gid) +secpolicy_vnode_setids_setgids(struct vnode *vp, struct ucred *cred, gid_t gid) { - if (!groupmember(gid, cred)) - return (priv_check_cred(cred, PRIV_VFS_SETGID, 0)); - return (0); + if (groupmember(gid, cred)) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); + return (priv_check_cred(cred, PRIV_VFS_SETGID, 0)); } int -secpolicy_vnode_setid_retain(struct ucred *cred, boolean_t issuidroot __unused) +secpolicy_vnode_setid_retain(struct vnode *vp, struct ucred *cred, + boolean_t issuidroot __unused) { + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); return (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)); } void -secpolicy_setid_clear(struct vattr *vap, struct ucred *cred) +secpolicy_setid_clear(struct vattr *vap, struct vnode *vp, struct ucred *cred) { + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return; + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) { vap->va_mask |= AT_MODE; @@ -239,6 +282,9 @@ secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap, { int error; + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); + /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the process @@ -253,9 +299,61 @@ secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap, * group-id bit. */ if ((vap->va_mode & S_ISGID) != 0) { - error = secpolicy_vnode_setids_setgids(cred, ovap->va_gid); + error = secpolicy_vnode_setids_setgids(vp, cred, ovap->va_gid); if (error) return (error); } return (0); } + +int +secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp) +{ + + return (priv_check_cred(cr, PRIV_VFS_MOUNT, 0)); +} + +int +secpolicy_vnode_owner(struct vnode *vp, cred_t *cred, uid_t owner) +{ + + if (owner == cred->cr_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); + + /* XXX: vfs_suser()? */ + return (priv_check_cred(cred, PRIV_VFS_MOUNT_OWNER, 0)); +} + +int +secpolicy_vnode_chown(struct vnode *vp, cred_t *cred, boolean_t check_self) +{ + + if (secpolicy_fs_owner(vp->v_mount, cred) == 0) + return (0); + return (priv_check_cred(cred, PRIV_VFS_CHOWN, 0)); +} + +void +secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp) +{ + + if (priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER, 0) != 0) { + MNT_ILOCK(vfsp); + vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER; + vfs_clearmntopt(vfsp, MNTOPT_SETUID); + vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0); + MNT_IUNLOCK(vfsp); + } +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) +{ + + return (priv_check_cred(cr, PRIV_VFS_SYSFLAGS, 0)); +} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c index e9120ee0a8f1..f1bb4e25b85a 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> +#include <sys/malloc.h> #include <sys/mount.h> #include <sys/cred.h> #include <sys/vfs.h> @@ -110,60 +111,12 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) } int -traverse(vnode_t **cvpp, int lktype) -{ - kthread_t *td = curthread; - vnode_t *cvp; - vnode_t *tvp; - vfs_t *vfsp; - int error; - - cvp = *cvpp; - tvp = NULL; - - /* - * If this vnode is mounted on, then we transparently indirect - * to the vnode which is the root of the mounted file system. - * Before we do this we must check that an unmount is not in - * progress on this vnode. - */ - - for (;;) { - /* - * Reached the end of the mount chain? - */ - vfsp = vn_mountedvfs(cvp); - if (vfsp == NULL) - break; - /* - * tvp is NULL for *cvpp vnode, which we can't unlock. - */ - if (tvp != NULL) - vput(cvp); - else - vrele(cvp); - - /* - * The read lock must be held across the call to VFS_ROOT() to - * prevent a concurrent unmount from destroying the vfs. - */ - error = VFS_ROOT(vfsp, lktype, &tvp, td); - if (error != 0) - return (error); - cvp = tvp; - } - - *cvpp = cvp; - return (0); -} - -int domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath, char *fspec, int fsflags) { struct mount *mp; struct vfsconf *vfsp; - struct ucred *newcr, *oldcr; + struct ucred *cr; int error; /* @@ -203,29 +156,31 @@ domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath, /* * Set the mount level flags. - * crdup() can sleep, so do it before acquiring a mutex. */ - newcr = crdup(kcred); - MNT_ILOCK(mp); if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS); /* * Unprivileged user can trigger mounting a snapshot, but we don't want - * him to unmount it, so we switch to privileged credentials. + * him to unmount it, so we switch to privileged of original mount. */ - oldcr = mp->mnt_cred; - mp->mnt_cred = newcr; + crfree(mp->mnt_cred); + mp->mnt_cred = crdup(vp->v_mount->mnt_cred); mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid; - MNT_IUNLOCK(mp); - crfree(oldcr); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ + /* + * XXX: This is evil, but we can't mount a snapshot as a regular user. + * XXX: Is is safe when snapshot is mounted from within a jail? + */ + cr = td->td_ucred; + td->td_ucred = kcred; error = VFS_MOUNT(mp, td); + td->td_ucred = cr; if (!error) { if (mp->mnt_opt != NULL) diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c b/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c index 3059a787d98c..8489052373d0 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/queue.h> #include <sys/jail.h> +#include <sys/osd.h> #include <sys/priv.h> #include <sys/zone.h> @@ -52,7 +53,7 @@ typedef struct zone_dataset { LIST_HEAD(zone_dataset_head, zone_dataset); -static struct prison_service *zone_prison_service = NULL; +static int zone_slot; int zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) @@ -60,7 +61,7 @@ zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) struct zone_dataset_head *head; zone_dataset_t *zd, *zd2; struct prison *pr; - int error; + int dofree, error; if ((error = priv_check_cred(cred, PRIV_ZFS_JAIL, 0)) != 0) return (error); @@ -76,18 +77,33 @@ zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) return (ENOENT); } - head = prison_service_data_get(zone_prison_service, pr); - LIST_FOREACH(zd2, head, zd_next) { - if (strcmp(dataset, zd2->zd_dataset) == 0) { - free(zd, M_ZONES); - error = EEXIST; - goto failure; + head = osd_jail_get(pr, zone_slot); + if (head != NULL) { + dofree = 0; + LIST_FOREACH(zd2, head, zd_next) { + if (strcmp(dataset, zd2->zd_dataset) == 0) { + free(zd, M_ZONES); + error = EEXIST; + goto end; + } } + } else { + dofree = 1; + prison_hold_locked(pr); + mtx_unlock(&pr->pr_mtx); + head = malloc(sizeof(*head), M_ZONES, M_WAITOK); + LIST_INIT(head); + mtx_lock(&pr->pr_mtx); + error = osd_jail_set(pr, zone_slot, head); + KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", error)); } strcpy(zd->zd_dataset, dataset); LIST_INSERT_HEAD(head, zd, zd_next); -failure: - mtx_unlock(&pr->pr_mtx); +end: + if (dofree) + prison_free_locked(pr); + else + mtx_unlock(&pr->pr_mtx); return (error); } @@ -107,16 +123,25 @@ zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid) sx_sunlock(&allprison_lock); if (pr == NULL) return (ENOENT); - head = prison_service_data_get(zone_prison_service, pr); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) { + error = ENOENT; + goto end; + } LIST_FOREACH(zd, head, zd_next) { - if (strcmp(dataset, zd->zd_dataset) == 0) { - LIST_REMOVE(zd, zd_next); - free(zd, M_ZONES); - goto success; - } + if (strcmp(dataset, zd->zd_dataset) == 0) + break; } - error = ENOENT; -success: + if (zd == NULL) + error = ENOENT; + else { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + if (LIST_EMPTY(head)) + osd_jail_del(pr, zone_slot); + error = 0; + } +end: mtx_unlock(&pr->pr_mtx); return (error); } @@ -136,14 +161,16 @@ zone_dataset_visible(const char *dataset, int *write) if (dataset[0] == '\0') return (0); - if (INGLOBALZONE(curproc)) { + if (INGLOBALZONE(curthread)) { if (write != NULL) *write = 1; return (1); } pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); - head = prison_service_data_get(zone_prison_service, pr); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) + goto end; /* * Walk the list once, looking for datasets which match exactly, or @@ -188,49 +215,32 @@ end: return (ret); } -static int -zone_create(struct prison_service *psrv, struct prison *pr) -{ - struct zone_dataset_head *head; - - head = malloc(sizeof(*head), M_ZONES, M_WAITOK); - LIST_INIT(head); - mtx_lock(&pr->pr_mtx); - prison_service_data_set(psrv, pr, head); - mtx_unlock(&pr->pr_mtx); - return (0); -} - -static int -zone_destroy(struct prison_service *psrv, struct prison *pr) +static void +zone_destroy(void *arg) { struct zone_dataset_head *head; zone_dataset_t *zd; - mtx_lock(&pr->pr_mtx); - head = prison_service_data_del(psrv, pr); - mtx_unlock(&pr->pr_mtx); - while ((zd = LIST_FIRST(head)) != NULL) { - LIST_REMOVE(zd, zd_next); - free(zd, M_ZONES); - } - free(head, M_ZONES); - return (0); + head = arg; + while ((zd = LIST_FIRST(head)) != NULL) { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + } + free(head, M_ZONES); } static void zone_sysinit(void *arg __unused) { - zone_prison_service = prison_service_register("zfs", zone_create, - zone_destroy); + zone_slot = osd_jail_register(zone_destroy); } static void zone_sysuninit(void *arg __unused) { - prison_service_deregister(zone_prison_service); + osd_jail_deregister(zone_slot); } SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL); diff --git a/sys/cddl/compat/opensolaris/sys/atomic.h b/sys/cddl/compat/opensolaris/sys/atomic.h index 46752a58995d..1a8bd45964d7 100644 --- a/sys/cddl/compat/opensolaris/sys/atomic.h +++ b/sys/cddl/compat/opensolaris/sys/atomic.h @@ -38,6 +38,7 @@ #ifndef __LP64__ extern void atomic_add_64(volatile uint64_t *target, int64_t delta); +extern void atomic_dec_64(volatile uint64_t *target); extern void *atomic_cas_ptr(volatile void *target, void *cmp, void *newval); #endif #ifndef __sparc64__ @@ -83,6 +84,14 @@ atomic_dec_32_nv(volatile uint32_t *target) return (atomic_fetchadd_32(target, -1) - 1); } +#ifdef __LP64__ +static __inline void +atomic_dec_64(volatile uint64_t *target) +{ + atomic_subtract_64(target, 1); +} +#endif + static __inline void atomic_inc_32(volatile uint32_t *target) { diff --git a/sys/cddl/compat/opensolaris/sys/callb.h b/sys/cddl/compat/opensolaris/sys/callb.h deleted file mode 100644 index 070d0f973bf9..000000000000 --- a/sys/cddl/compat/opensolaris/sys/callb.h +++ /dev/null @@ -1,219 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD$ - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_CALLB_H -#define _SYS_CALLB_H - -#pragma ident "@(#)callb.h 1.29 05/06/23 SMI" - -#include <sys/kcondvar.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * definitions of callback classes (c_class) - * - * Callbacks belong in the same class if (1) their callback routines - * do the same kind of processing (ideally, using the same callback function) - * and (2) they can/should be executed at the same time in a cpr - * suspend/resume operation. - * - * Note: The DAEMON class, in particular, is for stopping kernel threads - * and nothing else. The CALLB_* macros below should be used to deal - * with kernel threads, and the callback function should be callb_generic_cpr. - * Another idiosyncrasy of the DAEMON class is that if a suspend operation - * fails, some of the callback functions may be called with the RESUME - * code which were never called with SUSPEND. Not a problem currently, - * but see bug 4201851. - */ -#define CB_CL_CPR_DAEMON 0 -#define CB_CL_CPR_VM 1 -#define CB_CL_CPR_CALLOUT 2 -#define CB_CL_CPR_OBP 3 -#define CB_CL_CPR_FB 4 -#define CB_CL_PANIC 5 -#define CB_CL_CPR_RPC 6 -#define CB_CL_CPR_PROMPRINTF 7 -#define CB_CL_UADMIN 8 -#define CB_CL_CPR_PM 9 -#define CB_CL_HALT 10 -#define CB_CL_CPR_DMA 11 -#define CB_CL_CPR_POST_USER 12 -#define CB_CL_UADMIN_PRE_VFS 13 -#define CB_CL_MDBOOT CB_CL_UADMIN -#define CB_CL_ENTER_DEBUGGER 14 -#define CB_CL_CPR_POST_KERNEL 15 -#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */ - -/* - * CB_CL_CPR_DAEMON class specific definitions are given below: - */ - -/* - * code for CPR callb_execute_class - */ -#define CB_CODE_CPR_CHKPT 0 -#define CB_CODE_CPR_RESUME 1 - -typedef void * callb_id_t; -/* - * Per kernel thread structure for CPR daemon callbacks. - * Must be protected by either a existing lock in the daemon or - * a new lock created for such a purpose. - */ -typedef struct callb_cpr { - kmutex_t *cc_lockp; /* lock to protect this struct */ - char cc_events; /* various events for CPR */ - callb_id_t cc_id; /* callb id address */ - kcondvar_t cc_callb_cv; /* cv for callback waiting */ - kcondvar_t cc_stop_cv; /* cv to checkpoint block */ -} callb_cpr_t; - -/* - * cc_events definitions - */ -#define CALLB_CPR_START 1 /* a checkpoint request's started */ -#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */ -#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */ - -/* - * Used when checking that all kernel threads are stopped. - */ -#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */ -#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */ -#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */ - /* due to pwr mgmt of disks, make -- */ - /* big enough for worst spinup time */ - -#ifdef _KERNEL -/* - * - * CALLB_CPR_INIT macro is used by kernel threads to add their entry to - * the callback table and perform other initialization. It automatically - * adds the thread as being in the callback class CB_CL_CPR_DAEMON. - * - * cp - ptr to the callb_cpr_t structure for this kernel thread - * - * lockp - pointer to mutex protecting the callb_cpr_t stuct - * - * func - pointer to the callback function for this kernel thread. - * It has the prototype boolean_t <func>(void *arg, int code) - * where: arg - ptr to the callb_cpr_t structure - * code - not used for this type of callback - * returns: B_TRUE if successful; B_FALSE if unsuccessful. - * - * name - a string giving the name of the kernel thread - * - * Note: lockp is the lock to protect the callb_cpr_t (cp) structure - * later on. No lock held is needed for this initialization. - */ -#define CALLB_CPR_INIT(cp, lockp, func, name) { \ - strlcpy(curthread->td_name, (name), \ - sizeof(curthread->td_name)); \ - strlcpy(curthread->td_proc->p_comm, (name), \ - sizeof(curthread->td_proc->p_comm)); \ - bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \ - (cp)->cc_lockp = lockp; \ - (cp)->cc_id = callb_add(func, (void *)(cp), \ - CB_CL_CPR_DAEMON, name); \ - } - -#ifndef __lock_lint -#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); -#else -#define CALLB_CPR_ASSERT(cp) -#endif -/* - * Some threads (like the idle threads) do not adhere to the callback - * protocol and are always considered safe. Such threads must never exit. - * They register their presence by calling this macro during their - * initialization. - * - * Args: - * t - thread pointer of the client kernel thread - * name - a string giving the name of the kernel thread - */ -#define CALLB_CPR_INIT_SAFE(t, name) { \ - (void) callb_add_thread(callb_generic_cpr_safe, \ - (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \ - name, t); \ - } -/* - * The lock to protect cp's content must be held before - * calling the following two macros. - * - * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END - * is safe for checkpoint/resume. - */ -#define CALLB_CPR_SAFE_BEGIN(cp) { \ - CALLB_CPR_ASSERT(cp) \ - (cp)->cc_events |= CALLB_CPR_SAFE; \ - if ((cp)->cc_events & CALLB_CPR_START) \ - cv_signal(&(cp)->cc_callb_cv); \ - } -#define CALLB_CPR_SAFE_END(cp, lockp) { \ - CALLB_CPR_ASSERT(cp) \ - while ((cp)->cc_events & CALLB_CPR_START) \ - cv_wait(&(cp)->cc_stop_cv, lockp); \ - (cp)->cc_events &= ~CALLB_CPR_SAFE; \ - } -/* - * cv_destroy is nop right now but may be needed in the future. - */ -#define CALLB_CPR_EXIT(cp) { \ - CALLB_CPR_ASSERT(cp) \ - (cp)->cc_events |= CALLB_CPR_SAFE; \ - if ((cp)->cc_events & CALLB_CPR_START) \ - cv_signal(&(cp)->cc_callb_cv); \ - mutex_exit((cp)->cc_lockp); \ - (void) callb_delete((cp)->cc_id); \ - cv_destroy(&(cp)->cc_callb_cv); \ - cv_destroy(&(cp)->cc_stop_cv); \ - } - -extern callb_cpr_t callb_cprinfo_safe; -extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *); -extern callb_id_t callb_add_thread(boolean_t (*)(void *, int), - void *, int, char *, kthread_id_t); -extern int callb_delete(callb_id_t); -extern void callb_execute(callb_id_t, int); -extern void *callb_execute_class(int, int); -extern boolean_t callb_generic_cpr(void *, int); -extern boolean_t callb_generic_cpr_safe(void *, int); -extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *); -extern void callb_lock_table(void); -extern void callb_unlock_table(void); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_CALLB_H */ diff --git a/sys/cddl/compat/opensolaris/sys/cred.h b/sys/cddl/compat/opensolaris/sys/cred.h index 85e79db81eda..b13ef6c62874 100644 --- a/sys/cddl/compat/opensolaris/sys/cred.h +++ b/sys/cddl/compat/opensolaris/sys/cred.h @@ -30,12 +30,14 @@ #define _OPENSOLARIS_SYS_CRED_H_ #include <sys/param.h> -#include_next <sys/ucred.h> - -#ifdef _KERNEL +#define _WANT_UCRED +#include <sys/ucred.h> +#undef _WANT_UCRED typedef struct ucred cred_t; +typedef struct ucred ucred_t; +#ifdef _KERNEL #define CRED() (curthread->td_ucred) /* @@ -43,9 +45,14 @@ typedef struct ucred cred_t; */ #define kcred (thread0.td_ucred) -#define crgetuid(cred) ((cred)->cr_uid) -#define crgetgid(cred) ((cred)->cr_gid) - -#endif /* _KERNEL */ +#define crgetuid(cred) ((cred)->cr_uid) +#define crgetgid(cred) ((cred)->cr_gid) +#define crgetgroups(cred) ((cred)->cr_groups) +#define crgetngroups(cred) ((cred)->cr_ngroups) +#define crgetsid(cred, i) (NULL) +#else /* !_KERNEL */ +#define kcred NULL +#define CRED() NULL +#endif /* !_KERNEL */ #endif /* _OPENSOLARIS_SYS_CRED_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/dnlc.h b/sys/cddl/compat/opensolaris/sys/dnlc.h index a2d4f01263fc..e978e975bc8a 100644 --- a/sys/cddl/compat/opensolaris/sys/dnlc.h +++ b/sys/cddl/compat/opensolaris/sys/dnlc.h @@ -35,6 +35,6 @@ #define dnlc_update(dvp, name, vp) do { } while (0) #define dnlc_remove(dvp, name) do { } while (0) #define dnlc_purge_vfsp(vfsp, count) (0) -#define dnlc_reduce_cache(percent) do { } while (0) +#define dnlc_reduce_cache(percent) EVENTHANDLER_INVOKE(vfs_lowvnodes, (int)(intptr_t)(percent)) #endif /* !_OPENSOLARIS_SYS_DNLC_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/file.h b/sys/cddl/compat/opensolaris/sys/file.h new file mode 100644 index 000000000000..afd10501d016 --- /dev/null +++ b/sys/cddl/compat/opensolaris/sys/file.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_FILE_H_ +#define _OPENSOLARIS_SYS_FILE_H_ + +#include_next <sys/file.h> + +#ifdef _KERNEL +typedef struct file file_t; + +static __inline file_t * +getf(int fd, int write) +{ + struct file *fp; + + if (write && fget_write(curthread, fd, &fp) == 0) + return (fp); + else if (!write && fget_read(curthread, fd, &fp) == 0) + return (fp); + return (NULL); +} + +static __inline void +releasef(file_t *fp) +{ + + fdrop(fp, curthread); +} +#endif /* _KERNEL */ + +#endif /* !_OPENSOLARIS_SYS_FILE_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/kidmap.h b/sys/cddl/compat/opensolaris/sys/kidmap.h new file mode 100644 index 000000000000..c2a33d2d3ebf --- /dev/null +++ b/sys/cddl/compat/opensolaris/sys/kidmap.h @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_KIDMAP_H_ +#define _OPENSOLARIS_SYS_KIDMAP_H_ + +#include <sys/idmap.h> + +typedef int32_t idmap_stat; +typedef void idmap_get_handle_t; + +#define kidmap_get_create() (NULL) +#define kidmap_get_destroy(hdl) do { } while (0) +#define kidmap_get_mappings(hdl) (NULL) + +#endif /* _OPENSOLARIS_SYS_KIDMAP_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/kmem.h b/sys/cddl/compat/opensolaris/sys/kmem.h index 5258cffe4edf..c103d18b4e3b 100644 --- a/sys/cddl/compat/opensolaris/sys/kmem.h +++ b/sys/cddl/compat/opensolaris/sys/kmem.h @@ -38,15 +38,16 @@ #include <vm/vm_extern.h> #define KM_SLEEP M_WAITOK +#define KM_PUSHPAGE M_WAITOK #define KM_NOSLEEP M_NOWAIT #define KMC_NODEBUG 0 typedef struct kmem_cache { char kc_name[32]; -#ifdef _KERNEL +#if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zone_t kc_zone; #else - size_t size; + size_t kc_size; #endif int (*kc_constructor)(void *, void *, int); void (*kc_destructor)(void *, void *); diff --git a/sys/cddl/compat/opensolaris/sys/misc.h b/sys/cddl/compat/opensolaris/sys/misc.h index a5a52b786eec..8e1a637a3b68 100644 --- a/sys/cddl/compat/opensolaris/sys/misc.h +++ b/sys/cddl/compat/opensolaris/sys/misc.h @@ -29,6 +29,13 @@ #ifndef _OPENSOLARIS_SYS_MISC_H_ #define _OPENSOLARIS_SYS_MISC_H_ +#define MAXUID 2147483647 + +#define SPEC_MAXOFFSET_T OFF_MAX + +#define _ACL_ACLENT_ENABLED 0x1 +#define _ACL_ACE_ENABLED 0x2 + #define _FIOFFS (INT_MIN) #define _FIOGDIO (INT_MIN+1) #define _FIOSDIO (INT_MIN+2) diff --git a/sys/cddl/compat/opensolaris/sys/mntent.h b/sys/cddl/compat/opensolaris/sys/mntent.h index e4bbc9da225b..3faea6b73430 100644 --- a/sys/cddl/compat/opensolaris/sys/mntent.h +++ b/sys/cddl/compat/opensolaris/sys/mntent.h @@ -54,5 +54,7 @@ #define MNTOPT_EXEC "exec" /* enable executables */ #define MNTOPT_NOEXEC "noexec" /* disable executables */ #define MNTOPT_RESTRICT "restrict" /* restricted autofs mount */ +#define MNTOPT_NBMAND "nbmand" /* allow non-blocking mandatory locks */ +#define MNTOPT_NONBMAND "nonbmand" /* deny non-blocking mandatory locks */ #endif /* !_OPENSOLARIS_MNTENT_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/param.h b/sys/cddl/compat/opensolaris/sys/param.h index 8d36a9d07700..609d22afe8c1 100644 --- a/sys/cddl/compat/opensolaris/sys/param.h +++ b/sys/cddl/compat/opensolaris/sys/param.h @@ -34,4 +34,8 @@ #define PAGESIZE PAGE_SIZE +#ifdef _KERNEL +#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) +#endif + #endif diff --git a/sys/cddl/compat/opensolaris/sys/pathname.h b/sys/cddl/compat/opensolaris/sys/pathname.h new file mode 100644 index 000000000000..0d396231c65d --- /dev/null +++ b/sys/cddl/compat/opensolaris/sys/pathname.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_PATHNAME_H_ +#define _OPENSOLARIS_SYS_PATHNAME_H_ + +#ifdef _KERNEL + +#include <sys/param.h> +#include <sys/vnode.h> + +typedef struct pathname { + char *pn_buf; /* underlying storage */ + char *pn_path; /* remaining pathname */ + size_t pn_pathlen; /* remaining length */ + size_t pn_bufsize; /* total size of pn_buf */ +} pathname_t; + +#define pn_alloc(pnp) panic("pn_alloc() called") +#define pn_free(pnp) panic("pn_free() called") + +int lookupname(char *, enum uio_seg, enum symfollow, vnode_t **, vnode_t **); +int lookupnameat(char *, enum uio_seg, enum symfollow, vnode_t **, vnode_t **, + vnode_t *); +int traverse(vnode_t **, int); + +#endif /* _KERNEL */ + +#endif /* _OPENSOLARIS_SYS_PATHNAME_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/policy.h b/sys/cddl/compat/opensolaris/sys/policy.h index 2c764ef29c10..08db5ca763d7 100644 --- a/sys/cddl/compat/opensolaris/sys/policy.h +++ b/sys/cddl/compat/opensolaris/sys/policy.h @@ -33,30 +33,44 @@ #ifdef _KERNEL +#include <sys/vnode.h> + struct mount; struct ucred; struct vattr; struct vnode; -int secpolicy_zfs(struct ucred *cred); -int secpolicy_sys_config(struct ucred *cred, int checkonly); -int secpolicy_zinject(struct ucred *cred); -int secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp); -int secpolicy_basic_link(struct ucred *cred); +int secpolicy_nfs(struct ucred *cred); +int secpolicy_zfs(struct ucred *cred); +int secpolicy_sys_config(struct ucred *cred, int checkonly); +int secpolicy_zinject(struct ucred *cred); +int secpolicy_fs_unmount(struct ucred *cred, struct mount *vfsp); +int secpolicy_basic_link(struct vnode *vp, struct ucred *cred); +int secpolicy_vnode_owner(struct vnode *vp, cred_t *cred, uid_t owner); +int secpolicy_vnode_chown(struct vnode *vp, cred_t *cred, + boolean_t check_self); int secpolicy_vnode_stky_modify(struct ucred *cred); -int secpolicy_vnode_remove(struct ucred *cred); +int secpolicy_vnode_remove(struct vnode *vp, struct ucred *cred); int secpolicy_vnode_access(struct ucred *cred, struct vnode *vp, uint64_t owner, accmode_t accmode); -int secpolicy_vnode_setdac(struct ucred *cred, uid_t owner); +int secpolicy_vnode_setdac(struct vnode *vp, struct ucred *cred, + uid_t owner); int secpolicy_vnode_setattr(struct ucred *cred, struct vnode *vp, struct vattr *vap, const struct vattr *ovap, int flags, int unlocked_access(void *, int, struct ucred *), void *node); int secpolicy_vnode_create_gid(struct ucred *cred); -int secpolicy_vnode_setids_setgids(struct ucred *cred, gid_t gid); -int secpolicy_vnode_setid_retain(struct ucred *cred, boolean_t issuidroot); -void secpolicy_setid_clear(struct vattr *vap, struct ucred *cred); +int secpolicy_vnode_setids_setgids(struct vnode *vp, struct ucred *cred, + gid_t gid); +int secpolicy_vnode_setid_retain(struct vnode *vp, struct ucred *cred, + boolean_t issuidroot); +void secpolicy_setid_clear(struct vattr *vap, struct vnode *vp, + struct ucred *cred); int secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap, const struct vattr *ovap, struct ucred *cred); +int secpolicy_fs_owner(struct mount *vfsp, struct ucred *cred); +int secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp); +void secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp); +int secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype); #endif /* _KERNEL */ diff --git a/sys/cddl/compat/opensolaris/sys/proc.h b/sys/cddl/compat/opensolaris/sys/proc.h index 2410396d4a9a..73fbcdadf1d2 100644 --- a/sys/cddl/compat/opensolaris/sys/proc.h +++ b/sys/cddl/compat/opensolaris/sys/proc.h @@ -54,12 +54,6 @@ typedef struct thread kthread_t; typedef struct thread *kthread_id_t; typedef struct proc proc_t; -#if (KSTACK_PAGES * PAGE_SIZE) < 16384 -#define ZFS_KSTACK_PAGES (16384 / PAGE_SIZE) -#else -#define ZFS_KSTACK_PAGES 0 -#endif - static __inline kthread_t * thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, size_t len, proc_t *pp, int state, pri_t pri) @@ -71,11 +65,10 @@ thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, * Be sure there are no surprises. */ ASSERT(stk == NULL); - ASSERT(stksize == 0); ASSERT(len == 0); ASSERT(state == TS_RUN); - error = kproc_create(proc, arg, &p, 0, ZFS_KSTACK_PAGES, + error = kproc_create(proc, arg, &p, 0, stksize / PAGE_SIZE, "solthread %p", proc); return (error == 0 ? FIRST_THREAD_IN_PROC(p) : NULL); } diff --git a/sys/cddl/compat/opensolaris/sys/refstr.h b/sys/cddl/compat/opensolaris/sys/refstr.h new file mode 100644 index 000000000000..e4e177bf435c --- /dev/null +++ b/sys/cddl/compat/opensolaris/sys/refstr.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + $ $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_REFSTR_H_ +#define _OPENSOLARIS_SYS_REFSTR_H_ + +#define refstr_value(str) (str) + +#endif /* _OPENSOLARIS_SYS_REFSTR_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/sid.h b/sys/cddl/compat/opensolaris/sys/sid.h new file mode 100644 index 000000000000..eb8d0bed3eeb --- /dev/null +++ b/sys/cddl/compat/opensolaris/sys/sid.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SID_H_ +#define _OPENSOLARIS_SYS_SID_H_ + +typedef struct ksiddomain { + char kd_name[16]; /* Domain part of SID */ +} ksiddomain_t; +typedef void ksid_t; + +static __inline ksiddomain_t * +ksid_lookupdomain(const char *domain) +{ + ksiddomain_t *kd; + + kd = kmem_alloc(sizeof(*kd), KM_SLEEP); + strlcpy(kd->kd_name, "FreeBSD", sizeof(kd->kd_name)); + return (kd); +} + +static __inline void +ksiddomain_rele(ksiddomain_t *kd) +{ + + kmem_free(kd, sizeof(*kd)); +} + +#endif /* _OPENSOLARIS_SYS_SID_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/sig.h b/sys/cddl/compat/opensolaris/sys/sig.h new file mode 100644 index 000000000000..985896ee2679 --- /dev/null +++ b/sys/cddl/compat/opensolaris/sys/sig.h @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 2008 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SIG_H_ +#define _OPENSOLARIS_SYS_SIG_H_ + +#ifdef _KERNEL + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/debug.h> + +#define FORREAL 0 +#define JUSTLOOKING 1 + +static __inline int +issig(int why) +{ + struct thread *td = curthread; + struct proc *p; + int sig; + + ASSERT(why == FORREAL || why == JUSTLOOKING); + if (SIGPENDING(td)) { + if (why == JUSTLOOKING) + return (1); + p = td->td_proc; + PROC_LOCK(p); + mtx_lock(&p->p_sigacts->ps_mtx); + sig = cursig(td); + mtx_unlock(&p->p_sigacts->ps_mtx); + PROC_UNLOCK(p); + if (sig != 0) + return (1); + } + return (0); +} + +#endif /* _KERNEL */ + +#endif /* _OPENSOLARIS_SYS_SIG_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/sunddi.h b/sys/cddl/compat/opensolaris/sys/sunddi.h index 192d5a992c34..1ca2bf09fd6d 100644 --- a/sys/cddl/compat/opensolaris/sys/sunddi.h +++ b/sys/cddl/compat/opensolaris/sys/sunddi.h @@ -29,8 +29,10 @@ #ifndef _OPENSOLARIS_SYS_SUNDDI_H_ #define _OPENSOLARIS_SYS_SUNDDI_H_ +#define ddi_driver_major(zfs_dip) (0) #define ddi_copyin(from, to, size, flag) (bcopy((from), (to), (size)), 0) #define ddi_copyout(from, to, size, flag) (bcopy((from), (to), (size)), 0) +int ddi_strtol(const char *str, char **nptr, int base, long *result); int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result); #endif /* _OPENSOLARIS_SYS_SUNDDI_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/sysmacros.h b/sys/cddl/compat/opensolaris/sys/sysmacros.h index a179c5f869da..7f4885bac600 100644 --- a/sys/cddl/compat/opensolaris/sys/sysmacros.h +++ b/sys/cddl/compat/opensolaris/sys/sysmacros.h @@ -39,6 +39,10 @@ extern "C" { #endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif + /* * Macro for checking power of 2 address alignment. */ diff --git a/sys/cddl/compat/opensolaris/sys/time.h b/sys/cddl/compat/opensolaris/sys/time.h index 770b25163191..0bf1e9bf6b82 100644 --- a/sys/cddl/compat/opensolaris/sys/time.h +++ b/sys/cddl/compat/opensolaris/sys/time.h @@ -40,6 +40,9 @@ typedef longlong_t hrtime_t; #define LBOLT ((gethrtime() * hz) / NANOSEC) +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX) + #ifdef _KERNEL #define lbolt64 (int64_t)(LBOLT) diff --git a/sys/cddl/compat/opensolaris/sys/types.h b/sys/cddl/compat/opensolaris/sys/types.h index 7d5d9e41ed14..069ad45b2f32 100644 --- a/sys/cddl/compat/opensolaris/sys/types.h +++ b/sys/cddl/compat/opensolaris/sys/types.h @@ -44,13 +44,15 @@ typedef u_char uchar_t; typedef u_short ushort_t; typedef u_long ulong_t; typedef long long longlong_t; -typedef unsigned long long u_longlong_t; +typedef unsigned long long u_longlong_t; typedef off_t off64_t; typedef id_t taskid_t; typedef id_t projid_t; typedef id_t poolid_t; typedef id_t zoneid_t; typedef id_t ctid_t; +typedef mode_t o_mode_t; +typedef uint64_t pgcnt_t; #ifdef _KERNEL @@ -60,8 +62,8 @@ typedef id_t ctid_t; typedef short index_t; typedef off_t offset_t; typedef long ptrdiff_t; /* pointer difference */ -typedef void pathname_t; typedef int64_t rlim64_t; +typedef int major_t; #else #ifdef NEED_SOLARIS_BOOLEAN @@ -80,7 +82,6 @@ typedef short pri_t; typedef int32_t daddr32_t; typedef int32_t time32_t; typedef u_longlong_t diskaddr_t; -typedef ushort_t o_mode_t; /* old file attribute type */ #endif /* !_KERNEL */ diff --git a/sys/cddl/compat/opensolaris/sys/uio.h b/sys/cddl/compat/opensolaris/sys/uio.h index d219ff0e960c..9e53457baf2b 100644 --- a/sys/cddl/compat/opensolaris/sys/uio.h +++ b/sys/cddl/compat/opensolaris/sys/uio.h @@ -60,6 +60,6 @@ zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio) return (uiomove(cp, (int)n, uio)); } #define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio)) -#endif +#endif /* BUILDING_ZFS */ #endif /* !_OPENSOLARIS_SYS_UIO_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/vfs.h b/sys/cddl/compat/opensolaris/sys/vfs.h index c2d8a6b71119..be3e3cfe5fcf 100644 --- a/sys/cddl/compat/opensolaris/sys/vfs.h +++ b/sys/cddl/compat/opensolaris/sys/vfs.h @@ -45,6 +45,7 @@ typedef struct mount vfs_t; #define vfs_count mnt_ref #define vfs_fsid mnt_stat.f_fsid #define vfs_bsize mnt_stat.f_bsize +#define vfs_resource mnt_stat.f_mntfromname #define v_flag v_vflag #define v_vfsp v_mount @@ -64,6 +65,8 @@ typedef struct mount vfs_t; MNT_IUNLOCK(vfsp); \ } while (0) +#define fs_vscan(vp, cr, async) (0) + #define VROOT VV_ROOT /* @@ -107,10 +110,21 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, int flags __unused); void vfs_clearmntopt(vfs_t *vfsp, const char *name); int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp); -int traverse(vnode_t **cvpp, int lktype); int domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath, char *fspec, int fsflags); +typedef uint64_t vfs_feature_t; + +#define VFSFT_XVATTR 0x100000001 /* Supports xvattr for attrs */ +#define VFSFT_CASEINSENSITIVE 0x100000002 /* Supports case-insensitive */ +#define VFSFT_NOCASESENSITIVE 0x100000004 /* NOT case-sensitive */ +#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */ +#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */ +#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */ + +#define vfs_set_feature(vfsp, feature) do { } while (0) +#define vfs_has_feature(vfsp, feature) (0) + #endif /* _KERNEL */ #endif /* _OPENSOLARIS_SYS_VFS_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h index a8a261cc2449..b490d337d5e7 100644 --- a/sys/cddl/compat/opensolaris/sys/vnode.h +++ b/sys/cddl/compat/opensolaris/sys/vnode.h @@ -29,24 +29,32 @@ #ifndef _OPENSOLARIS_SYS_VNODE_H_ #define _OPENSOLARIS_SYS_VNODE_H_ +struct vnode; +struct vattr; + +typedef struct vnode vnode_t; +typedef struct vattr vattr_t; +typedef enum vtype vtype_t; + +#include <sys/namei.h> +enum symfollow { NO_FOLLOW = NOFOLLOW }; + +#include <sys/proc.h> #include_next <sys/vnode.h> #include <sys/mount.h> #include <sys/cred.h> #include <sys/fcntl.h> -#include <sys/namei.h> -#include <sys/proc.h> +#include <sys/file.h> #include <sys/filedesc.h> #include <sys/syscallsubr.h> -typedef struct vnode vnode_t; -typedef struct vattr vattr_t; -typedef void caller_context_t; - typedef struct vop_vector vnodeops_t; #define vop_fid vop_vptofh #define vop_fid_args vop_vptofh_args #define a_fid a_fhp +#define IS_XATTRDIR(dvp) (0) + #define v_count v_usecount static __inline int @@ -59,23 +67,24 @@ vn_is_readonly(vnode_t *vp) #define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) #define vn_mountedvfs(vp) ((vp)->v_mountedhere) #define vn_has_cached_data(vp) ((vp)->v_object != NULL && (vp)->v_object->resident_page_count > 0) +#define vn_exists(vp) do { } while (0) +#define vn_invalid(vp) do { } while (0) +#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) +#define vn_free(vp) do { } while (0) #define VN_HOLD(v) vref(v) #define VN_RELE(v) vrele(v) #define VN_URELE(v) vput(v) -#define VOP_REALVP(vp, vpp) (*(vpp) = (vp), 0) - -#define vnevent_remove(vp) do { } while (0) -#define vnevent_rmdir(vp) do { } while (0) -#define vnevent_rename_src(vp) do { } while (0) -#define vnevent_rename_dest(vp) do { } while (0) - +#define VOP_REALVP(vp, vpp, ct) (*(vpp) = (vp), 0) -#define IS_DEVVP(vp) \ - ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) - -#define MODEMASK ALLPERMS +#define vnevent_create(vp, ct) do { } while (0) +#define vnevent_link(vp, ct) do { } while (0) +#define vnevent_remove(vp, dvp, name, ct) do { } while (0) +#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest_dir(vp, ct) do { } while (0) #define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) #define MANDMODE(mode) (0) @@ -98,24 +107,6 @@ vn_is_readonly(vnode_t *vp) #define MAXOFFSET_T OFF_MAX #define EXCL 0 -#define AT_TYPE 0x0001 -#define AT_MODE 0x0002 -#define AT_UID 0x0004 -#define AT_GID 0x0008 -#define AT_FSID 0x0010 -#define AT_NODEID 0x0020 -#define AT_NLINK 0x0040 -#define AT_SIZE 0x0080 -#define AT_ATIME 0x0100 -#define AT_MTIME 0x0200 -#define AT_CTIME 0x0400 -#define AT_RDEV 0x0800 -#define AT_BLKSIZE 0x1000 -#define AT_NBLOCKS 0x2000 -#define AT_SEQ 0x4000 -#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\ - AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) - #define ACCESSED (AT_ATIME) #define STATE_CHANGED (AT_CTIME) #define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) @@ -140,28 +131,37 @@ vattr_init_mask(vattr_t *vap) vap->va_mask |= AT_MTIME; if (vap->va_mode != (u_short)VNOVAL) vap->va_mask |= AT_MODE; + if (vap->va_flags != VNOVAL) + vap->va_mask |= AT_XVATTR; } -#define FCREAT O_CREAT -#define FTRUNC O_TRUNC -#define FDSYNC FFSYNC -#define FRSYNC FFSYNC -#define FSYNC FFSYNC -#define FOFFMAX 0x00 - -enum create { CRCREAT }; +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FDSYNC FFSYNC +#define FRSYNC FFSYNC +#define FSYNC FFSYNC +#define FOFFMAX 0x00 +#define FIGNORECASE 0x00 static __inline int -zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, - vnode_t **vpp, enum create crwhy, mode_t umask) +vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode, + vnode_t **vpp, enum create crwhy, mode_t umask, struct vnode *startvp, + int fd) { struct thread *td = curthread; struct nameidata nd; - int error; + int error, operation; ASSERT(seg == UIO_SYSSPACE); - ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); - ASSERT(crwhy == CRCREAT); + if ((filemode & FCREAT) != 0) { + ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); + ASSERT(crwhy == CRCREAT); + operation = CREATE; + } else { + ASSERT(filemode == (FREAD | FWRITE | FOFFMAX)); + ASSERT(crwhy == 0); + operation = LOOKUP; + } ASSERT(umask == 0); if (td->td_proc->p_fd->fd_rdir == NULL) @@ -169,7 +169,10 @@ zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, if (td->td_proc->p_fd->fd_cdir == NULL) td->td_proc->p_fd->fd_cdir = rootvnode; - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pnamep, td); + if (startvp != NULL) + vref(startvp); + NDINIT_ATVP(&nd, operation, NOFOLLOW | MPSAFE, UIO_SYSSPACE, pnamep, + startvp, td); error = vn_open_cred(&nd, &filemode, createmode, td->td_ucred, NULL); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) { @@ -180,6 +183,15 @@ zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, } return (error); } + +static __inline int +zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, + vnode_t **vpp, enum create crwhy, mode_t umask) +{ + + return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, + umask, NULL, -1)); +} #define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \ zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask)) @@ -192,14 +204,16 @@ zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len, struct thread *td = curthread; int error, vfslocked, resid; - ASSERT(rw == UIO_WRITE); ASSERT(ioflag == 0); ASSERT(ulimit == RLIM64_INFINITY); - ioflag = IO_APPEND | IO_UNIT; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + if (rw == UIO_WRITE) { + ioflag = IO_SYNC; + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + } else { + ioflag = IO_DIRECT; + } error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED, &resid, td); VFS_UNLOCK_GIANT(vfslocked); @@ -229,7 +243,7 @@ drop: VFS_UNLOCK_GIANT(vfslocked); return (error); } -#define VOP_FSYNC(vp, flag, cr) zfs_vop_fsync((vp), (flag), (cr)) +#define VOP_FSYNC(vp, flag, cr, ct) zfs_vop_fsync((vp), (flag), (cr)) static __inline int zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) @@ -241,7 +255,7 @@ zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) return (vn_close(vp, flag, cr, curthread)); } -#define VOP_CLOSE(vp, oflags, count, offset, cr) \ +#define VOP_CLOSE(vp, oflags, count, offset, cr, ct) \ zfs_vop_close((vp), (oflags), (count), (offset), (cr)) static __inline int @@ -253,7 +267,6 @@ vn_rename(char *from, char *to, enum uio_seg seg) return (kern_rename(curthread, from, to, seg)); } -enum rm { RMFILE }; static __inline int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) { diff --git a/sys/cddl/compat/opensolaris/sys/zone.h b/sys/cddl/compat/opensolaris/sys/zone.h index 2e47eb17b44e..d761310a1a81 100644 --- a/sys/cddl/compat/opensolaris/sys/zone.h +++ b/sys/cddl/compat/opensolaris/sys/zone.h @@ -38,9 +38,9 @@ */ /* - * Is process in the global zone? + * Is thread in the global zone? */ -#define INGLOBALZONE(p) (!jailed((p)->p_ucred)) +#define INGLOBALZONE(thread) (!jailed((thread)->td_ucred)) /* * Attach the given dataset to the given jail. @@ -61,8 +61,6 @@ extern int zone_dataset_visible(const char *, int *); #define GLOBAL_ZONEID 0 -extern int getzoneid(void); - #endif /* _KERNEL */ #endif /* !_OPENSOLARIS_SYS_ZONE_H_ */ diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c index 2f32e7a8a78a..e6b678079635 100644 --- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c +++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,100 +19,245 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/types.h> -#include <sys/acl.h> #include <sys/stat.h> +#include <sys/avl.h> +#include <sys/misc.h> #if defined(_KERNEL) +#include <sys/kmem.h> #include <sys/systm.h> +#include <sys/sysmacros.h> +#include <acl/acl_common.h> #include <sys/debug.h> #else #include <errno.h> #include <stdlib.h> +#include <stddef.h> #include <strings.h> +#include <unistd.h> #include <assert.h> +#include <grp.h> +#include <pwd.h> +#include <acl_common.h> #define ASSERT assert #endif +#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \ + ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \ + ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL) + + +#define ACL_SYNCHRONIZE_SET_DENY 0x0000001 +#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002 +#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004 +#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008 + +#define ACL_WRITE_OWNER_SET_DENY 0x0000010 +#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020 +#define ACL_WRITE_OWNER_ERR_DENY 0x0000040 +#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080 + +#define ACL_DELETE_SET_DENY 0x0000100 +#define ACL_DELETE_SET_ALLOW 0x0000200 +#define ACL_DELETE_ERR_DENY 0x0000400 +#define ACL_DELETE_ERR_ALLOW 0x0000800 + +#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000 +#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000 +#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000 +#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000 + +#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000 +#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000 +#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000 +#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000 + +#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000 +#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000 +#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000 +#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000 + +#define ACL_READ_NAMED_READER_SET_DENY 0x1000000 +#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000 +#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000 +#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000 + + +#define ACE_VALID_MASK_BITS (\ + ACE_READ_DATA | \ + ACE_LIST_DIRECTORY | \ + ACE_WRITE_DATA | \ + ACE_ADD_FILE | \ + ACE_APPEND_DATA | \ + ACE_ADD_SUBDIRECTORY | \ + ACE_READ_NAMED_ATTRS | \ + ACE_WRITE_NAMED_ATTRS | \ + ACE_EXECUTE | \ + ACE_DELETE_CHILD | \ + ACE_READ_ATTRIBUTES | \ + ACE_WRITE_ATTRIBUTES | \ + ACE_DELETE | \ + ACE_READ_ACL | \ + ACE_WRITE_ACL | \ + ACE_WRITE_OWNER | \ + ACE_SYNCHRONIZE) + +#define ACE_MASK_UNDEFINED 0x80000000 + +#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \ + ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \ + ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE) + +/* + * ACL conversion helpers + */ + +typedef enum { + ace_unused, + ace_user_obj, + ace_user, + ace_group, /* includes GROUP and GROUP_OBJ */ + ace_other_obj +} ace_to_aent_state_t; + +typedef struct acevals { + uid_t key; + avl_node_t avl; + uint32_t mask; + uint32_t allowed; + uint32_t denied; + int aent_type; +} acevals_t; + +typedef struct ace_list { + acevals_t user_obj; + avl_tree_t user; + int numusers; + acevals_t group_obj; + avl_tree_t group; + int numgroups; + acevals_t other_obj; + uint32_t acl_mask; + int hasmask; + int dfacl_flag; + ace_to_aent_state_t state; + int seen; /* bitmask of all aclent_t a_type values seen */ +} ace_list_t; ace_t trivial_acl[] = { - {-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE}, - {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| + {(uid_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE}, + {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE}, - {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_DENIED_ACE_TYPE}, - {-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_ALLOWED_ACE_TYPE}, - {-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES| + {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, + ACE_ACCESS_DENIED_ACE_TYPE}, + {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, + ACE_ACCESS_ALLOWED_ACE_TYPE}, + {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES| ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE}, - {-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + {(uid_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE} }; void -adjust_ace_pair(ace_t *pair, mode_t mode) +adjust_ace_pair_common(void *pair, size_t access_off, + size_t pairsize, mode_t mode) { + char *datap = (char *)pair; + uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off); + uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize + + access_off); if (mode & S_IROTH) - pair[1].a_access_mask |= ACE_READ_DATA; + *amask1 |= ACE_READ_DATA; else - pair[0].a_access_mask |= ACE_READ_DATA; + *amask0 |= ACE_READ_DATA; if (mode & S_IWOTH) - pair[1].a_access_mask |= - ACE_WRITE_DATA|ACE_APPEND_DATA; + *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA; else - pair[0].a_access_mask |= - ACE_WRITE_DATA|ACE_APPEND_DATA; + *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA; if (mode & S_IXOTH) - pair[1].a_access_mask |= ACE_EXECUTE; + *amask1 |= ACE_EXECUTE; else - pair[0].a_access_mask |= ACE_EXECUTE; + *amask0 |= ACE_EXECUTE; +} + +void +adjust_ace_pair(ace_t *pair, mode_t mode) +{ + adjust_ace_pair_common(pair, offsetof(ace_t, a_access_mask), + sizeof (ace_t), mode); +} + +static void +ace_allow_deny_helper(uint16_t type, boolean_t *allow, boolean_t *deny) +{ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE) + *allow = B_TRUE; + else if (type == ACE_ACCESS_DENIED_ACE_TYPE) + *deny = B_TRUE; } /* * ace_trivial: * determine whether an ace_t acl is trivial * - * Trivialness implys that the acl is composed of only + * Trivialness implies that the acl is composed of only * owner, group, everyone entries. ACL can't * have read_acl denied, and write_owner/write_acl/write_attributes * can only be owner@ entry. */ int -ace_trivial(ace_t *acep, int aclcnt) +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) { - int i; - int owner_seen = 0; - int group_seen = 0; - int everyone_seen = 0; + boolean_t owner_allow = B_FALSE; + boolean_t group_allow = B_FALSE; + boolean_t everyone_allow = B_FALSE; + boolean_t owner_deny = B_FALSE; + boolean_t group_deny = B_FALSE; + boolean_t everyone_deny = B_FALSE; + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; - for (i = 0; i != aclcnt; i++) { - switch (acep[i].a_flags & 0xf040) { + while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) { + switch (flags & ACE_TYPE_FLAGS) { case ACE_OWNER: - if (group_seen || everyone_seen) + if (group_allow || group_deny || everyone_allow || + everyone_deny) return (1); - owner_seen++; + ace_allow_deny_helper(type, &owner_allow, &owner_deny); break; case ACE_GROUP|ACE_IDENTIFIER_GROUP: - if (everyone_seen || owner_seen == 0) + if (everyone_allow || everyone_deny && + (!owner_allow && !owner_deny)) return (1); - group_seen++; + ace_allow_deny_helper(type, &group_allow, &group_deny); break; case ACE_EVERYONE: - if (owner_seen == 0 || group_seen == 0) + if (!owner_allow && !owner_deny && + !group_allow && !group_deny) return (1); - everyone_seen++; + ace_allow_deny_helper(type, + &everyone_allow, &everyone_deny); break; default: return (1); } - if (acep[i].a_flags & (ACE_FILE_INHERIT_ACE| + if (flags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| ACE_INHERIT_ONLY_ACE)) return (1); @@ -124,27 +268,49 @@ ace_trivial(ace_t *acep, int aclcnt) * Don't allow anybody to deny reading basic * attributes or a files ACL. */ - if ((acep[i].a_access_mask & - (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - (acep[i].a_type == ACE_ACCESS_DENIED_ACE_TYPE)) + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) return (1); /* * Allow on owner@ to allow * write_acl/write_owner/write_attributes */ - if (acep[i].a_type == ACE_ACCESS_ALLOWED_ACE_TYPE && - (!(acep[i].a_flags & ACE_OWNER) && (acep[i].a_access_mask & + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES)))) return (1); + } - if ((owner_seen == 0) || (group_seen == 0) || (everyone_seen == 0)) - return (1); + if (!owner_allow || !owner_deny || !group_allow || !group_deny || + !everyone_allow || !everyone_deny) + return (1); return (0); } +uint64_t +ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, + uint16_t *type, uint32_t *mask) +{ + ace_t *acep = datap; + + if (cookie >= aclcnt) + return (0); + + *flags = acep[cookie].a_flags; + *type = acep[cookie].a_type; + *mask = acep[cookie++].a_access_mask; + + return (cookie); +} + +int +ace_trivial(ace_t *acep, int aclcnt) +{ + return (ace_trivial_common(acep, aclcnt, ace_walk)); +} /* * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. @@ -171,8 +337,8 @@ ksort(caddr_t v, int n, int s, int (*f)()) for (g = n / 2; g > 0; g /= 2) { for (i = g; i < n; i++) { for (j = i - g; j >= 0 && - (*f)(v + j * s, v + (j + g) * s) == 1; - j -= g) { + (*f)(v + j * s, v + (j + g) * s) == 1; + j -= g) { p1 = (void *)(v + j * s); p2 = (void *)(v + (j + g) * s); for (ii = 0; ii < s / 4; ii++) { @@ -215,3 +381,1347 @@ cmp2acls(void *a, void *b) /* Totally equal */ return (0); } + +/*ARGSUSED*/ +static void * +cacl_realloc(void *ptr, size_t size, size_t new_size) +{ +#if defined(_KERNEL) + void *tmp; + + tmp = kmem_alloc(new_size, KM_SLEEP); + (void) memcpy(tmp, ptr, (size < new_size) ? size : new_size); + kmem_free(ptr, size); + return (tmp); +#else + return (realloc(ptr, new_size)); +#endif +} + +static int +cacl_malloc(void **ptr, size_t size) +{ +#if defined(_KERNEL) + *ptr = kmem_zalloc(size, KM_SLEEP); + return (0); +#else + *ptr = calloc(1, size); + if (*ptr == NULL) + return (errno); + + return (0); +#endif +} + +/*ARGSUSED*/ +static void +cacl_free(void *ptr, size_t size) +{ +#if defined(_KERNEL) + kmem_free(ptr, size); +#else + free(ptr); +#endif +} + +acl_t * +acl_alloc(enum acl_type type) +{ + acl_t *aclp; + + if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0) + return (NULL); + + aclp->acl_aclp = NULL; + aclp->acl_cnt = 0; + + switch (type) { + case ACE_T: + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + break; + case ACLENT_T: + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + break; + default: + acl_free(aclp); + aclp = NULL; + } + return (aclp); +} + +/* + * Free acl_t structure + */ +void +acl_free(acl_t *aclp) +{ + int acl_size; + + if (aclp == NULL) + return; + + if (aclp->acl_aclp) { + acl_size = aclp->acl_cnt * aclp->acl_entry_size; + cacl_free(aclp->acl_aclp, acl_size); + } + + cacl_free(aclp, sizeof (acl_t)); +} + +static uint32_t +access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) +{ + uint32_t access_mask = 0; + int acl_produce; + int synchronize_set = 0, write_owner_set = 0; + int delete_set = 0, write_attrs_set = 0; + int read_named_set = 0, write_named_set = 0; + + acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_WRITER_SET_DENY); + + if (isallow) { + synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW; + write_owner_set = ACL_WRITE_OWNER_SET_ALLOW; + delete_set = ACL_DELETE_SET_ALLOW; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_ALLOW; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + } else { + + synchronize_set = ACL_SYNCHRONIZE_SET_DENY; + write_owner_set = ACL_WRITE_OWNER_SET_DENY; + delete_set = ACL_DELETE_SET_DENY; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_DENY; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY; + else + /* + * If the entity is not the owner and does not + * have write permissions ACE_WRITE_ATTRIBUTES will + * always go in the DENY ACE. + */ + access_mask |= ACE_WRITE_ATTRIBUTES; + } + + if (acl_produce & synchronize_set) + access_mask |= ACE_SYNCHRONIZE; + if (acl_produce & write_owner_set) + access_mask |= ACE_WRITE_OWNER; + if (acl_produce & delete_set) + access_mask |= ACE_DELETE; + if (acl_produce & write_attrs_set) + access_mask |= ACE_WRITE_ATTRIBUTES; + if (acl_produce & read_named_set) + access_mask |= ACE_READ_NAMED_ATTRS; + if (acl_produce & write_named_set) + access_mask |= ACE_WRITE_NAMED_ATTRS; + + return (access_mask); +} + +/* + * Given an mode_t, convert it into an access_mask as used + * by nfsace, assuming aclent_t -> nfsace semantics. + */ +static uint32_t +mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow) +{ + uint32_t access = 0; + int haswriteperm = 0; + int hasreadperm = 0; + + if (isallow) { + haswriteperm = (mode & S_IWOTH); + hasreadperm = (mode & S_IROTH); + } else { + haswriteperm = !(mode & S_IWOTH); + hasreadperm = !(mode & S_IROTH); + } + + /* + * The following call takes care of correctly setting the following + * mask bits in the access_mask: + * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE, + * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS + */ + access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow); + + if (isallow) { + access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES; + if (isowner) + access |= ACE_WRITE_ACL; + } else { + if (! isowner) + access |= ACE_WRITE_ACL; + } + + /* read */ + if (mode & S_IROTH) { + access |= ACE_READ_DATA; + } + /* write */ + if (mode & S_IWOTH) { + access |= ACE_WRITE_DATA | + ACE_APPEND_DATA; + if (isdir) + access |= ACE_DELETE_CHILD; + } + /* exec */ + if (mode & 01) { + access |= ACE_EXECUTE; + } + + return (access); +} + +/* + * Given an nfsace (presumably an ALLOW entry), make a + * corresponding DENY entry at the address given. + */ +static void +ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner) +{ + (void) memcpy(deny, allow, sizeof (ace_t)); + + deny->a_who = allow->a_who; + + deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS; + if (isdir) + deny->a_access_mask ^= ACE_DELETE_CHILD; + + deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER | + ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS | + ACE_WRITE_NAMED_ATTRS); + deny->a_access_mask |= access_mask_set((allow->a_access_mask & + ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner, + B_FALSE); +} +/* + * Make an initial pass over an array of aclent_t's. Gather + * information such as an ACL_MASK (if any), number of users, + * number of groups, and whether the array needs to be sorted. + */ +static int +ln_aent_preprocess(aclent_t *aclent, int n, + int *hasmask, mode_t *mask, + int *numuser, int *numgroup, int *needsort) +{ + int error = 0; + int i; + int curtype = 0; + + *hasmask = 0; + *mask = 07; + *needsort = 0; + *numuser = 0; + *numgroup = 0; + + for (i = 0; i < n; i++) { + if (aclent[i].a_type < curtype) + *needsort = 1; + else if (aclent[i].a_type > curtype) + curtype = aclent[i].a_type; + if (aclent[i].a_type & USER) + (*numuser)++; + if (aclent[i].a_type & (GROUP | GROUP_OBJ)) + (*numgroup)++; + if (aclent[i].a_type & CLASS_OBJ) { + if (*hasmask) { + error = EINVAL; + goto out; + } else { + *hasmask = 1; + *mask = aclent[i].a_perm; + } + } + } + + if ((! *hasmask) && (*numuser + *numgroup > 1)) { + error = EINVAL; + goto out; + } + +out: + return (error); +} + +/* + * Convert an array of aclent_t into an array of nfsace entries, + * following POSIX draft -> nfsv4 conversion semantics as outlined in + * the IETF draft. + */ +static int +ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir) +{ + int error = 0; + mode_t mask; + int numuser, numgroup, needsort; + int resultsize = 0; + int i, groupi = 0, skip; + ace_t *acep, *result = NULL; + int hasmask; + + error = ln_aent_preprocess(aclent, n, &hasmask, &mask, + &numuser, &numgroup, &needsort); + if (error != 0) + goto out; + + /* allow + deny for each aclent */ + resultsize = n * 2; + if (hasmask) { + /* + * stick extra deny on the group_obj and on each + * user|group for the mask (the group_obj was added + * into the count for numgroup) + */ + resultsize += numuser + numgroup; + /* ... and don't count the mask itself */ + resultsize -= 2; + } + + /* sort the source if necessary */ + if (needsort) + ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls); + + if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) + goto out; + + acep = result; + + for (i = 0; i < n; i++) { + /* + * don't process CLASS_OBJ (mask); mask was grabbed in + * ln_aent_preprocess() + */ + if (aclent[i].a_type & CLASS_OBJ) + continue; + + /* If we need an ACL_MASK emulator, prepend it now */ + if ((hasmask) && + (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) { + acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + acep->a_flags = 0; + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= + (ACE_IDENTIFIER_GROUP|ACE_GROUP); + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + } else { + acep->a_who = aclent[i].a_id; + acep->a_flags |= ACE_IDENTIFIER_GROUP; + } + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + /* + * Set the access mask for the prepended deny + * ace. To do this, we invert the mask (found + * in ln_aent_preprocess()) then convert it to an + * DENY ace access_mask. + */ + acep->a_access_mask = mode_to_ace_access((mask ^ 07), + isdir, 0, 0); + acep += 1; + } + + /* handle a_perm -> access_mask */ + acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm, + isdir, aclent[i].a_type & USER_OBJ, 1); + + /* emulate a default aclent */ + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + + /* + * handle a_perm and a_id + * + * this must be done last, since it involves the + * corresponding deny aces, which are handled + * differently for each different a_type. + */ + if (aclent[i].a_type & USER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_OWNER; + ace_make_deny(acep, acep + 1, isdir, B_TRUE); + acep += 2; + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) { + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_GROUP; + } else { + acep->a_who = aclent[i].a_id; + } + acep->a_flags |= ACE_IDENTIFIER_GROUP; + /* + * Set the corresponding deny for the group ace. + * + * The deny aces go after all of the groups, unlike + * everything else, where they immediately follow + * the allow ace. + * + * We calculate "skip", the number of slots to + * skip ahead for the deny ace, here. + * + * The pattern is: + * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3 + * thus, skip is + * (2 * numgroup) - 1 - groupi + * (2 * numgroup) to account for MD + A + * - 1 to account for the fact that we're on the + * access (A), not the mask (MD) + * - groupi to account for the fact that we have + * passed up groupi number of MD's. + */ + skip = (2 * numgroup) - 1 - groupi; + ace_make_deny(acep, acep + skip, isdir, B_FALSE); + /* + * If we just did the last group, skip acep past + * all of the denies; else, just move ahead one. + */ + if (++groupi >= numgroup) + acep += numgroup + 1; + else + acep += 1; + } else if (aclent[i].a_type & OTHER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_EVERYONE; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else { + error = EINVAL; + goto out; + } + } + + *acepp = result; + *rescount = resultsize; + +out: + if (error != 0) { + if ((result != NULL) && (resultsize > 0)) { + cacl_free(result, resultsize * sizeof (ace_t)); + } + } + + return (error); +} + +static int +convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir, + ace_t **retacep, int *retacecnt) +{ + ace_t *acep; + ace_t *dfacep; + int acecnt = 0; + int dfacecnt = 0; + int dfaclstart = 0; + int dfaclcnt = 0; + aclent_t *aclp; + int i; + int error; + int acesz, dfacesz; + + ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls); + + for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) { + if (aclp->a_type & ACL_DEFAULT) + break; + } + + if (i < aclcnt) { + dfaclstart = i; + dfaclcnt = aclcnt - i; + } + + if (dfaclcnt && isdir == 0) { + return (EINVAL); + } + + error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir); + if (error) + return (error); + + if (dfaclcnt) { + error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt, + &dfacep, &dfacecnt, isdir); + if (error) { + if (acep) { + cacl_free(acep, acecnt * sizeof (ace_t)); + } + return (error); + } + } + + if (dfacecnt != 0) { + acesz = sizeof (ace_t) * acecnt; + dfacesz = sizeof (ace_t) * dfacecnt; + acep = cacl_realloc(acep, acesz, acesz + dfacesz); + if (acep == NULL) + return (ENOMEM); + if (dfaclcnt) { + (void) memcpy(acep + acecnt, dfacep, dfacesz); + } + } + if (dfaclcnt) + cacl_free(dfacep, dfacecnt * sizeof (ace_t)); + + *retacecnt = acecnt + dfacecnt; + *retacep = acep; + return (0); +} + +static int +ace_mask_to_mode(uint32_t mask, o_mode_t *modep, int isdir) +{ + int error = 0; + o_mode_t mode = 0; + uint32_t bits, wantbits; + + /* read */ + if (mask & ACE_READ_DATA) + mode |= S_IROTH; + + /* write */ + wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA); + if (isdir) + wantbits |= ACE_DELETE_CHILD; + bits = mask & wantbits; + if (bits != 0) { + if (bits != wantbits) { + error = ENOTSUP; + goto out; + } + mode |= S_IWOTH; + } + + /* exec */ + if (mask & ACE_EXECUTE) { + mode |= S_IXOTH; + } + + *modep = mode; + +out: + return (error); +} + +static void +acevals_init(acevals_t *vals, uid_t key) +{ + bzero(vals, sizeof (*vals)); + vals->allowed = ACE_MASK_UNDEFINED; + vals->denied = ACE_MASK_UNDEFINED; + vals->mask = ACE_MASK_UNDEFINED; + vals->key = key; +} + +static void +ace_list_init(ace_list_t *al, int dfacl_flag) +{ + acevals_init(&al->user_obj, 0); + acevals_init(&al->group_obj, 0); + acevals_init(&al->other_obj, 0); + al->numusers = 0; + al->numgroups = 0; + al->acl_mask = 0; + al->hasmask = 0; + al->state = ace_unused; + al->seen = 0; + al->dfacl_flag = dfacl_flag; +} + +/* + * Find or create an acevals holder for a given id and avl tree. + * + * Note that only one thread will ever touch these avl trees, so + * there is no need for locking. + */ +static acevals_t * +acevals_find(ace_t *ace, avl_tree_t *avl, int *num) +{ + acevals_t key, *rc; + avl_index_t where; + + key.key = ace->a_who; + rc = avl_find(avl, &key, &where); + if (rc != NULL) + return (rc); + + /* this memory is freed by ln_ace_to_aent()->ace_list_free() */ + if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0) + return (NULL); + + acevals_init(rc, ace->a_who); + avl_insert(avl, rc, where); + (*num)++; + + return (rc); +} + +static int +access_mask_check(ace_t *acep, int mask_bit, int isowner) +{ + int set_deny, err_deny; + int set_allow, err_allow; + int acl_consume; + int haswriteperm, hasreadperm; + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1; + } else { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0; + } + + acl_consume = (ACL_SYNCHRONIZE_ERR_DENY | + ACL_DELETE_ERR_DENY | + ACL_WRITE_OWNER_ERR_DENY | + ACL_WRITE_OWNER_ERR_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_ERR_DENY | + ACL_WRITE_ATTRS_WRITER_SET_DENY | + ACL_WRITE_ATTRS_WRITER_ERR_ALLOW | + ACL_WRITE_NAMED_WRITER_ERR_DENY | + ACL_READ_NAMED_READER_ERR_DENY); + + if (mask_bit == ACE_SYNCHRONIZE) { + set_deny = ACL_SYNCHRONIZE_SET_DENY; + err_deny = ACL_SYNCHRONIZE_ERR_DENY; + set_allow = ACL_SYNCHRONIZE_SET_ALLOW; + err_allow = ACL_SYNCHRONIZE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_OWNER) { + set_deny = ACL_WRITE_OWNER_SET_DENY; + err_deny = ACL_WRITE_OWNER_ERR_DENY; + set_allow = ACL_WRITE_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_OWNER_ERR_ALLOW; + } else if (mask_bit == ACE_DELETE) { + set_deny = ACL_DELETE_SET_DENY; + err_deny = ACL_DELETE_ERR_DENY; + set_allow = ACL_DELETE_SET_ALLOW; + err_allow = ACL_DELETE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_ATTRIBUTES) { + if (isowner) { + set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW; + } else if (haswriteperm) { + set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW; + } else { + if ((acep->a_access_mask & mask_bit) && + (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) { + return (ENOTSUP); + } + return (0); + } + } else if (mask_bit == ACE_READ_NAMED_ATTRS) { + if (!hasreadperm) + return (0); + + set_deny = ACL_READ_NAMED_READER_SET_DENY; + err_deny = ACL_READ_NAMED_READER_ERR_DENY; + set_allow = ACL_READ_NAMED_READER_SET_ALLOW; + err_allow = ACL_READ_NAMED_READER_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) { + if (!haswriteperm) + return (0); + + set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY; + err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY; + set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW; + } else { + return (EINVAL); + } + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + if (acl_consume & set_deny) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_deny) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } else { + /* ACE_ACCESS_ALLOWED_ACE_TYPE */ + if (acl_consume & set_allow) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_allow) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } + return (0); +} + +static int +ace_to_aent_legal(ace_t *acep) +{ + int error = 0; + int isowner; + + /* only ALLOW or DENY */ + if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) && + (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid flags */ + if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) { + error = EINVAL; + goto out; + } + + /* some flags are illegal */ + if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG | + ACE_FAILED_ACCESS_ACE_FLAG | + ACE_NO_PROPAGATE_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid masks */ + if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) { + error = EINVAL; + goto out; + } + + if ((acep->a_flags & ACE_OWNER)) { + isowner = 1; + } else { + isowner = 0; + } + + error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_OWNER, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_DELETE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner); + if (error) + goto out; + + /* more detailed checking of masks */ + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (! (acep->a_access_mask & ACE_APPEND_DATA))) { + error = ENOTSUP; + goto out; + } + if ((! (acep->a_access_mask & ACE_WRITE_DATA)) && + (acep->a_access_mask & ACE_APPEND_DATA)) { + error = ENOTSUP; + goto out; + } + } + + /* ACL enforcement */ + if ((acep->a_access_mask & ACE_READ_ACL) && + (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + if (acep->a_access_mask & ACE_WRITE_ACL) { + if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) && + (isowner)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) && + (! isowner)) { + error = ENOTSUP; + goto out; + } + } + +out: + return (error); +} + +static int +ace_allow_to_mode(uint32_t mask, o_mode_t *modep, int isdir) +{ + /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ + if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != + (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) { + return (ENOTSUP); + } + + return (ace_mask_to_mode(mask, modep, isdir)); +} + +static int +acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, + uid_t owner, gid_t group, int isdir) +{ + int error; + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + + if (isdir) + flips |= ACE_DELETE_CHILD; + if (vals->allowed != (vals->denied ^ flips)) { + error = ENOTSUP; + goto out; + } + if ((list->hasmask) && (list->acl_mask != vals->mask) && + (vals->aent_type & (USER | GROUP | GROUP_OBJ))) { + error = ENOTSUP; + goto out; + } + error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir); + if (error != 0) + goto out; + dest->a_type = vals->aent_type; + if (dest->a_type & (USER | GROUP)) { + dest->a_id = vals->key; + } else if (dest->a_type & USER_OBJ) { + dest->a_id = owner; + } else if (dest->a_type & GROUP_OBJ) { + dest->a_id = group; + } else if (dest->a_type & OTHER_OBJ) { + dest->a_id = 0; + } else { + error = EINVAL; + goto out; + } + +out: + return (error); +} + + +static int +ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, + uid_t owner, gid_t group, int isdir) +{ + int error = 0; + aclent_t *aent, *result = NULL; + acevals_t *vals; + int resultcount; + + if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) != + (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) { + error = ENOTSUP; + goto out; + } + if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) { + error = ENOTSUP; + goto out; + } + + resultcount = 3 + list->numusers + list->numgroups; + /* + * This must be the same condition as below, when we add the CLASS_OBJ + * (aka ACL mask) + */ + if ((list->hasmask) || (! list->dfacl_flag)) + resultcount += 1; + + if (cacl_malloc((void **)&result, + resultcount * sizeof (aclent_t)) != 0) { + error = ENOMEM; + goto out; + } + aent = result; + + /* USER_OBJ */ + if (!(list->user_obj.aent_type & USER_OBJ)) { + error = EINVAL; + goto out; + } + + error = acevals_to_aent(&list->user_obj, aent, list, owner, group, + isdir); + + if (error != 0) + goto out; + ++aent; + /* USER */ + vals = NULL; + for (vals = avl_first(&list->user); vals != NULL; + vals = AVL_NEXT(&list->user, vals)) { + if (!(vals->aent_type & USER)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* GROUP_OBJ */ + if (!(list->group_obj.aent_type & GROUP_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->group_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + /* GROUP */ + vals = NULL; + for (vals = avl_first(&list->group); vals != NULL; + vals = AVL_NEXT(&list->group, vals)) { + if (!(vals->aent_type & GROUP)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* + * CLASS_OBJ (aka ACL_MASK) + * + * An ACL_MASK is not fabricated if the ACL is a default ACL. + * This is to follow UFS's behavior. + */ + if ((list->hasmask) || (! list->dfacl_flag)) { + if (list->hasmask) { + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + if (isdir) + flips |= ACE_DELETE_CHILD; + error = ace_mask_to_mode(list->acl_mask ^ flips, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } else { + /* fabricate the ACL_MASK from the group permissions */ + error = ace_mask_to_mode(list->group_obj.allowed, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } + aent->a_id = 0; + aent->a_type = CLASS_OBJ | list->dfacl_flag; + ++aent; + } + /* OTHER_OBJ */ + if (!(list->other_obj.aent_type & OTHER_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->other_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + + *aclentp = result; + *aclcnt = resultcount; + +out: + if (error != 0) { + if (result != NULL) + cacl_free(result, resultcount * sizeof (aclent_t)); + } + + return (error); +} + + +/* + * free all data associated with an ace_list + */ +static void +ace_list_free(ace_list_t *al) +{ + acevals_t *node; + void *cookie; + + if (al == NULL) + return; + + cookie = NULL; + while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + cookie = NULL; + while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + + avl_destroy(&al->user); + avl_destroy(&al->group); + + /* free the container itself */ + cacl_free(al, sizeof (ace_list_t)); +} + +static int +acevals_compare(const void *va, const void *vb) +{ + const acevals_t *a = va, *b = vb; + + if (a->key == b->key) + return (0); + + if (a->key > b->key) + return (1); + + else + return (-1); +} + +/* + * Convert a list of ace_t entries to equivalent regular and default + * aclent_t lists. Return error (ENOTSUP) when conversion is not possible. + */ +static int +ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, + aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, + int isdir) +{ + int error = 0; + ace_t *acep; + uint32_t bits; + int i; + ace_list_t *normacl = NULL, *dfacl = NULL, *acl; + acevals_t *vals; + + *aclentp = NULL; + *aclcnt = 0; + *dfaclentp = NULL; + *dfaclcnt = 0; + + /* we need at least user_obj, group_obj, and other_obj */ + if (n < 6) { + error = ENOTSUP; + goto out; + } + if (ace == NULL) { + error = EINVAL; + goto out; + } + + error = cacl_malloc((void **)&normacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&normacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&normacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + + ace_list_init(normacl, 0); + + error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + ace_list_init(dfacl, ACL_DEFAULT); + + /* process every ace_t... */ + for (i = 0; i < n; i++) { + acep = &ace[i]; + + /* rule out certain cases quickly */ + error = ace_to_aent_legal(acep); + if (error != 0) + goto out; + + /* + * Turn off these bits in order to not have to worry about + * them when doing the checks for compliments. + */ + acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE | + ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS); + + /* see if this should be a regular or default acl */ + bits = acep->a_flags & + (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE); + if (bits != 0) { + /* all or nothing on these inherit bits */ + if (bits != (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + acl = dfacl; + } else { + acl = normacl; + } + + if ((acep->a_flags & ACE_OWNER)) { + if (acl->state > ace_user_obj) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user_obj; + acl->seen |= USER_OBJ; + vals = &acl->user_obj; + vals->aent_type = USER_OBJ | acl->dfacl_flag; + } else if ((acep->a_flags & ACE_EVERYONE)) { + acl->state = ace_other_obj; + acl->seen |= OTHER_OBJ; + vals = &acl->other_obj; + vals->aent_type = OTHER_OBJ | acl->dfacl_flag; + } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) { + if (acl->state > ace_group) { + error = ENOTSUP; + goto out; + } + if ((acep->a_flags & ACE_GROUP)) { + acl->seen |= GROUP_OBJ; + vals = &acl->group_obj; + vals->aent_type = GROUP_OBJ | acl->dfacl_flag; + } else { + acl->seen |= GROUP; + vals = acevals_find(acep, &acl->group, + &acl->numgroups); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = GROUP | acl->dfacl_flag; + } + acl->state = ace_group; + } else { + if (acl->state > ace_user) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user; + acl->seen |= USER; + vals = acevals_find(acep, &acl->user, + &acl->numusers); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = USER | acl->dfacl_flag; + } + + if (!(acl->state > ace_unused)) { + error = EINVAL; + goto out; + } + + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + /* no more than one allowed per aclent_t */ + if (vals->allowed != ACE_MASK_UNDEFINED) { + error = ENOTSUP; + goto out; + } + vals->allowed = acep->a_access_mask; + } else { + /* + * it's a DENY; if there was a previous DENY, it + * must have been an ACL_MASK. + */ + if (vals->denied != ACE_MASK_UNDEFINED) { + /* ACL_MASK is for USER and GROUP only */ + if ((acl->state != ace_user) && + (acl->state != ace_group)) { + error = ENOTSUP; + goto out; + } + + if (! acl->hasmask) { + acl->hasmask = 1; + acl->acl_mask = vals->denied; + /* check for mismatched ACL_MASK emulations */ + } else if (acl->acl_mask != vals->denied) { + error = ENOTSUP; + goto out; + } + vals->mask = vals->denied; + } + vals->denied = acep->a_access_mask; + } + } + + /* done collating; produce the aclent_t lists */ + if (normacl->state != ace_unused) { + error = ace_list_to_aent(normacl, aclentp, aclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + if (dfacl->state != ace_unused) { + error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + +out: + if (normacl != NULL) + ace_list_free(normacl); + if (dfacl != NULL) + ace_list_free(dfacl); + + return (error); +} + +static int +convert_ace_to_aent(ace_t *acebufp, int acecnt, int isdir, + uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) +{ + int error = 0; + aclent_t *aclentp, *dfaclentp; + int aclcnt, dfaclcnt; + int aclsz, dfaclsz; + + error = ln_ace_to_aent(acebufp, acecnt, owner, group, + &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir); + + if (error) + return (error); + + + if (dfaclcnt != 0) { + /* + * Slap aclentp and dfaclentp into a single array. + */ + aclsz = sizeof (aclent_t) * aclcnt; + dfaclsz = sizeof (aclent_t) * dfaclcnt; + aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz); + if (aclentp != NULL) { + (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz); + } else { + error = ENOMEM; + } + } + + if (aclentp) { + *retaclentp = aclentp; + *retaclcnt = aclcnt + dfaclcnt; + } + + if (dfaclentp) + cacl_free(dfaclentp, dfaclsz); + + return (error); +} + + +int +acl_translate(acl_t *aclp, int target_flavor, int isdir, uid_t owner, + gid_t group) +{ + int aclcnt; + void *acldata; + int error; + + /* + * See if we need to translate + */ + if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) || + (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACLENT_T)) + return (0); + + if (target_flavor == -1) { + error = EINVAL; + goto out; + } + + if (target_flavor == _ACL_ACE_ENABLED && + aclp->acl_type == ACLENT_T) { + error = convert_aent_to_ace(aclp->acl_aclp, + aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt); + if (error) + goto out; + + } else if (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACE_T) { + error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt, + isdir, owner, group, (aclent_t **)&acldata, &aclcnt); + if (error) + goto out; + } else { + error = ENOTSUP; + goto out; + } + + /* + * replace old acl with newly translated acl + */ + cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size); + aclp->acl_aclp = acldata; + aclp->acl_cnt = aclcnt; + if (target_flavor == _ACL_ACE_ENABLED) { + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + } else { + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + } + return (0); + +out: + +#if !defined(_KERNEL) + errno = error; + return (-1); +#else + return (error); +#endif +} diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h index 2227ad77ea93..84bd04f52fd6 100644 --- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h +++ b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#ifndef _ACL_ACL_UTILS_H -#define _ACL_ACL_UTILS_H +#ifndef _ACL_COMMON_H +#define _ACL_COMMON_H #pragma ident "%Z%%M% %I% %E% SMI" @@ -34,7 +33,7 @@ #include <sys/acl.h> #include <sys/stat.h> -#ifdef __cplusplus +#ifdef __cplusplus extern "C" { #endif @@ -42,15 +41,21 @@ extern ace_t trivial_acl[6]; extern int acltrivial(const char *); extern void adjust_ace_pair(ace_t *pair, mode_t mode); +extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t); extern int ace_trivial(ace_t *acep, int aclcnt); +extern int ace_trivial_common(void *, int, + uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, + uint32_t *mask)); +extern acl_t *acl_alloc(acl_type_t); +extern void acl_free(acl_t *aclp); +extern int acl_translate(acl_t *aclp, int target_flavor, + int isdir, uid_t owner, gid_t group); void ksort(caddr_t v, int n, int s, int (*f)()); int cmp2acls(void *a, void *b); - - -#ifdef __cplusplus +#ifdef __cplusplus } #endif -#endif /* _ACL_ACL_UTILS_H */ +#endif /* _ACL_COMMON_H */ diff --git a/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S index 2e62aa420d91..6851086c1f96 100644 --- a/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S +++ b/sys/cddl/contrib/opensolaris/common/atomic/amd64/atomic.S @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,14 +18,13 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - .ident "%Z%%M% %I% %E% SMI" - - .file "%M%" + .file "atomic.s" #define _ASM #include <sys/asm_linkage.h> diff --git a/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S index bc7f22acc8d4..57f7d0a47b53 100644 --- a/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S +++ b/sys/cddl/contrib/opensolaris/common/atomic/i386/atomic.S @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,18 +18,54 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - .ident "%Z%%M% %I% %E% SMI" - - .file "%M%" + .file "atomic.s" #define _ASM #include <sys/asm_linkage.h> + /* + * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever + * separated, it is important to edit the libc i386 platform + * specific mapfile and remove the NODYNSORT attribute + * from atomic_dec_64_nv. + */ + ENTRY(atomic_dec_64) + ALTENTRY(atomic_dec_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi // %edi = target address + movl (%edi), %eax + movl 4(%edi), %edx // %edx:%eax = old value +1: + xorl %ebx, %ebx + xorl %ecx, %ecx + not %ecx + not %ebx // %ecx:%ebx = -1 + addl %eax, %ebx + adcl %edx, %ecx // add in the carry from inc + lock + cmpxchg8b (%edi) // try to stick it in + jne 1b + movl %ebx, %eax + movl %ecx, %edx // return new value + popl %ebx + popl %edi + ret + SET_SIZE(atomic_dec_64_nv) + SET_SIZE(atomic_dec_64) + + /* + * NOTE: If atomic_add_64 and atomic_add_64_nv are ever + * separated, it is important to edit the libc i386 platform + * specific mapfile and remove the NODYNSORT attribute + * from atomic_add_64_nv. + */ ENTRY(atomic_add_64) ALTENTRY(atomic_add_64_nv) pushl %edi diff --git a/sys/cddl/contrib/opensolaris/common/avl/avl.c b/sys/cddl/contrib/opensolaris/common/avl/avl.c index 1fa2236607bf..01aa3cb2fa9d 100644 --- a/sys/cddl/contrib/opensolaris/common/avl/avl.c +++ b/sys/cddl/contrib/opensolaris/common/avl/avl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -808,6 +808,64 @@ avl_remove(avl_tree_t *tree, void *data) } while (parent != NULL); } +#define AVL_REINSERT(tree, obj) \ + avl_remove((tree), (obj)); \ + avl_add((tree), (obj)) + +boolean_t +avl_update_lt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) <= 0)); + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update_gt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) >= 0)); + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update(avl_tree_t *t, void *obj) +{ + void *neighbor; + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + /* * initialize a new AVL tree */ @@ -853,6 +911,12 @@ avl_numnodes(avl_tree_t *tree) return (tree->avl_numnodes); } +boolean_t +avl_is_empty(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes == 0); +} #define CHILDBIT (1L) diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c index d3d5bed525ab..9cdc534ffc1b 100644 --- a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c +++ b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,15 +34,17 @@ #if defined(_KERNEL) && !defined(_BOOT) #include <sys/varargs.h> +#include <sys/sunddi.h> #else #include <stdarg.h> -#include <strings.h> +#include <stdlib.h> +#include <string.h> #endif #ifndef offsetof -#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) #endif - +#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++ /* * nvpair.c - Provides kernel & userland interfaces for manipulating @@ -201,7 +203,7 @@ nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) static void nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) { - bzero(priv, sizeof (priv)); + bzero(priv, sizeof (nvpriv_t)); priv->nvp_nva = nva; priv->nvp_stat = stat; @@ -395,6 +397,9 @@ i_validate_type_nelem(data_type_t type, uint_t nelem) case DATA_TYPE_STRING: case DATA_TYPE_HRTIME: case DATA_TYPE_NVLIST: +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: +#endif if (nelem != 1) return (EINVAL); break; @@ -733,6 +738,11 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem) case DATA_TYPE_UINT64: value_sz = sizeof (uint64_t); break; +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: + value_sz = sizeof (double); + break; +#endif case DATA_TYPE_STRING: if (data == NULL) value_sz = 0; @@ -1017,6 +1027,14 @@ nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); } +#if !defined(_KERNEL) +int +nvlist_add_double(nvlist_t *nvl, const char *name, double val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); +} +#endif + int nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) { @@ -1123,13 +1141,15 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) curr = NVPAIR2I_NVP(nvp); /* - * Ensure that nvp is an valid pointer. + * Ensure that nvp is a valid nvpair on this nvlist. + * NB: nvp_curr is used only as a hint so that we don't always + * have to walk the list to determine if nvp is still on the list. */ if (nvp == NULL) curr = priv->nvp_list; - else if (priv->nvp_curr == curr) + else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_next; - else if (nvlist_contains_nvp(nvl, nvp) == 0) + else curr = NULL; priv->nvp_curr = curr; @@ -1149,6 +1169,27 @@ nvpair_type(nvpair_t *nvp) return (NVP_TYPE(nvp)); } +int +nvpair_type_is_array(nvpair_t *nvp) +{ + data_type_t type = NVP_TYPE(nvp); + + if ((type == DATA_TYPE_BYTE_ARRAY) || + (type == DATA_TYPE_UINT8_ARRAY) || + (type == DATA_TYPE_INT16_ARRAY) || + (type == DATA_TYPE_UINT16_ARRAY) || + (type == DATA_TYPE_INT32_ARRAY) || + (type == DATA_TYPE_UINT32_ARRAY) || + (type == DATA_TYPE_INT64_ARRAY) || + (type == DATA_TYPE_UINT64_ARRAY) || + (type == DATA_TYPE_BOOLEAN_ARRAY) || + (type == DATA_TYPE_STRING_ARRAY) || + (type == DATA_TYPE_NVLIST_ARRAY)) + return (1); + return (0); + +} + static int nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) { @@ -1176,6 +1217,9 @@ nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: +#endif if (data == NULL) return (EINVAL); bcopy(NVP_VALUE(nvp), data, @@ -1312,6 +1356,14 @@ nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); } +#if !defined(_KERNEL) +int +nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); +} +#endif + int nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) { @@ -1446,6 +1498,9 @@ nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) case DATA_TYPE_HRTIME: case DATA_TYPE_STRING: case DATA_TYPE_NVLIST: +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: +#endif val = va_arg(ap, void *); ret = nvlist_lookup_common(nvl, name, type, NULL, val); break; @@ -1479,6 +1534,224 @@ nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) return (ret); } +/* + * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function + * returns zero and a pointer to the matching nvpair is returned in '*ret' + * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate + * multiple levels of embedded nvlists, with 'sep' as the separator. As an + * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or + * "a.d[3].e[1]". This matches the C syntax for array embed (for convience, + * code also supports "a.d[3]e[1]" syntax). + * + * If 'ip' is non-NULL and the last name component is an array, return the + * value of the "...[index]" array index in *ip. For an array reference that + * is not indexed, *ip will be returned as -1. If there is a syntax error in + * 'name', and 'ep' is non-NULL then *ep will be set to point to the location + * inside the 'name' string where the syntax error was detected. + */ +static int +nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, + nvpair_t **ret, int *ip, char **ep) +{ + nvpair_t *nvp; + const char *np; + char *sepp; + char *idxp, *idxep; + nvlist_t **nva; + long idx; + int n; + + if (ip) + *ip = -1; /* not indexed */ + if (ep) + *ep = NULL; + + if ((nvl == NULL) || (name == NULL)) + return (EINVAL); + + /* step through components of name */ + for (np = name; np && *np; np = sepp) { + /* ensure unique names */ + if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME)) + return (ENOTSUP); + + /* skip white space */ + skip_whitespace(np); + if (*np == 0) + break; + + /* set 'sepp' to end of current component 'np' */ + if (sep) + sepp = strchr(np, sep); + else + sepp = NULL; + + /* find start of next "[ index ]..." */ + idxp = strchr(np, '['); + + /* if sepp comes first, set idxp to NULL */ + if (sepp && idxp && (sepp < idxp)) + idxp = NULL; + + /* + * At this point 'idxp' is set if there is an index + * expected for the current component. + */ + if (idxp) { + /* set 'n' to length of current 'np' name component */ + n = idxp++ - np; + + /* keep sepp up to date for *ep use as we advance */ + skip_whitespace(idxp); + sepp = idxp; + + /* determine the index value */ +#if defined(_KERNEL) && !defined(_BOOT) + if (ddi_strtol(idxp, &idxep, 0, &idx)) + goto fail; +#else + idx = strtol(idxp, &idxep, 0); +#endif + if (idxep == idxp) + goto fail; + + /* keep sepp up to date for *ep use as we advance */ + sepp = idxep; + + /* skip white space index value and check for ']' */ + skip_whitespace(sepp); + if (*sepp++ != ']') + goto fail; + + /* for embedded arrays, support C syntax: "a[1].b" */ + skip_whitespace(sepp); + if (sep && (*sepp == sep)) + sepp++; + } else if (sepp) { + n = sepp++ - np; + } else { + n = strlen(np); + } + + /* trim trailing whitespace by reducing length of 'np' */ + if (n == 0) + goto fail; + for (n--; (np[n] == ' ') || (np[n] == '\t'); n--) + ; + n++; + + /* skip whitespace, and set sepp to NULL if complete */ + if (sepp) { + skip_whitespace(sepp); + if (*sepp == 0) + sepp = NULL; + } + + /* + * At this point: + * o 'n' is the length of current 'np' component. + * o 'idxp' is set if there was an index, and value 'idx'. + * o 'sepp' is set to the beginning of the next component, + * and set to NULL if we have no more components. + * + * Search for nvpair with matching component name. + */ + for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(nvl, nvp)) { + + /* continue if no match on name */ + if (strncmp(np, nvpair_name(nvp), n) || + (strlen(nvpair_name(nvp)) != n)) + continue; + + /* if indexed, verify type is array oriented */ + if (idxp && !nvpair_type_is_array(nvp)) + goto fail; + + /* + * Full match found, return nvp and idx if this + * was the last component. + */ + if (sepp == NULL) { + if (ret) + *ret = nvp; + if (ip && idxp) + *ip = (int)idx; /* return index */ + return (0); /* found */ + } + + /* + * More components: current match must be + * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY + * to support going deeper. + */ + if (nvpair_type(nvp) == DATA_TYPE_NVLIST) { + nvl = EMBEDDED_NVL(nvp); + break; + } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) { + (void) nvpair_value_nvlist_array(nvp, + &nva, (uint_t *)&n); + if ((n < 0) || (idx >= n)) + goto fail; + nvl = nva[idx]; + break; + } + + /* type does not support more levels */ + goto fail; + } + if (nvp == NULL) + goto fail; /* 'name' not found */ + + /* search for match of next component in embedded 'nvl' list */ + } + +fail: if (ep && sepp) + *ep = sepp; + return (EINVAL); +} + +/* + * Return pointer to nvpair with specified 'name'. + */ +int +nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret) +{ + return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL)); +} + +/* + * Determine if named nvpair exists in nvlist (use embedded separator of '.' + * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed + * description. + */ +int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, + const char *name, nvpair_t **ret, int *ip, char **ep) +{ + return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep)); +} + +boolean_t +nvlist_exists(nvlist_t *nvl, const char *name) +{ + nvpriv_t *priv; + nvpair_t *nvp; + i_nvp_t *curr; + + if (name == NULL || nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (B_FALSE); + + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { + nvp = &curr->nvi_nvp; + + if (strcmp(name, NVP_NAME(nvp)) == 0) + return (B_TRUE); + } + + return (B_FALSE); +} + int nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val) { @@ -1539,6 +1812,14 @@ nvpair_value_uint64(nvpair_t *nvp, uint64_t *val) return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); } +#if !defined(_KERNEL) +int +nvpair_value_double(nvpair_t *nvp, double *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val)); +} +#endif + int nvpair_value_string(nvpair_t *nvp, char **val) { @@ -2676,7 +2957,11 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) */ ret = xdr_longlong_t(xdr, (void *)buf); break; - +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: + ret = xdr_double(xdr, (void *)buf); + break; +#endif case DATA_TYPE_STRING: ret = xdr_string(xdr, &buf, buflen - 1); break; @@ -2782,6 +3067,9 @@ nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: +#endif nvp_sz += 8; break; diff --git a/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c new file mode 100644 index 000000000000..73cf74a4d159 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/unicode/u8_textprep.c @@ -0,0 +1,2130 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). + * + * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), + * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also + * the section 3C man pages. + * Interface stability: Committed. + */ + +#include <sys/types.h> +#ifdef _KERNEL +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#else +#include <strings.h> +#endif /* _KERNEL */ +#include <sys/byteorder.h> +#include <sys/errno.h> +#include <sys/u8_textprep.h> +#include <sys/u8_textprep_data.h> + + +/* The maximum possible number of bytes in a UTF-8 character. */ +#define U8_MB_CUR_MAX (4) + +/* + * The maximum number of bytes needed for a UTF-8 character to cover + * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. + */ +#define U8_MAX_BYTES_UCS2 (3) + +/* The maximum possible number of bytes in a Stream-Safe Text. */ +#define U8_STREAM_SAFE_TEXT_MAX (128) + +/* + * The maximum number of characters in a combining/conjoining sequence and + * the actual upperbound limit of a combining/conjoining sequence. + */ +#define U8_MAX_CHARS_A_SEQ (32) +#define U8_UPPER_LIMIT_IN_A_SEQ (31) + +/* The combining class value for Starter. */ +#define U8_COMBINING_CLASS_STARTER (0) + +/* + * Some Hangul related macros at below. + * + * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, + * Vowels, and optional Trailing consonants in Unicode scalar values. + * + * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not + * the actual U+11A8. This is due to that the trailing consonant is optional + * and thus we are doing a pre-calculation of subtracting one. + * + * Each of 19 modern leading consonants has total 588 possible syllables since + * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for + * no trailing consonant case, i.e., 21 x 28 = 588. + * + * We also have bunch of Hangul related macros at below. Please bear in mind + * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is + * a Hangul Jamo or not but the value does not guarantee that it is a Hangul + * Jamo; it just guarantee that it will be most likely. + */ +#define U8_HANGUL_SYL_FIRST (0xAC00U) +#define U8_HANGUL_SYL_LAST (0xD7A3U) + +#define U8_HANGUL_JAMO_L_FIRST (0x1100U) +#define U8_HANGUL_JAMO_L_LAST (0x1112U) +#define U8_HANGUL_JAMO_V_FIRST (0x1161U) +#define U8_HANGUL_JAMO_V_LAST (0x1175U) +#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) +#define U8_HANGUL_JAMO_T_LAST (0x11C2U) + +#define U8_HANGUL_V_COUNT (21) +#define U8_HANGUL_VT_COUNT (588) +#define U8_HANGUL_T_COUNT (28) + +#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) + +#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ + (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ + (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ + (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); + +#define U8_HANGUL_JAMO_L(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) + +#define U8_HANGUL_JAMO_V(u) \ + ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) + +#define U8_HANGUL_JAMO_T(u) \ + ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_JAMO(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_SYLLABLE(u) \ + ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) + +#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ + ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) + +#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ + ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) + +/* The types of decomposition mappings. */ +#define U8_DECOMP_BOTH (0xF5U) +#define U8_DECOMP_CANONICAL (0xF6U) + +/* The indicator for 16-bit table. */ +#define U8_16BIT_TABLE_INDICATOR (0x8000U) + +/* The following are some convenience macros. */ +#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ + (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \ + (uint32_t)(b3) & 0x3F; + +#define U8_SIMPLE_SWAP(a, b, t) \ + (t) = (a); \ + (a) = (b); \ + (b) = (t); + +#define U8_ASCII_TOUPPER(c) \ + (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) + +#define U8_ASCII_TOLOWER(c) \ + (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) + +#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) +/* + * The following macro assumes that the two characters that are to be + * swapped are adjacent to each other and 'a' comes before 'b'. + * + * If the assumptions are not met, then, the macro will fail. + */ +#define U8_SWAP_COMB_MARKS(a, b) \ + for (k = 0; k < disp[(a)]; k++) \ + u8t[k] = u8s[start[(a)] + k]; \ + for (k = 0; k < disp[(b)]; k++) \ + u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ + start[(b)] = start[(a)] + disp[(b)]; \ + for (k = 0; k < disp[(a)]; k++) \ + u8s[start[(b)] + k] = u8t[k]; \ + U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ + U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); + +/* The possible states during normalization. */ +typedef enum { + U8_STATE_START = 0, + U8_STATE_HANGUL_L = 1, + U8_STATE_HANGUL_LV = 2, + U8_STATE_HANGUL_LVT = 3, + U8_STATE_HANGUL_V = 4, + U8_STATE_HANGUL_T = 5, + U8_STATE_COMBINING_MARK = 6 +} u8_normalization_states_t; + +/* + * The three vectors at below are used to check bytes of a given UTF-8 + * character are valid and not containing any malformed byte values. + * + * We used to have a quite relaxed UTF-8 binary representation but then there + * was some security related issues and so the Unicode Consortium defined + * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it + * one more time at the Unicode 3.2. The following three tables are based on + * that. + */ + +#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) + +#define I_ U8_ILLEGAL_CHAR +#define O_ U8_OUT_OF_RANGE_CHAR + +const int8_t u8_number_of_bytes[0x100] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ + I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + +/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ + 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, +}; + +#undef I_ +#undef O_ + +const uint8_t u8_valid_min_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* C8 C9 CA CB CC CD CE CF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D8 D9 DA DB DC DD DE DF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E8 E9 EA EB EC ED EE EF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const uint8_t u8_valid_max_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* C8 C9 CA CB CC CD CE CF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D8 D9 DA DB DC DD DE DF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E8 E9 EA EB EC ED EE EF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + + +/* + * The u8_validate() validates on the given UTF-8 character string and + * calculate the byte length. It is quite similar to mblen(3C) except that + * this will validate against the list of characters if required and + * specific to UTF-8 and Unicode. + */ +int +u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) +{ + uchar_t *ib; + uchar_t *ibtail; + uchar_t **p; + uchar_t *s1; + uchar_t *s2; + uchar_t f; + int sz; + size_t i; + int ret_val; + boolean_t second; + boolean_t no_need_to_validate_entire; + boolean_t check_additional; + boolean_t validate_ucs2_range_only; + + if (! u8str) + return (0); + + ib = (uchar_t *)u8str; + ibtail = ib + n; + + ret_val = 0; + + no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); + check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; + validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; + + while (ib < ibtail) { + /* + * The first byte of a UTF-8 character tells how many + * bytes will follow for the character. If the first byte + * is an illegal byte value or out of range value, we just + * return -1 with an appropriate error number. + */ + sz = u8_number_of_bytes[*ib]; + if (sz == U8_ILLEGAL_CHAR) { + *errnum = EILSEQ; + return (-1); + } + + if (sz == U8_OUT_OF_RANGE_CHAR || + (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { + *errnum = ERANGE; + return (-1); + } + + /* + * If we don't have enough bytes to check on, that's also + * an error. As you can see, we give illegal byte sequence + * checking higher priority then EINVAL cases. + */ + if ((ibtail - ib) < sz) { + *errnum = EINVAL; + return (-1); + } + + if (sz == 1) { + ib++; + ret_val++; + } else { + /* + * Check on the multi-byte UTF-8 character. For more + * details on this, see comment added for the used + * data structures at the beginning of the file. + */ + f = *ib++; + ret_val++; + second = B_TRUE; + for (i = 1; i < sz; i++) { + if (second) { + if (*ib < u8_valid_min_2nd_byte[f] || + *ib > u8_valid_max_2nd_byte[f]) { + *errnum = EILSEQ; + return (-1); + } + second = B_FALSE; + } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { + *errnum = EILSEQ; + return (-1); + } + ib++; + ret_val++; + } + } + + if (check_additional) { + for (p = (uchar_t **)list, i = 0; p[i]; i++) { + s1 = ib - sz; + s2 = p[i]; + while (s1 < ib) { + if (*s1 != *s2 || *s2 == '\0') + break; + s1++; + s2++; + } + + if (s1 >= ib && *s2 == '\0') { + *errnum = EBADF; + return (-1); + } + } + } + + if (no_need_to_validate_entire) + break; + } + + return (ret_val); +} + +/* + * The do_case_conv() looks at the mapping tables and returns found + * bytes if any. If not found, the input bytes are returned. The function + * always terminate the return bytes with a null character assuming that + * there are plenty of room to do so. + * + * The case conversions are simple case conversions mapping a character to + * another character as specified in the Unicode data. The byte size of + * the mapped character could be different from that of the input character. + * + * The return value is the byte length of the returned character excluding + * the terminating null byte. + */ +static size_t +do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) +{ + size_t i; + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + /* + * At this point, the only possible values for sz are 2, 3, and 4. + * The u8s should point to a vector that is well beyond the size of + * 5 bytes. + */ + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + } else if (sz == 3) { + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + } else { + /* This is not possible but just in case as a fallback. */ + if (is_it_toupper) + *u8s = U8_ASCII_TOUPPER(*s); + else + *u8s = U8_ASCII_TOLOWER(*s); + u8s[1] = '\0'; + + return (1); + } + u8s[sz] = '\0'; + + /* + * Let's find out if we have a corresponding character. + */ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_case_common_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + if (is_it_toupper) { + b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; + + /* Either there is no match or an error at the table. */ + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; + } else { + b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; + + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; + } + + /* + * If i is still zero, that means there is no corresponding character. + */ + if (i == 0) + return ((size_t)sz); + + u8s[i] = '\0'; + + return (i); +} + +/* + * The do_case_compare() function compares the two input strings, s1 and s2, + * one character at a time doing case conversions if applicable and return + * the comparison result as like strcmp(). + * + * Since, in empirical sense, most of text data are 7-bit ASCII characters, + * we treat the 7-bit ASCII characters as a special case trying to yield + * faster processing time. + */ +static int +do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, + size_t n2, boolean_t is_it_toupper, int *errnum) +{ + int f; + int sz1; + int sz2; + size_t j; + size_t i1; + size_t i2; + uchar_t u8s1[U8_MB_CUR_MAX + 1]; + uchar_t u8s2[U8_MB_CUR_MAX + 1]; + + i1 = i2 = 0; + while (i1 < n1 && i2 < n2) { + /* + * Find out what would be the byte length for this UTF-8 + * character at string s1 and also find out if this is + * an illegal start byte or not and if so, issue a proper + * error number and yet treat this byte as a character. + */ + sz1 = u8_number_of_bytes[*s1]; + if (sz1 < 0) { + *errnum = EILSEQ; + sz1 = 1; + } + + /* + * For 7-bit ASCII characters mainly, we do a quick case + * conversion right at here. + * + * If we don't have enough bytes for this character, issue + * an EINVAL error and use what are available. + * + * If we have enough bytes, find out if there is + * a corresponding uppercase character and if so, copy over + * the bytes for a comparison later. If there is no + * corresponding uppercase character, then, use what we have + * for the comparison. + */ + if (sz1 == 1) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else + u8s1[0] = U8_ASCII_TOLOWER(*s1); + s1++; + u8s1[1] = '\0'; + } else if ((i1 + sz1) > n1) { + *errnum = EINVAL; + for (j = 0; (i1 + j) < n1; ) + u8s1[j++] = *s1++; + u8s1[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); + s1 += sz1; + } + + /* Do the same for the string s2. */ + sz2 = u8_number_of_bytes[*s2]; + if (sz2 < 0) { + *errnum = EILSEQ; + sz2 = 1; + } + + if (sz2 == 1) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else + u8s2[0] = U8_ASCII_TOLOWER(*s2); + s2++; + u8s2[1] = '\0'; + } else if ((i2 + sz2) > n2) { + *errnum = EINVAL; + for (j = 0; (i2 + j) < n2; ) + u8s2[j++] = *s2++; + u8s2[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); + s2 += sz2; + } + + /* Now compare the two characters. */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + f = strcmp((const char *)u8s1, (const char *)u8s2); + if (f != 0) + return (f); + } + + /* + * They were the same. Let's move on to the next + * characters then. + */ + i1 += sz1; + i2 += sz2; + } + + /* + * We compared until the end of either or both strings. + * + * If we reached to or went over the ends for the both, that means + * they are the same. + * + * If we reached only one of the two ends, that means the other string + * has something which then the fact can be used to determine + * the return value. + */ + if (i1 >= n1) { + if (i2 >= n2) + return (0); + return (-1); + } + return (1); +} + +/* + * The combining_class() function checks on the given bytes and find out + * the corresponding Unicode combining class value. The return value 0 means + * it is a Starter. Any illegal UTF-8 character will also be treated as + * a Starter. + */ +static uchar_t +combining_class(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b4 = 0; + + if (sz == 1 || sz > 4) + return (0); + + if (sz == 2) { + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } + + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + b2 = u8_combining_class_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + b3 = u8_combining_class_b3_tbl[uv][b2][b3]; + if (b3 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + return (u8_combining_class_b4_tbl[uv][b3][b4]); +} + +/* + * The do_decomp() function finds out a matching decomposition if any + * and return. If there is no match, the input bytes are copied and returned. + * The function also checks if there is a Hangul, decomposes it if necessary + * and returns. + * + * To save time, a single byte 7-bit ASCII character should be handled by + * the caller. + * + * The function returns the number of bytes returned sans always terminating + * the null byte. It will also return a state that will tell if there was + * a Hangul character decomposed which then will be used by the caller. + */ +static size_t +do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, + boolean_t canonical_decomposition, u8_normalization_states_t *state) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + size_t i; + uint32_t u1; + + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + u8s[2] = '\0'; + } else if (sz == 3) { + /* Convert it to a Unicode scalar value. */ + U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); + + /* + * If this is a Hangul syllable, we decompose it into + * a leading consonant, a vowel, and an optional trailing + * consonant and then return. + */ + if (U8_HANGUL_SYLLABLE(u1)) { + u1 -= U8_HANGUL_SYL_FIRST; + + b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; + b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) + / U8_HANGUL_T_COUNT; + b3 = u1 % U8_HANGUL_T_COUNT; + + U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); + U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); + if (b3) { + b3 += U8_HANGUL_JAMO_T_FIRST; + U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); + + u8s[9] = '\0'; + *state = U8_STATE_HANGUL_LVT; + return (9); + } + + u8s[6] = '\0'; + *state = U8_STATE_HANGUL_LV; + return (6); + } + + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + u8s[3] = '\0'; + + /* + * If this is a Hangul Jamo, we know there is nothing + * further that we can decompose. + */ + if (U8_HANGUL_JAMO_L(u1)) { + *state = U8_STATE_HANGUL_L; + return (3); + } + + if (U8_HANGUL_JAMO_V(u1)) { + if (*state == U8_STATE_HANGUL_L) + *state = U8_STATE_HANGUL_LV; + else + *state = U8_STATE_HANGUL_V; + return (3); + } + + if (U8_HANGUL_JAMO_T(u1)) { + if (*state == U8_STATE_HANGUL_LV) + *state = U8_STATE_HANGUL_LVT; + else + *state = U8_STATE_HANGUL_T; + return (3); + } + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + u8s[4] = '\0'; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. + */ + u8s[0] = s[0]; + u8s[1] = '\0'; + *state = U8_STATE_START; + return (1); + } + + /* + * At this point, this rountine does not know what it would get. + * The caller should sort it out if the state isn't a Hangul one. + */ + *state = U8_STATE_START; + + /* Try to find matching decomposition mapping byte sequence. */ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_decomp_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + /* + * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR + * which is 0x8000, this means we couldn't fit the mappings into + * the cardinality of a unsigned byte. + */ + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + /* This also means there wasn't any matching decomposition. */ + if (start_id >= end_id) + return ((size_t)sz); + + /* + * The final table for decomposition mappings has three types of + * byte sequences depending on whether a mapping is for compatibility + * decomposition, canonical decomposition, or both like the following: + * + * (1) Compatibility decomposition mappings: + * + * +---+---+-...-+---+ + * | B0| B1| ... | Bm| + * +---+---+-...-+---+ + * + * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). + * + * (2) Canonical decomposition mappings: + * + * +---+---+---+-...-+---+ + * | T | b0| b1| ... | bn| + * +---+---+---+-...-+---+ + * + * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). + * + * (3) Both mappings: + * + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * + * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement + * byte, b0 to bn are canonical mapping bytes and B0 to Bm are + * compatibility mapping bytes. + * + * Note that compatibility decomposition means doing recursive + * decompositions using both compatibility decomposition mappings and + * canonical decomposition mappings. On the other hand, canonical + * decomposition means doing recursive decompositions using only + * canonical decomposition mappings. Since the table we have has gone + * through the recursions already, we do not need to do so during + * runtime, i.e., the table has been completely flattened out + * already. + */ + + b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; + + /* Get the type, T, of the byte sequence. */ + b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; + + /* + * If necessary, adjust start_id, end_id, or both. Note that if + * this is compatibility decomposition mapping, there is no + * adjustment. + */ + if (canonical_decomposition) { + /* Is the mapping only for compatibility decomposition? */ + if (b1 < U8_DECOMP_BOTH) + return ((size_t)sz); + + start_id++; + + if (b1 == U8_DECOMP_BOTH) { + end_id = start_id + + u8_decomp_final_tbl[uv][b3_base + start_id]; + start_id++; + } + } else { + /* + * Unless this is a compatibility decomposition mapping, + * we adjust the start_id. + */ + if (b1 == U8_DECOMP_BOTH) { + start_id++; + start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; + } else if (b1 == U8_DECOMP_CANONICAL) { + start_id++; + } + } + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; + u8s[i] = '\0'; + + return (i); +} + +/* + * The find_composition_start() function uses the character bytes given and + * find out the matching composition mappings if any and return the address + * to the composition mappings as explained in the do_composition(). + */ +static uchar_t * +find_composition_start(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + if (sz == 1) { + b4 = s[0]; + } else if (sz == 2) { + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. + */ + return (NULL); + } + + b1 = u8_composition_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b2 = u8_composition_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + if (start_id >= end_id) + return (NULL); + + b3_base = u8_composition_b3_tbl[uv][b2][b3].base; + + return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); +} + +/* + * The blocked() function checks on the combining class values of previous + * characters in this sequence and return whether it is blocked or not. + */ +static boolean_t +blocked(uchar_t *comb_class, size_t last) +{ + uchar_t my_comb_class; + size_t i; + + my_comb_class = comb_class[last]; + for (i = 1; i < last; i++) + if (comb_class[i] >= my_comb_class || + comb_class[i] == U8_COMBINING_CLASS_STARTER) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The do_composition() reads the character string pointed by 's' and + * do necessary canonical composition and then copy over the result back to + * the 's'. + * + * The input argument 's' cannot contain more than 32 characters. + */ +static size_t +do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, + uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) +{ + uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc[U8_MB_CUR_MAX]; + uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; + size_t saved_marks_count; + uchar_t *p; + uchar_t *saved_p; + uchar_t *q; + size_t i; + size_t saved_i; + size_t j; + size_t k; + size_t l; + size_t C; + size_t saved_l; + size_t size; + uint32_t u1; + uint32_t u2; + boolean_t match_not_found = B_TRUE; + + /* + * This should never happen unless the callers are doing some strange + * and unexpected things. + * + * The "last" is the index pointing to the last character not last + 1. + */ + if (last >= U8_MAX_CHARS_A_SEQ) + last = U8_UPPER_LIMIT_IN_A_SEQ; + + for (i = l = 0; i <= last; i++) { + /* + * The last or any non-Starters at the beginning, we don't + * have any chance to do composition and so we just copy them + * to the temporary buffer. + */ + if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { +SAVE_THE_CHAR: + p = s + start[i]; + size = disp[i]; + for (k = 0; k < size; k++) + t[l++] = *p++; + continue; + } + + /* + * If this could be a start of Hangul Jamos, then, we try to + * conjoin them. + */ + if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], + s[start[i] + 1], s[start[i] + 2]); + U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], + s[start[i] + 4], s[start[i] + 5]); + + if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { + u1 -= U8_HANGUL_JAMO_L_FIRST; + u2 -= U8_HANGUL_JAMO_V_FIRST; + u1 = U8_HANGUL_SYL_FIRST + + (u1 * U8_HANGUL_V_COUNT + u2) * + U8_HANGUL_T_COUNT; + + i += 2; + if (i <= last) { + U8_PUT_3BYTES_INTO_UTF32(u2, + s[start[i]], s[start[i] + 1], + s[start[i] + 2]); + + if (U8_HANGUL_JAMO_T(u2)) { + u1 += u2 - + U8_HANGUL_JAMO_T_FIRST; + i++; + } + } + + U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); + i--; + l += 3; + continue; + } + } + + /* + * Let's then find out if this Starter has composition + * mapping. + */ + p = find_composition_start(uv, s + start[i], disp[i]); + if (p == NULL) + goto SAVE_THE_CHAR; + + /* + * We have a Starter with composition mapping and the next + * character is a non-Starter. Let's try to find out if + * we can do composition. + */ + + saved_p = p; + saved_i = i; + saved_l = l; + saved_marks_count = 0; + +TRY_THE_NEXT_MARK: + q = s + start[++i]; + size = disp[i]; + + /* + * The next for() loop compares the non-Starter pointed by + * 'q' with the possible (joinable) characters pointed by 'p'. + * + * The composition final table entry pointed by the 'p' + * looks like the following: + * + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * + * where C is the count byte indicating the number of + * mapping pairs where each pair would be look like + * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second + * character of a canonical decomposition and the B0-Bm are + * the bytes of a matching composite character. The F is + * a filler byte after each character as the separator. + */ + + match_not_found = B_TRUE; + + for (C = *p++; C > 0; C--) { + for (k = 0; k < size; p++, k++) + if (*p != q[k]) + break; + + /* Have we found it? */ + if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++p != U8_TBL_ELEMENT_FILLER) + t[l++] = *p; + + break; + } + + /* We didn't find; skip to the next pair. */ + if (*p != U8_TBL_ELEMENT_FILLER) + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + p++; + } + + /* + * If there was no match, we will need to save the combining + * mark for later appending. After that, if the next one + * is a non-Starter and not blocked, then, we try once + * again to do composition with the next non-Starter. + * + * If there was no match and this was a Starter, then, + * this is a new start. + * + * If there was a match and a composition done and we have + * more to check on, then, we retrieve a new composition final + * table entry for the composite and then try to do the + * composition again. + */ + + if (match_not_found) { + if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { + i--; + goto SAVE_THE_CHAR; + } + + saved_marks[saved_marks_count++] = i; + } + + if (saved_l == l) { + while (i < last) { + if (blocked(comb_class, i + 1)) + saved_marks[saved_marks_count++] = ++i; + else + break; + } + if (i < last) { + p = saved_p; + goto TRY_THE_NEXT_MARK; + } + } else if (i < last) { + p = find_composition_start(uv, t + saved_l, + l - saved_l); + if (p != NULL) { + saved_p = p; + goto TRY_THE_NEXT_MARK; + } + } + + /* + * There is no more composition possible. + * + * If there was no composition what so ever then we copy + * over the original Starter and then append any non-Starters + * remaining at the target string sequentially after that. + */ + + if (saved_l == l) { + p = s + start[saved_i]; + size = disp[saved_i]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + + for (k = 0; k < saved_marks_count; k++) { + p = s + start[saved_marks[k]]; + size = disp[saved_marks[k]]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + } + + /* + * If the last character is a Starter and if we have a character + * (possibly another Starter) that can be turned into a composite, + * we do so and we do so until there is no more of composition + * possible. + */ + if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { + p = *os; + saved_l = l - disp[last]; + + while (p < oslast) { + size = u8_number_of_bytes[*p]; + if (size <= 1 || (p + size) > oslast) + break; + + saved_p = p; + + for (i = 0; i < size; i++) + tc[i] = *p++; + + q = find_composition_start(uv, t + saved_l, + l - saved_l); + if (q == NULL) { + p = saved_p; + break; + } + + match_not_found = B_TRUE; + + for (C = *q++; C > 0; C--) { + for (k = 0; k < size; q++, k++) + if (*q != tc[k]) + break; + + if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++q != U8_TBL_ELEMENT_FILLER) { + /* + * This is practically + * impossible but we don't + * want to take any chances. + */ + if (l >= + U8_STREAM_SAFE_TEXT_MAX) { + p = saved_p; + goto SAFE_RETURN; + } + t[l++] = *q; + } + + break; + } + + if (*q != U8_TBL_ELEMENT_FILLER) + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + q++; + } + + if (match_not_found) { + p = saved_p; + break; + } + } +SAFE_RETURN: + *os = p; + } + + /* + * Now we copy over the temporary string to the target string. + * Since composition always reduces the number of characters or + * the number of characters stay, we don't need to worry about + * the buffer overflow here. + */ + for (i = 0; i < l; i++) + s[i] = t[i]; + s[l] = '\0'; + + return (l); +} + +/* + * The collect_a_seq() function checks on the given string s, collect + * a sequence of characters at u8s, and return the sequence. While it collects + * a sequence, it also applies case conversion, canonical or compatibility + * decomposition, canonical decomposition, or some or all of them and + * in that order. + * + * The collected sequence cannot be bigger than 32 characters since if + * it is having more than 31 characters, the sequence will be terminated + * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into + * a Stream-Safe Text. The collected sequence is always terminated with + * a null byte and the return value is the byte length of the sequence + * including 0. The return value does not include the terminating + * null byte. + */ +static size_t +collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, + boolean_t is_it_toupper, + boolean_t is_it_tolower, + boolean_t canonical_decomposition, + boolean_t compatibility_decomposition, + boolean_t canonical_composition, + int *errnum, u8_normalization_states_t *state) +{ + uchar_t *s; + int sz; + int saved_sz; + size_t i; + size_t j; + size_t k; + size_t l; + uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; + uchar_t disp[U8_MAX_CHARS_A_SEQ]; + uchar_t start[U8_MAX_CHARS_A_SEQ]; + uchar_t u8t[U8_MB_CUR_MAX]; + uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc; + size_t last; + size_t saved_last; + uint32_t u1; + + /* + * Save the source string pointer which we will return a changed + * pointer if we do processing. + */ + s = *source; + + /* + * The following is a fallback for just in case callers are not + * checking the string boundaries before the calling. + */ + if (s >= slast) { + u8s[0] = '\0'; + + return (0); + } + + /* + * As the first thing, let's collect a character and do case + * conversion if necessary. + */ + + sz = u8_number_of_bytes[*s]; + + if (sz < 0) { + *errnum = EILSEQ; + + u8s[0] = *s++; + u8s[1] = '\0'; + + *source = s; + + return (1); + } + + if (sz == 1) { + if (is_it_toupper) + u8s[0] = U8_ASCII_TOUPPER(*s); + else if (is_it_tolower) + u8s[0] = U8_ASCII_TOLOWER(*s); + else + u8s[0] = *s; + s++; + u8s[1] = '\0'; + } else if ((s + sz) > slast) { + *errnum = EINVAL; + + for (i = 0; s < slast; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + + *source = s; + + return (i); + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(uv, u8s, s, sz, is_it_toupper); + s += sz; + sz = i; + } else { + for (i = 0; i < sz; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + } + } + + /* + * And then canonical/compatibility decomposition followed by + * an optional canonical composition. Please be noted that + * canonical composition is done only when a decomposition is + * done. + */ + if (canonical_decomposition || compatibility_decomposition) { + if (sz == 1) { + *state = U8_STATE_START; + + saved_sz = 1; + + comb_class[0] = 0; + start[0] = 0; + disp[0] = 1; + + last = 1; + } else { + saved_sz = do_decomp(uv, u8s, u8s, sz, + canonical_decomposition, state); + + last = 0; + + for (i = 0; i < saved_sz; ) { + sz = u8_number_of_bytes[u8s[i]]; + + comb_class[last] = combining_class(uv, + u8s + i, sz); + start[last] = i; + disp[last] = sz; + + last++; + i += sz; + } + + /* + * Decomposition yields various Hangul related + * states but not on combining marks. We need to + * find out at here by checking on the last + * character. + */ + if (*state == U8_STATE_START) { + if (comb_class[last - 1]) + *state = U8_STATE_COMBINING_MARK; + } + } + + saved_last = last; + + while (s < slast) { + sz = u8_number_of_bytes[*s]; + + /* + * If this is an illegal character, an incomplete + * character, or an 7-bit ASCII Starter character, + * then we have collected a sequence; break and let + * the next call deal with the two cases. + * + * Note that this is okay only if you are using this + * function with a fixed length string, not on + * a buffer with multiple calls of one chunk at a time. + */ + if (sz <= 1) { + break; + } else if ((s + sz) > slast) { + break; + } else { + /* + * If the previous character was a Hangul Jamo + * and this character is a Hangul Jamo that + * can be conjoined, we collect the Jamo. + */ + if (*s == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, + *s, *(s + 1), *(s + 2)); + + if (U8_HANGUL_COMPOSABLE_L_V(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LV; + goto COLLECT_A_HANGUL; + } + + if (U8_HANGUL_COMPOSABLE_LV_T(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LVT; + goto COLLECT_A_HANGUL; + } + } + + /* + * Regardless of whatever it was, if this is + * a Starter, we don't collect the character + * since that's a new start and we will deal + * with it at the next time. + */ + i = combining_class(uv, s, sz); + if (i == U8_COMBINING_CLASS_STARTER) + break; + + /* + * We know the current character is a combining + * mark. If the previous character wasn't + * a Starter (not Hangul) or a combining mark, + * then, we don't collect this combining mark. + */ + if (*state != U8_STATE_START && + *state != U8_STATE_COMBINING_MARK) + break; + + *state = U8_STATE_COMBINING_MARK; +COLLECT_A_HANGUL: + /* + * If we collected a Starter and combining + * marks up to 30, i.e., total 31 characters, + * then, we terminate this degenerately long + * combining sequence with a U+034F COMBINING + * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in + * UTF-8 and turn this into a Stream-Safe + * Text. This will be extremely rare but + * possible. + * + * The following will also guarantee that + * we are not writing more than 32 characters + * plus a NULL at u8s[]. + */ + if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { +TURN_STREAM_SAFE: + *state = U8_STATE_START; + comb_class[last] = 0; + start[last] = saved_sz; + disp[last] = 2; + last++; + + u8s[saved_sz++] = 0xCD; + u8s[saved_sz++] = 0x8F; + + break; + } + + /* + * Some combining marks also do decompose into + * another combining mark or marks. + */ + if (*state == U8_STATE_COMBINING_MARK) { + k = last; + l = sz; + i = do_decomp(uv, uts, s, sz, + canonical_decomposition, state); + for (j = 0; j < i; ) { + sz = u8_number_of_bytes[uts[j]]; + + comb_class[last] = + combining_class(uv, + uts + j, sz); + start[last] = saved_sz + j; + disp[last] = sz; + + last++; + if (last >= + U8_UPPER_LIMIT_IN_A_SEQ) { + last = k; + goto TURN_STREAM_SAFE; + } + j += sz; + } + + *state = U8_STATE_COMBINING_MARK; + sz = i; + s += l; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = uts[i]; + } else { + comb_class[last] = i; + start[last] = saved_sz; + disp[last] = sz; + last++; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = *s++; + } + + /* + * If this is U+0345 COMBINING GREEK + * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., + * iota subscript, and need to be converted to + * uppercase letter, convert it to U+0399 GREEK + * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), + * i.e., convert to capital adscript form as + * specified in the Unicode standard. + * + * This is the only special case of (ambiguous) + * case conversion at combining marks and + * probably the standard will never have + * anything similar like this in future. + */ + if (is_it_toupper && sz >= 2 && + u8s[saved_sz - 2] == 0xCD && + u8s[saved_sz - 1] == 0x85) { + u8s[saved_sz - 2] = 0xCE; + u8s[saved_sz - 1] = 0x99; + } + } + } + + /* + * Let's try to ensure a canonical ordering for the collected + * combining marks. We do this only if we have collected + * at least one more non-Starter. (The decomposition mapping + * data tables have fully (and recursively) expanded and + * canonically ordered decompositions.) + * + * The U8_SWAP_COMB_MARKS() convenience macro has some + * assumptions and we are meeting the assumptions. + */ + last--; + if (last >= saved_last) { + for (i = 0; i < last; i++) + for (j = last; j > i; j--) + if (comb_class[j] && + comb_class[j - 1] > comb_class[j]) { + U8_SWAP_COMB_MARKS(j - 1, j); + } + } + + *source = s; + + if (! canonical_composition) { + u8s[saved_sz] = '\0'; + return (saved_sz); + } + + /* + * Now do the canonical composition. Note that we do this + * only after a canonical or compatibility decomposition to + * finish up NFC or NFKC. + */ + sz = do_composition(uv, u8s, comb_class, start, disp, last, + &s, slast); + } + + *source = s; + + return ((size_t)sz); +} + +/* + * The do_norm_compare() function does string comparion based on Unicode + * simple case mappings and Unicode Normalization definitions. + * + * It does so by collecting a sequence of character at a time and comparing + * the collected sequences from the strings. + * + * The meanings on the return values are the same as the usual strcmp(). + */ +static int +do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, + int flag, int *errnum) +{ + int result; + size_t sz1; + size_t sz2; + uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t *s1last; + uchar_t *s2last; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + u8_normalization_states_t state; + + s1last = s1 + n1; + s2last = s2 + n2; + + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (s1 < s1last && s2 < s2last) { + /* + * If the current character is a 7-bit ASCII and the last + * character, or, if the current character and the next + * character are both some 7-bit ASCII characters then + * we treat the current character as a sequence. + * + * In any other cases, we need to call collect_a_seq(). + */ + + if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || + ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else if (is_it_tolower) + u8s1[0] = U8_ASCII_TOLOWER(*s1); + else + u8s1[0] = *s1; + u8s1[1] = '\0'; + sz1 = 1; + s1++; + } else { + state = U8_STATE_START; + sz1 = collect_a_seq(uv, u8s1, &s1, s1last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errnum, &state); + } + + if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || + ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else if (is_it_tolower) + u8s2[0] = U8_ASCII_TOLOWER(*s2); + else + u8s2[0] = *s2; + u8s2[1] = '\0'; + sz2 = 1; + s2++; + } else { + state = U8_STATE_START; + sz2 = collect_a_seq(uv, u8s2, &s2, s2last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errnum, &state); + } + + /* + * Now compare the two characters. If they are the same, + * we move on to the next character sequences. + */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + result = strcmp((const char *)u8s1, (const char *)u8s2); + if (result != 0) + return (result); + } + } + + /* + * We compared until the end of either or both strings. + * + * If we reached to or went over the ends for the both, that means + * they are the same. + * + * If we reached only one end, that means the other string has + * something which then can be used to determine the return value. + */ + if (s1 >= s1last) { + if (s2 >= s2last) + return (0); + return (-1); + } + return (1); +} + +/* + * The u8_strcmp() function compares two UTF-8 strings quite similar to + * the strcmp(). For the comparison, however, Unicode Normalization specific + * equivalency and Unicode simple case conversion mappings based equivalency + * can be requested and checked against. + */ +int +u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, + int *errnum) +{ + int f; + size_t n1; + size_t n2; + + *errnum = 0; + + /* + * Check on the requested Unicode version, case conversion, and + * normalization flag values. + */ + + if (uv > U8_UNICODE_LATEST) { + *errnum = ERANGE; + uv = U8_UNICODE_LATEST; + } + + if (flag == 0) { + flag = U8_STRCMP_CS; + } else { + f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | + U8_STRCMP_CI_LOWER); + if (f == 0) { + flag |= U8_STRCMP_CS; + } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && + f != U8_STRCMP_CI_LOWER) { + *errnum = EBADF; + flag = U8_STRCMP_CS; + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && + f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { + *errnum = EBADF; + flag = U8_STRCMP_CS; + } + } + + if (flag == U8_STRCMP_CS) { + return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); + } + + n1 = strlen(s1); + n2 = strlen(s2); + if (n != 0) { + if (n < n1) + n1 = n; + if (n < n2) + n2 = n; + } + + /* + * Simple case conversion can be done much faster and so we do + * them separately here. + */ + if (flag == U8_STRCMP_CI_UPPER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_TRUE, errnum)); + } else if (flag == U8_STRCMP_CI_LOWER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_FALSE, errnum)); + } + + return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, + flag, errnum)); +} + +size_t +u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, + int flag, size_t unicode_version, int *errnum) +{ + int f; + int sz; + uchar_t *ib; + uchar_t *ibtail; + uchar_t *ob; + uchar_t *obtail; + boolean_t do_not_ignore_null; + boolean_t do_not_ignore_invalid; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + size_t ret_val; + size_t i; + size_t j; + uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; + u8_normalization_states_t state; + + if (unicode_version > U8_UNICODE_LATEST) { + *errnum = ERANGE; + return ((size_t)-1); + } + + f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); + if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { + *errnum = EBADF; + return ((size_t)-1); + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && + f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { + *errnum = EBADF; + return ((size_t)-1); + } + + if (inarray == NULL || *inlen == 0) + return (0); + + if (outarray == NULL) { + *errnum = E2BIG; + return ((size_t)-1); + } + + ib = (uchar_t *)inarray; + ob = (uchar_t *)outarray; + ibtail = ib + *inlen; + obtail = ob + *outlen; + + do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); + do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + + ret_val = 0; + + /* + * If we don't have a normalization flag set, we do the simple case + * conversion based text preparation separately below. Text + * preparation involving Normalization will be done in the false task + * block, again, separately since it will take much more time and + * resource than doing simple case conversions. + */ + if (f == 0) { + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + sz = u8_number_of_bytes[*ib]; + + if (sz < 0) { + if (do_not_ignore_invalid) { + *errnum = EILSEQ; + ret_val = (size_t)-1; + break; + } + + sz = 1; + ret_val++; + } + + if (sz == 1) { + if (ob >= obtail) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else if ((ib + sz) > ibtail) { + if (do_not_ignore_invalid) { + *errnum = EINVAL; + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < (ibtail - ib)) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + /* + * We treat the remaining incomplete character + * bytes as a character. + */ + ret_val++; + + while (ib < ibtail) + *ob++ = *ib++; + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(unicode_version, u8s, + ib, sz, is_it_toupper); + + if ((obtail - ob) < i) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + ib += sz; + + for (sz = 0; sz < i; sz++) + *ob++ = u8s[sz]; + } else { + if ((obtail - ob) < sz) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < sz; i++) + *ob++ = *ib++; + } + } + } + } else { + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + /* + * If the current character is a 7-bit ASCII + * character and it is the last character, or, + * if the current character is a 7-bit ASCII + * character and the next character is also a 7-bit + * ASCII character, then, we copy over this + * character without going through collect_a_seq(). + * + * In any other cases, we need to look further with + * the collect_a_seq() function. + */ + if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || + ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { + if (ob >= obtail) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else { + *errnum = 0; + state = U8_STATE_START; + + j = collect_a_seq(unicode_version, u8s, + &ib, ibtail, + is_it_toupper, + is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, + errnum, &state); + + if (*errnum && do_not_ignore_invalid) { + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < j) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < j; i++) + *ob++ = u8s[i]; + } + } + } + + *inlen = ibtail - ib; + *outlen = obtail - ob; + + return (ret_val); +} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c new file mode 100644 index 000000000000..74517a3f6920 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file is intended for functions that ought to be common between user + * land (libzfs) and the kernel. When many common routines need to be shared + * then a separate file should to be created. + */ + +#if defined(_KERNEL) +#include <sys/systm.h> +#endif + +#include <sys/types.h> +#include <sys/fs/zfs.h> +#include <sys/nvpair.h> + +/* + * Are there allocatable vdevs? + */ +boolean_t +zfs_allocatable_devs(nvlist_t *nv) +{ + uint64_t is_log; + uint_t c; + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + return (B_FALSE); + } + for (c = 0; c < children; c++) { + is_log = 0; + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (!is_log) + return (B_TRUE); + } + return (B_FALSE); +} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h new file mode 100644 index 000000000000..f517044a80a0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_COMUTIL_H +#define _ZFS_COMUTIL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/fs/zfs.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t zfs_allocatable_devs(nvlist_t *nv); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_COMUTIL_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c new file mode 100644 index 000000000000..0fd5800a84dc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c @@ -0,0 +1,234 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if defined(_KERNEL) +#include <sys/systm.h> +#include <sys/sunddi.h> +#include <sys/ctype.h> +#else +#include <stdio.h> +#include <unistd.h> +#include <strings.h> +#include <libnvpair.h> +#include <ctype.h> +#endif +/* XXX includes zfs_context.h, so why bother with the above? */ +#include <sys/dsl_deleg.h> +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_namecheck.h" + +/* + * permission table + * + * Keep this table in sorted order + * + * This table is used for displaying all permissions for + * zfs allow + */ + +zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { + {ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW}, + {ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, + {ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, + {ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, + {ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, + {ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, + {ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, + {ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, + {ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, + {ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, + {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, + {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, + {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + {NULL, ZFS_DELEG_NOTE_NONE } +}; + +static int +zfs_valid_permission_name(const char *perm) +{ + if (zfs_deleg_canonicalize_perm(perm)) + return (0); + + return (permset_namecheck(perm, NULL, NULL)); +} + +const char * +zfs_deleg_canonicalize_perm(const char *perm) +{ + int i; + zfs_prop_t prop; + + for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { + if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0) + return (perm); + } + + prop = zfs_name_to_prop(perm); + if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop)) + return (zfs_prop_to_name(prop)); + return (NULL); + +} + +static int +zfs_validate_who(char *who) +{ + char *p; + + if (who[2] != ZFS_DELEG_FIELD_SEP_CHR) + return (-1); + + switch (who[0]) { + case ZFS_DELEG_USER: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP_SETS: + if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) + return (-1); + for (p = &who[3]; *p; p++) + if (!isdigit(*p)) + return (-1); + break; + + case ZFS_DELEG_NAMED_SET: + case ZFS_DELEG_NAMED_SET_SETS: + if (who[1] != ZFS_DELEG_NA) + return (-1); + return (permset_namecheck(&who[3], NULL, NULL)); + + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + if (who[1] != ZFS_DELEG_NA) + return (-1); + if (who[3] != '\0') + return (-1); + break; + + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) + return (-1); + if (who[3] != '\0') + return (-1); + break; + + default: + return (-1); + } + + return (0); +} + +int +zfs_deleg_verify_nvlist(nvlist_t *nvp) +{ + nvpair_t *who, *perm_name; + nvlist_t *perms; + int error; + + if (nvp == NULL) + return (-1); + + who = nvlist_next_nvpair(nvp, NULL); + if (who == NULL) + return (-1); + + do { + if (zfs_validate_who(nvpair_name(who))) + return (-1); + + error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms); + + if (error && error != ENOENT) + return (-1); + if (error == ENOENT) + continue; + + perm_name = nvlist_next_nvpair(perms, NULL); + if (perm_name == NULL) { + return (-1); + } + do { + error = zfs_valid_permission_name( + nvpair_name(perm_name)); + if (error) + return (-1); + } while (perm_name = nvlist_next_nvpair(perms, perm_name)); + } while (who = nvlist_next_nvpair(nvp, who)); + return (0); +} + +/* + * Construct the base attribute name. The base attribute names + * are the "key" to locate the jump objects which contain the actual + * permissions. The base attribute names are encoded based on + * type of entry and whether it is a local or descendent permission. + * + * Arguments: + * attr - attribute name return string, attribute is assumed to be + * ZFS_MAX_DELEG_NAME long. + * type - type of entry to construct + * inheritchr - inheritance type (local,descendent, or NA for create and + * permission set definitions + * data - is either a permission set name or a 64 bit uid/gid. + */ +void +zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, + char inheritchr, void *data) +{ + int len = ZFS_MAX_DELEG_NAME; + uint64_t *id = data; + + switch (type) { + case ZFS_DELEG_USER: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP_SETS: + (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr, + ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id); + break; + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + (void) snprintf(attr, len, "%c-%c%s", type, + ZFS_DELEG_FIELD_SEP_CHR, (char *)data); + break; + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + (void) snprintf(attr, len, "%c-%c", type, + ZFS_DELEG_FIELD_SEP_CHR); + break; + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + (void) snprintf(attr, len, "%c%c%c", type, inheritchr, + ZFS_DELEG_FIELD_SEP_CHR); + break; + default: + ASSERT(!"bad zfs_deleg_who_type_t"); + } +} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h new file mode 100644 index 000000000000..561b73e63df4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_DELEG_H +#define _ZFS_DELEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */ +#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */ + +/* + * Max name length for a delegation attribute + */ +#define ZFS_MAX_DELEG_NAME 128 + +#define ZFS_DELEG_LOCAL 'l' +#define ZFS_DELEG_DESCENDENT 'd' +#define ZFS_DELEG_NA '-' + +typedef enum { + ZFS_DELEG_NOTE_CREATE, + ZFS_DELEG_NOTE_DESTROY, + ZFS_DELEG_NOTE_SNAPSHOT, + ZFS_DELEG_NOTE_ROLLBACK, + ZFS_DELEG_NOTE_CLONE, + ZFS_DELEG_NOTE_PROMOTE, + ZFS_DELEG_NOTE_RENAME, + ZFS_DELEG_NOTE_RECEIVE, + ZFS_DELEG_NOTE_ALLOW, + ZFS_DELEG_NOTE_USERPROP, + ZFS_DELEG_NOTE_MOUNT, + ZFS_DELEG_NOTE_SHARE, + ZFS_DELEG_NOTE_NONE +} zfs_deleg_note_t; + +typedef struct zfs_deleg_perm_tab { + char *z_perm; + zfs_deleg_note_t z_note; +} zfs_deleg_perm_tab_t; + +extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[]; + +int zfs_deleg_verify_nvlist(nvlist_t *nvlist); +void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, + char checkflag, void *data); +const char *zfs_deleg_canonicalize_perm(const char *perm); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_DELEG_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c index 2004d860d329..a9d109be20ab 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,7 +44,9 @@ #endif #include <sys/param.h> +#include <sys/nvpair.h> #include "zfs_namecheck.h" +#include "zfs_deleg.h" static int valid_char(char c) @@ -52,7 +54,7 @@ valid_char(char c) return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || - c == '-' || c == '_' || c == '.' || c == ':'); + c == '-' || c == '_' || c == '.' || c == ':' || c == ' '); } /* @@ -90,6 +92,32 @@ snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) return (0); } + +/* + * Permissions set name must start with the letter '@' followed by the + * same character restrictions as snapshot names, except that the name + * cannot exceed 64 characters. + */ +int +permset_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + if (strlen(path) >= ZFS_PERMSET_MAXLEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + + if (path[0] != '@') { + if (why) { + *why = NAME_ERR_NO_AT; + *what = path[0]; + } + return (-1); + } + + return (snapshot_namecheck(&path[1], why, what)); +} + /* * Dataset names must be of the following form: * @@ -98,7 +126,10 @@ snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) * Where each component is made up of alphanumeric characters plus the following * characters: * - * [-_.:] + * [-_.:%] + * + * We allow '%' here as we use that character internally to create unique + * names for temporary clones (for online recv). */ int dataset_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -114,6 +145,7 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what) * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all * places using MAXNAMELEN. */ + if (strlen(path) >= MAXNAMELEN) { if (why) *why = NAME_ERR_TOOLONG; @@ -167,7 +199,7 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what) /* Validate the contents of this component */ while (loc != end) { - if (!valid_char(*loc)) { + if (!valid_char(*loc) && *loc != '%') { if (why) { *why = NAME_ERR_INVALCHAR; *what = *loc; @@ -211,6 +243,50 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what) } } + +/* + * mountpoint names must be of the following form: + * + * /[component][/]*[component][/] + */ +int +mountpoint_namecheck(const char *path, namecheck_err_t *why) +{ + const char *start, *end; + + /* + * Make sure none of the mountpoint component names are too long. + * If a component name is too long then the mkdir of the mountpoint + * will fail but then the mountpoint property will be set to a value + * that can never be mounted. Better to fail before setting the prop. + * Extra slashes are OK, they will be tossed by the mountpoint mkdir. + */ + + if (path == NULL || *path != '/') { + if (why) + *why = NAME_ERR_LEADING_SLASH; + return (-1); + } + + /* Skip leading slash */ + start = &path[1]; + do { + end = start; + while (*end != '/' && *end != '\0') + end++; + + if (end - start >= MAXNAMELEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + start = end + 1; + + } while (*end != '\0'); + + return (0); +} + /* * For pool names, we have the same set of valid characters as described in * dataset names, with the additional restriction that the pool name must begin diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h index 7e0cda974cc6..ec85e62f72e8 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,12 +42,17 @@ typedef enum { NAME_ERR_RESERVED, /* entire name is reserved */ NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ NAME_ERR_TOOLONG, /* name is too long */ + NAME_ERR_NO_AT, /* permission set is missing '@' */ } namecheck_err_t; +#define ZFS_PERMSET_MAXLEN 64 + int pool_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *); +int mountpoint_namecheck(const char *, namecheck_err_t *); int dataset_name_hidden(const char *); int snapshot_namecheck(const char *, namecheck_err_t *, char *); +int permset_namecheck(const char *, namecheck_err_t *, char *); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index 71256192c092..27a6f2e3dd00 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -19,40 +19,19 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Master property table. - * - * This table keeps track of all the properties supported by ZFS, and their - * various attributes. Not all of these are needed by the kernel, and several - * are only used by a single libzfs client. But having them here centralizes - * all property information in one location. - * - * name The human-readable string representing this property - * proptype Basic type (string, boolean, number) - * default Default value for the property. Sadly, C only allows - * you to initialize the first member of a union, so we - * have two default members for each property. - * attr Attributes (readonly, inheritable) for the property - * types Valid dataset types to which this applies - * values String describing acceptable values for the property - * colname The column header for 'zfs list' - * colfmt The column formatting for 'zfs list' - * - * This table must match the order of property types in libzfs.h. - */ - #include <sys/zio.h> #include <sys/spa.h> +#include <sys/u8_textprep.h> #include <sys/zfs_acl.h> #include <sys/zfs_ioctl.h> +#include <sys/zfs_znode.h> #include "zfs_prop.h" +#include "zfs_deleg.h" #if defined(_KERNEL) #include <sys/systm.h> @@ -62,244 +41,283 @@ #include <ctype.h> #endif -typedef enum { - prop_default, - prop_readonly, - prop_inherit -} prop_attr_t; - -typedef struct { - const char *pd_name; - zfs_proptype_t pd_proptype; - uint64_t pd_numdefault; - const char *pd_strdefault; - prop_attr_t pd_attr; - int pd_types; - const char *pd_values; - const char *pd_colname; - boolean_t pd_rightalign; - boolean_t pd_visible; -} prop_desc_t; - -static prop_desc_t zfs_prop_table[] = { - { "type", prop_type_string, 0, NULL, prop_readonly, - ZFS_TYPE_ANY, "filesystem | volume | snapshot", "TYPE", B_TRUE, - B_TRUE }, - { "creation", prop_type_number, 0, NULL, prop_readonly, - ZFS_TYPE_ANY, "<date>", "CREATION", B_FALSE, B_TRUE }, - { "used", prop_type_number, 0, NULL, prop_readonly, - ZFS_TYPE_ANY, "<size>", "USED", B_TRUE, B_TRUE }, - { "available", prop_type_number, 0, NULL, prop_readonly, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL", B_TRUE, - B_TRUE }, - { "referenced", prop_type_number, 0, NULL, prop_readonly, - ZFS_TYPE_ANY, - "<size>", "REFER", B_TRUE, B_TRUE }, - { "compressratio", prop_type_number, 0, NULL, prop_readonly, - ZFS_TYPE_ANY, "<1.00x or higher if compressed>", "RATIO", B_TRUE, - B_TRUE }, - { "mounted", prop_type_boolean, 0, NULL, prop_readonly, - ZFS_TYPE_FILESYSTEM, "yes | no | -", "MOUNTED", B_TRUE, B_TRUE }, - { "origin", prop_type_string, 0, NULL, prop_readonly, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN", - B_FALSE, B_TRUE }, - { "quota", prop_type_number, 0, NULL, prop_default, - ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA", B_TRUE, B_TRUE }, - { "reservation", prop_type_number, 0, NULL, prop_default, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "<size> | none", "RESERV", B_TRUE, B_TRUE }, - { "volsize", prop_type_number, 0, NULL, prop_default, - ZFS_TYPE_VOLUME, "<size>", "VOLSIZE", B_TRUE, B_TRUE }, - { "volblocksize", prop_type_number, 8192, NULL, prop_readonly, - ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK", B_TRUE, - B_TRUE }, - { "recordsize", prop_type_number, SPA_MAXBLOCKSIZE, NULL, - prop_inherit, - ZFS_TYPE_FILESYSTEM, - "512 to 128k, power of 2", "RECSIZE", B_TRUE, B_TRUE }, - { "mountpoint", prop_type_string, 0, "/", prop_inherit, - ZFS_TYPE_FILESYSTEM, - "<path> | legacy | none", "MOUNTPOINT", B_FALSE, B_TRUE }, - { "sharenfs", prop_type_string, 0, "off", prop_inherit, - ZFS_TYPE_FILESYSTEM, - "on | off | exports(5) options", "SHARENFS", B_FALSE, B_TRUE }, - { "checksum", prop_type_index, ZIO_CHECKSUM_DEFAULT, "on", - prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", B_TRUE, - B_TRUE }, - { "compression", prop_type_index, ZIO_COMPRESS_DEFAULT, "off", - prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", B_TRUE, B_TRUE }, - { "atime", prop_type_boolean, 1, NULL, prop_inherit, - ZFS_TYPE_FILESYSTEM, - "on | off", "ATIME", B_TRUE, B_TRUE }, - { "devices", prop_type_boolean, 1, NULL, prop_inherit, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "on | off", "DEVICES", B_TRUE, B_TRUE }, - { "exec", prop_type_boolean, 1, NULL, prop_inherit, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "on | off", "EXEC", B_TRUE, B_TRUE }, - { "setuid", prop_type_boolean, 1, NULL, prop_inherit, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", - B_TRUE, B_TRUE }, - { "readonly", prop_type_boolean, 0, NULL, prop_inherit, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off", "RDONLY", B_TRUE, B_TRUE }, - { "jailed", prop_type_boolean, 0, NULL, prop_inherit, - ZFS_TYPE_FILESYSTEM, - "on | off", "JAILED", B_TRUE, B_TRUE }, - { "snapdir", prop_type_index, ZFS_SNAPDIR_HIDDEN, "hidden", - prop_inherit, - ZFS_TYPE_FILESYSTEM, - "hidden | visible", "SNAPDIR", B_TRUE, B_TRUE }, - { "aclmode", prop_type_index, ZFS_ACL_GROUPMASK, "groupmask", - prop_inherit, ZFS_TYPE_FILESYSTEM, - "discard | groupmask | passthrough", "ACLMODE", B_TRUE, B_TRUE }, - { "aclinherit", prop_type_index, ZFS_ACL_SECURE, "secure", - prop_inherit, ZFS_TYPE_FILESYSTEM, - "discard | noallow | secure | passthrough", "ACLINHERIT", B_TRUE, - B_TRUE }, - { "createtxg", prop_type_number, 0, NULL, prop_readonly, - ZFS_TYPE_ANY, NULL, NULL, B_FALSE, B_FALSE }, - { "name", prop_type_string, 0, NULL, prop_readonly, - ZFS_TYPE_ANY, NULL, "NAME", B_FALSE, B_FALSE }, - { "canmount", prop_type_boolean, 1, NULL, prop_default, - ZFS_TYPE_FILESYSTEM, - "on | off", "CANMOUNT", B_TRUE, B_TRUE }, - { "shareiscsi", prop_type_string, 0, "off", prop_inherit, - ZFS_TYPE_ANY, - "on | off | type=<type>", "SHAREISCSI", B_FALSE, B_TRUE }, - { "iscsioptions", prop_type_string, 0, NULL, prop_inherit, - ZFS_TYPE_VOLUME, NULL, "ISCSIOPTIONS", B_FALSE, B_FALSE }, - { "xattr", prop_type_boolean, 1, NULL, prop_inherit, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "on | off", "XATTR", B_TRUE, B_TRUE }, - { "numclones", prop_type_number, 0, NULL, prop_readonly, - ZFS_TYPE_SNAPSHOT, NULL, NULL, B_FALSE, B_FALSE }, - { "copies", prop_type_index, 1, "1", prop_inherit, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "1 | 2 | 3", "COPIES", B_TRUE, B_TRUE }, - { "bootfs", prop_type_string, 0, NULL, prop_default, - ZFS_TYPE_POOL, "<filesystem>", "BOOTFS", B_FALSE, B_TRUE }, -}; - -#define ZFS_PROP_COUNT ((sizeof (zfs_prop_table))/(sizeof (prop_desc_t))) - -/* - * Returns TRUE if the property applies to the given dataset types. - */ -int -zfs_prop_valid_for_type(zfs_prop_t prop, int types) -{ - return ((zfs_prop_table[prop].pd_types & types) != 0); -} - -/* - * Determine if the specified property is visible or not. - */ -boolean_t -zfs_prop_is_visible(zfs_prop_t prop) -{ - if (prop < 0) - return (B_FALSE); - - return (zfs_prop_table[prop].pd_visible); -} - -/* - * Iterate over all properties, calling back into the specified function - * for each property. We will continue to iterate until we either - * reach the end or the callback function something other than - * ZFS_PROP_CONT. - */ -zfs_prop_t -zfs_prop_iter_common(zfs_prop_f func, void *cb, zfs_type_t type, - boolean_t show_all) -{ - int i; - - for (i = 0; i < ZFS_PROP_COUNT; i++) { - if (zfs_prop_valid_for_type(i, type) && - (zfs_prop_is_visible(i) || show_all)) { - if (func(i, cb) != ZFS_PROP_CONT) - return (i); - } - } - return (ZFS_PROP_CONT); -} +static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; -zfs_prop_t -zfs_prop_iter(zfs_prop_f func, void *cb, boolean_t show_all) +zprop_desc_t * +zfs_prop_get_table(void) { - return (zfs_prop_iter_common(func, cb, ZFS_TYPE_ANY, show_all)); + return (zfs_prop_table); } -zpool_prop_t -zpool_prop_iter(zpool_prop_f func, void *cb, boolean_t show_all) +void +zfs_prop_init(void) { - return (zfs_prop_iter_common(func, cb, ZFS_TYPE_POOL, show_all)); -} + static zprop_index_t checksum_table[] = { + { "on", ZIO_CHECKSUM_ON }, + { "off", ZIO_CHECKSUM_OFF }, + { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, + { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, + { "sha256", ZIO_CHECKSUM_SHA256 }, + { NULL } + }; + + static zprop_index_t compress_table[] = { + { "on", ZIO_COMPRESS_ON }, + { "off", ZIO_COMPRESS_OFF }, + { "lzjb", ZIO_COMPRESS_LZJB }, + { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ + { "gzip-1", ZIO_COMPRESS_GZIP_1 }, + { "gzip-2", ZIO_COMPRESS_GZIP_2 }, + { "gzip-3", ZIO_COMPRESS_GZIP_3 }, + { "gzip-4", ZIO_COMPRESS_GZIP_4 }, + { "gzip-5", ZIO_COMPRESS_GZIP_5 }, + { "gzip-6", ZIO_COMPRESS_GZIP_6 }, + { "gzip-7", ZIO_COMPRESS_GZIP_7 }, + { "gzip-8", ZIO_COMPRESS_GZIP_8 }, + { "gzip-9", ZIO_COMPRESS_GZIP_9 }, + { NULL } + }; + + static zprop_index_t snapdir_table[] = { + { "hidden", ZFS_SNAPDIR_HIDDEN }, + { "visible", ZFS_SNAPDIR_VISIBLE }, + { NULL } + }; + + static zprop_index_t acl_mode_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "groupmask", ZFS_ACL_GROUPMASK }, + { "passthrough", ZFS_ACL_PASSTHROUGH }, + { NULL } + }; + + static zprop_index_t acl_inherit_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "noallow", ZFS_ACL_NOALLOW }, + { "restricted", ZFS_ACL_RESTRICTED }, + { "passthrough", ZFS_ACL_PASSTHROUGH }, + { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */ + { NULL } + }; + + static zprop_index_t case_table[] = { + { "sensitive", ZFS_CASE_SENSITIVE }, + { "insensitive", ZFS_CASE_INSENSITIVE }, + { "mixed", ZFS_CASE_MIXED }, + { NULL } + }; + + static zprop_index_t copies_table[] = { + { "1", 1 }, + { "2", 2 }, + { "3", 3 }, + { NULL } + }; -zfs_proptype_t -zfs_prop_get_type(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_proptype); -} - -static boolean_t -propname_match(const char *p, zfs_prop_t prop, size_t len) -{ - const char *propname = zfs_prop_table[prop].pd_name; -#ifndef _KERNEL - const char *colname = zfs_prop_table[prop].pd_colname; - int c; -#endif - -#ifndef _KERNEL - if (colname == NULL) - return (B_FALSE); -#endif - - if (len == strlen(propname) && - strncmp(p, propname, len) == 0) - return (B_TRUE); - -#ifndef _KERNEL - if (len != strlen(colname)) - return (B_FALSE); - - for (c = 0; c < len; c++) - if (p[c] != tolower(colname[c])) - break; - - return (colname[c] == '\0'); -#else - return (B_FALSE); -#endif -} - -zfs_prop_t -zfs_name_to_prop_cb(zfs_prop_t prop, void *cb_data) -{ - const char *propname = cb_data; - - if (propname_match(propname, prop, strlen(propname))) - return (prop); - - return (ZFS_PROP_CONT); + /* + * Use the unique flags we have to send to u8_strcmp() and/or + * u8_textprep() to represent the various normalization property + * values. + */ + static zprop_index_t normalize_table[] = { + { "none", 0 }, + { "formD", U8_TEXTPREP_NFD }, + { "formKC", U8_TEXTPREP_NFKC }, + { "formC", U8_TEXTPREP_NFC }, + { "formKD", U8_TEXTPREP_NFKD }, + { NULL } + }; + + static zprop_index_t version_table[] = { + { "1", 1 }, + { "2", 2 }, + { "3", 3 }, + { "current", ZPL_VERSION }, + { NULL } + }; + + static zprop_index_t boolean_table[] = { + { "off", 0 }, + { "on", 1 }, + { NULL } + }; + + static zprop_index_t canmount_table[] = { + { "off", ZFS_CANMOUNT_OFF }, + { "on", ZFS_CANMOUNT_ON }, + { "noauto", ZFS_CANMOUNT_NOAUTO }, + { NULL } + }; + + static zprop_index_t cache_table[] = { + { "none", ZFS_CACHE_NONE }, + { "metadata", ZFS_CACHE_METADATA }, + { "all", ZFS_CACHE_ALL }, + { NULL } + }; + + /* inherit index properties */ + register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", + checksum_table); + register_index(ZFS_PROP_COMPRESSION, "compression", + ZIO_COMPRESS_DEFAULT, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table); + register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "hidden | visible", "SNAPDIR", snapdir_table); + register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "discard | groupmask | passthrough", "ACLMODE", acl_mode_table); + register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "discard | noallow | restricted | passthrough", + "ACLINHERIT", acl_inherit_table); + register_index(ZFS_PROP_COPIES, "copies", 1, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "1 | 2 | 3", "COPIES", copies_table); + register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", + ZFS_CACHE_ALL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "all | none | metadata", "PRIMARYCACHE", cache_table); + register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", + ZFS_CACHE_ALL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "all | none | metadata", "SECONDARYCACHE", cache_table); + + /* inherit index (boolean) properties */ + register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); + register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", + boolean_table); + register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", + boolean_table); + register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", + boolean_table); + register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", + boolean_table); + register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); + register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", + boolean_table); + register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", + boolean_table); + register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", + boolean_table); + + /* default index properties */ + register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "1 | 2 | 3 | current", "VERSION", version_table); + register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, + PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", + "CANMOUNT", canmount_table); + + /* readonly index (boolean) properties */ + register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); + + /* set once index properties */ + register_index(ZFS_PROP_NORMALIZE, "normalization", 0, + PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "none | formC | formD | formKC | formKD", "NORMALIZATION", + normalize_table); + register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE, + PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "sensitive | insensitive | mixed", "CASE", case_table); + + /* set once index (boolean) properties */ + register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "on | off", "UTF8ONLY", boolean_table); + + /* string properties */ + register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN"); + register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT"); + register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS"); + register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT, + ZFS_TYPE_DATASET, "on | off | type=<type>", "SHAREISCSI"); + register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, + ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE"); + register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB"); + + /* readonly number properties */ + register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, + ZFS_TYPE_DATASET, "<size>", "USED"); + register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL"); + register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY, + ZFS_TYPE_DATASET, "<size>", "REFER"); + register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, + PROP_READONLY, ZFS_TYPE_DATASET, + "<1.00x or higher if compressed>", "RATIO"); + register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192, + PROP_ONETIME, + ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); + register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDSNAP"); + register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDDS"); + register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDCHILD"); + register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, + PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV"); + + /* default number properties */ + register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA"); + register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV"); + register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, + ZFS_TYPE_VOLUME, "<size>", "VOLSIZE"); + register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA"); + register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, + PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "<size> | none", "REFRESERV"); + + /* inherit number properties */ + register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE, + PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE"); + + /* hidden properties */ + register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, NULL); + register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL); + register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); + register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, + PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); + register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY, + ZFS_TYPE_DATASET, "GUID"); + + /* oddball properties */ + register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, + PROP_READONLY, ZFS_TYPE_DATASET, + "<date>", "CREATION", B_FALSE, B_TRUE, NULL); } -/* - * Given a property name and its type, returns the corresponding property ID. - */ -zfs_prop_t -zfs_name_to_prop_common(const char *propname, zfs_type_t type) +boolean_t +zfs_prop_delegatable(zfs_prop_t prop) { - zfs_prop_t prop; - - prop = zfs_prop_iter_common(zfs_name_to_prop_cb, (void *)propname, - type, B_TRUE); - return (prop == ZFS_PROP_CONT ? ZFS_PROP_INVAL : prop); + zprop_desc_t *pd = &zfs_prop_table[prop]; + return (pd->pd_attr != PROP_READONLY); } /* @@ -308,17 +326,9 @@ zfs_name_to_prop_common(const char *propname, zfs_type_t type) zfs_prop_t zfs_name_to_prop(const char *propname) { - return (zfs_name_to_prop_common(propname, ZFS_TYPE_ANY)); + return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } -/* - * Given a pool property name, returns the corresponding property ID. - */ -zpool_prop_t -zpool_name_to_prop(const char *propname) -{ - return (zfs_name_to_prop_common(propname, ZFS_TYPE_POOL)); -} /* * For user property names, we allow all lowercase alphanumeric characters, plus @@ -357,179 +367,85 @@ zfs_prop_user(const char *name) } /* - * Return the default value for the given property. + * Tables of index types, plus functions to convert between the user view + * (strings) and internal representation (uint64_t). */ -const char * -zfs_prop_default_string(zfs_prop_t prop) +int +zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) { - return (zfs_prop_table[prop].pd_strdefault); + return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); } -uint64_t -zfs_prop_default_numeric(zfs_prop_t prop) +int +zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) { - return (zfs_prop_table[prop].pd_numdefault); + return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); } /* - * Returns TRUE if the property is readonly. + * Returns TRUE if the property applies to any of the given dataset types. */ -int -zfs_prop_readonly(zfs_prop_t prop) +boolean_t +zfs_prop_valid_for_type(int prop, zfs_type_t types) { - return (zfs_prop_table[prop].pd_attr == prop_readonly); + return (zprop_valid_for_type(prop, types)); } -/* - * Given a dataset property ID, returns the corresponding name. - * Assuming the zfs dataset propety ID is valid. - */ -const char * -zfs_prop_to_name(zfs_prop_t prop) +zprop_type_t +zfs_prop_get_type(zfs_prop_t prop) { - return (zfs_prop_table[prop].pd_name); + return (zfs_prop_table[prop].pd_proptype); } /* - * Given a pool property ID, returns the corresponding name. - * Assuming the pool propety ID is valid. + * Returns TRUE if the property is readonly. */ -const char * -zpool_prop_to_name(zpool_prop_t prop) +boolean_t +zfs_prop_readonly(zfs_prop_t prop) { - return (zfs_prop_table[prop].pd_name); + return (zfs_prop_table[prop].pd_attr == PROP_READONLY || + zfs_prop_table[prop].pd_attr == PROP_ONETIME); } /* - * Returns TRUE if the property is inheritable. + * Returns TRUE if the property is only allowed to be set once. */ -int -zfs_prop_inheritable(zfs_prop_t prop) +boolean_t +zfs_prop_setonce(zfs_prop_t prop) { - return (zfs_prop_table[prop].pd_attr == prop_inherit); + return (zfs_prop_table[prop].pd_attr == PROP_ONETIME); } -typedef struct zfs_index { - const char *name; - uint64_t index; -} zfs_index_t; - -static zfs_index_t checksum_table[] = { - { "on", ZIO_CHECKSUM_ON }, - { "off", ZIO_CHECKSUM_OFF }, - { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, - { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, - { "sha256", ZIO_CHECKSUM_SHA256 }, - { NULL } -}; - -static zfs_index_t compress_table[] = { - { "on", ZIO_COMPRESS_ON }, - { "off", ZIO_COMPRESS_OFF }, - { "lzjb", ZIO_COMPRESS_LZJB }, - { "gzip", ZIO_COMPRESS_GZIP_6 }, /* the default gzip level */ - { "gzip-1", ZIO_COMPRESS_GZIP_1 }, - { "gzip-2", ZIO_COMPRESS_GZIP_2 }, - { "gzip-3", ZIO_COMPRESS_GZIP_3 }, - { "gzip-4", ZIO_COMPRESS_GZIP_4 }, - { "gzip-5", ZIO_COMPRESS_GZIP_5 }, - { "gzip-6", ZIO_COMPRESS_GZIP_6 }, - { "gzip-7", ZIO_COMPRESS_GZIP_7 }, - { "gzip-8", ZIO_COMPRESS_GZIP_8 }, - { "gzip-9", ZIO_COMPRESS_GZIP_9 }, - { NULL } -}; - -static zfs_index_t snapdir_table[] = { - { "hidden", ZFS_SNAPDIR_HIDDEN }, - { "visible", ZFS_SNAPDIR_VISIBLE }, - { NULL } -}; - -static zfs_index_t acl_mode_table[] = { - { "discard", ZFS_ACL_DISCARD }, - { "groupmask", ZFS_ACL_GROUPMASK }, - { "passthrough", ZFS_ACL_PASSTHROUGH }, - { NULL } -}; - -static zfs_index_t acl_inherit_table[] = { - { "discard", ZFS_ACL_DISCARD }, - { "noallow", ZFS_ACL_NOALLOW }, - { "secure", ZFS_ACL_SECURE }, - { "passthrough", ZFS_ACL_PASSTHROUGH }, - { NULL } -}; - -static zfs_index_t copies_table[] = { - { "1", 1 }, - { "2", 2 }, - { "3", 3 }, - { NULL } -}; - -static zfs_index_t * -zfs_prop_index_table(zfs_prop_t prop) +const char * +zfs_prop_default_string(zfs_prop_t prop) { - switch (prop) { - case ZFS_PROP_CHECKSUM: - return (checksum_table); - case ZFS_PROP_COMPRESSION: - return (compress_table); - case ZFS_PROP_SNAPDIR: - return (snapdir_table); - case ZFS_PROP_ACLMODE: - return (acl_mode_table); - case ZFS_PROP_ACLINHERIT: - return (acl_inherit_table); - case ZFS_PROP_COPIES: - return (copies_table); - default: - return (NULL); - } + return (zfs_prop_table[prop].pd_strdefault); } +uint64_t +zfs_prop_default_numeric(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_numdefault); +} /* - * Tables of index types, plus functions to convert between the user view - * (strings) and internal representation (uint64_t). + * Given a dataset property ID, returns the corresponding name. + * Assuming the zfs dataset property ID is valid. */ -int -zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) +const char * +zfs_prop_to_name(zfs_prop_t prop) { - zfs_index_t *table; - int i; - - if ((table = zfs_prop_index_table(prop)) == NULL) - return (-1); - - for (i = 0; table[i].name != NULL; i++) { - if (strcmp(string, table[i].name) == 0) { - *index = table[i].index; - return (0); - } - } - - return (-1); + return (zfs_prop_table[prop].pd_name); } -int -zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) +/* + * Returns TRUE if the property is inheritable. + */ +boolean_t +zfs_prop_inheritable(zfs_prop_t prop) { - zfs_index_t *table; - int i; - - if ((table = zfs_prop_index_table(prop)) == NULL) - return (-1); - - for (i = 0; table[i].name != NULL; i++) { - if (table[i].index == index) { - *string = table[i].name; - return (0); - } - } - - return (-1); + return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || + zfs_prop_table[prop].pd_attr == PROP_ONETIME); } #ifndef _KERNEL @@ -541,22 +457,6 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) const char * zfs_prop_values(zfs_prop_t prop) { - if (zfs_prop_table[prop].pd_types == ZFS_TYPE_POOL) - return (NULL); - - return (zfs_prop_table[prop].pd_values); -} - -/* - * Returns a string describing the set of acceptable values for the given - * zpool property, or NULL if it cannot be set. - */ -const char * -zpool_prop_values(zfs_prop_t prop) -{ - if (zfs_prop_table[prop].pd_types != ZFS_TYPE_POOL) - return (NULL); - return (zfs_prop_table[prop].pd_values); } @@ -568,8 +468,8 @@ zpool_prop_values(zfs_prop_t prop) int zfs_prop_is_string(zfs_prop_t prop) { - return (zfs_prop_table[prop].pd_proptype == prop_type_string || - zfs_prop_table[prop].pd_proptype == prop_type_index); + return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || + zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); } /* @@ -592,66 +492,4 @@ zfs_prop_align_right(zfs_prop_t prop) return (zfs_prop_table[prop].pd_rightalign); } -/* - * Determines the minimum width for the column, and indicates whether it's fixed - * or not. Only string columns are non-fixed. - */ -size_t -zfs_prop_width(zfs_prop_t prop, boolean_t *fixed) -{ - prop_desc_t *pd = &zfs_prop_table[prop]; - zfs_index_t *idx; - size_t ret; - int i; - - *fixed = B_TRUE; - - /* - * Start with the width of the column name. - */ - ret = strlen(pd->pd_colname); - - /* - * For fixed-width values, make sure the width is large enough to hold - * any possible value. - */ - switch (pd->pd_proptype) { - case prop_type_number: - /* - * The maximum length of a human-readable number is 5 characters - * ("20.4M", for example). - */ - if (ret < 5) - ret = 5; - /* - * 'creation' is handled specially because it's a number - * internally, but displayed as a date string. - */ - if (prop == ZFS_PROP_CREATION) - *fixed = B_FALSE; - break; - case prop_type_boolean: - /* - * The maximum length of a boolean value is 3 characters, for - * "off". - */ - if (ret < 3) - ret = 3; - break; - case prop_type_index: - idx = zfs_prop_index_table(prop); - for (i = 0; idx[i].name != NULL; i++) { - if (strlen(idx[i].name) > ret) - ret = strlen(idx[i].name); - } - break; - - case prop_type_string: - *fixed = B_FALSE; - break; - } - - return (ret); -} - #endif diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h index 133e740ce6bc..da5ae43093e5 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -40,14 +40,87 @@ extern "C" { * in the kernel, but the string value in userland. */ typedef enum { - prop_type_number, /* numeric value */ - prop_type_string, /* string value */ - prop_type_boolean, /* boolean value */ - prop_type_index /* numeric value indexed by string */ -} zfs_proptype_t; - -zfs_proptype_t zfs_prop_get_type(zfs_prop_t); -size_t zfs_prop_width(zfs_prop_t, boolean_t *); + PROP_TYPE_NUMBER, /* numeric value */ + PROP_TYPE_STRING, /* string value */ + PROP_TYPE_INDEX /* numeric value indexed by string */ +} zprop_type_t; + +typedef enum { + PROP_DEFAULT, + PROP_READONLY, + PROP_INHERIT, + /* + * ONETIME properties are a sort of conglomeration of READONLY + * and INHERIT. They can be set only during object creation, + * after that they are READONLY. If not explicitly set during + * creation, they can be inherited. + */ + PROP_ONETIME +} zprop_attr_t; + +typedef struct zfs_index { + const char *pi_name; + uint64_t pi_value; +} zprop_index_t; + +typedef struct { + const char *pd_name; /* human-readable property name */ + int pd_propnum; /* property number */ + zprop_type_t pd_proptype; /* string, boolean, index, number */ + const char *pd_strdefault; /* default for strings */ + uint64_t pd_numdefault; /* for boolean / index / number */ + zprop_attr_t pd_attr; /* default, readonly, inherit */ + int pd_types; /* bitfield of valid dataset types */ + /* fs | vol | snap; or pool */ + const char *pd_values; /* string telling acceptable values */ + const char *pd_colname; /* column header for "zfs list" */ + boolean_t pd_rightalign; /* column alignment for "zfs list" */ + boolean_t pd_visible; /* do we list this property with the */ + /* "zfs get" help message */ + const zprop_index_t *pd_table; /* for index properties, a table */ + /* defining the possible values */ +} zprop_desc_t; + +/* + * zfs dataset property functions + */ +void zfs_prop_init(void); +zprop_type_t zfs_prop_get_type(zfs_prop_t); +boolean_t zfs_prop_delegatable(zfs_prop_t prop); +zprop_desc_t *zfs_prop_get_table(void); + +/* + * zpool property functions + */ +void zpool_prop_init(void); +zprop_type_t zpool_prop_get_type(zpool_prop_t); +zprop_desc_t *zpool_prop_get_table(void); + +/* + * Common routines to initialize property tables + */ +void register_impl(int, const char *, zprop_type_t, uint64_t, + const char *, zprop_attr_t, int, const char *, const char *, + boolean_t, boolean_t, const zprop_index_t *); +void register_string(int, const char *, const char *, zprop_attr_t attr, + int, const char *, const char *); +void register_number(int, const char *, uint64_t, zprop_attr_t, int, + const char *, const char *); +void register_index(int, const char *, uint64_t, zprop_attr_t, int, + const char *, const char *, const zprop_index_t *); +void register_hidden(int, const char *, zprop_type_t, zprop_attr_t, + int, const char *); + +/* + * Common routines for zfs and zpool property management + */ +int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); +int zprop_name_to_prop(const char *, zfs_type_t); +int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); +int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); +const char *zprop_values(int, zfs_type_t); +size_t zprop_width(int, boolean_t *, zfs_type_t); +boolean_t zprop_valid_for_type(int, zfs_type_t); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c new file mode 100644 index 000000000000..f5efe18d248b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c @@ -0,0 +1,186 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zio.h> +#include <sys/spa.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/fs/zfs.h> + +#include "zfs_prop.h" + +#if defined(_KERNEL) +#include <sys/systm.h> +#else +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#endif + +static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; + +zprop_desc_t * +zpool_prop_get_table(void) +{ + return (zpool_prop_table); +} + +void +zpool_prop_init(void) +{ + static zprop_index_t boolean_table[] = { + { "off", 0}, + { "on", 1}, + { NULL } + }; + + static zprop_index_t failuremode_table[] = { + { "wait", ZIO_FAILURE_MODE_WAIT }, + { "continue", ZIO_FAILURE_MODE_CONTINUE }, + { "panic", ZIO_FAILURE_MODE_PANIC }, + { NULL } + }; + + /* string properties */ + register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, + ZFS_TYPE_POOL, "<path>", "ALTROOT"); + register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, + ZFS_TYPE_POOL, "<filesystem>", "BOOTFS"); + register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT, + ZFS_TYPE_POOL, "<file> | none", "CACHEFILE"); + + /* readonly number properties */ + register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "SIZE"); + register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "USED"); + register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "AVAIL"); + register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "CAP"); + register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<guid>", "GUID"); + register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<state>", "HEALTH"); + + /* default number properties */ + register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, + PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION"); + + /* default index (boolean) properties */ + register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, + ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table); + register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT, + ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); + register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, + ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); + + /* default index properties */ + register_index(ZPOOL_PROP_FAILUREMODE, "failmode", + ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, + "wait | continue | panic", "FAILMODE", failuremode_table); + + /* hidden properties */ + register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_POOL, "NAME"); +} + +/* + * Given a property name and its type, returns the corresponding property ID. + */ +zpool_prop_t +zpool_name_to_prop(const char *propname) +{ + return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); +} + +/* + * Given a pool property ID, returns the corresponding name. + * Assuming the pool propety ID is valid. + */ +const char * +zpool_prop_to_name(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_name); +} + +zprop_type_t +zpool_prop_get_type(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_proptype); +} + +boolean_t +zpool_prop_readonly(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_attr == PROP_READONLY); +} + +const char * +zpool_prop_default_string(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_strdefault); +} + +uint64_t +zpool_prop_default_numeric(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_numdefault); +} + +int +zpool_prop_string_to_index(zpool_prop_t prop, const char *string, + uint64_t *index) +{ + return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); +} + +int +zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, + const char **string) +{ + return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); +} + +#ifndef _KERNEL + +const char * +zpool_prop_values(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_values); +} + +const char * +zpool_prop_column_name(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_colname); +} + +boolean_t +zpool_prop_align_right(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_rightalign); +} +#endif diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c new file mode 100644 index 000000000000..87619e1cbf07 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c @@ -0,0 +1,406 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Common routines used by zfs and zpool property management. + */ + +#include <sys/zio.h> +#include <sys/spa.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_znode.h> +#include <sys/fs/zfs.h> + +#include "zfs_prop.h" +#include "zfs_deleg.h" + +#if defined(_KERNEL) +#include <sys/systm.h> +#include <sys/libkern.h> +#else +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#endif + +static zprop_desc_t * +zprop_get_proptable(zfs_type_t type) +{ + if (type == ZFS_TYPE_POOL) + return (zpool_prop_get_table()); + else + return (zfs_prop_get_table()); +} + +static int +zprop_get_numprops(zfs_type_t type) +{ + if (type == ZFS_TYPE_POOL) + return (ZPOOL_NUM_PROPS); + else + return (ZFS_NUM_PROPS); +} + +void +register_impl(int prop, const char *name, zprop_type_t type, + uint64_t numdefault, const char *strdefault, zprop_attr_t attr, + int objset_types, const char *values, const char *colname, + boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl) +{ + zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types); + zprop_desc_t *pd; + + pd = &prop_tbl[prop]; + + ASSERT(pd->pd_name == NULL || pd->pd_name == name); + + pd->pd_name = name; + pd->pd_propnum = prop; + pd->pd_proptype = type; + pd->pd_numdefault = numdefault; + pd->pd_strdefault = strdefault; + pd->pd_attr = attr; + pd->pd_types = objset_types; + pd->pd_values = values; + pd->pd_colname = colname; + pd->pd_rightalign = rightalign; + pd->pd_visible = visible; + pd->pd_table = idx_tbl; +} + +void +register_string(int prop, const char *name, const char *def, + zprop_attr_t attr, int objset_types, const char *values, + const char *colname) +{ + register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, + objset_types, values, colname, B_FALSE, B_TRUE, NULL); + +} + +void +register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr, + int objset_types, const char *values, const char *colname) +{ + register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr, + objset_types, values, colname, B_TRUE, B_TRUE, NULL); +} + +void +register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr, + int objset_types, const char *values, const char *colname, + const zprop_index_t *idx_tbl) +{ + register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, + objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl); +} + +void +register_hidden(int prop, const char *name, zprop_type_t type, + zprop_attr_t attr, int objset_types, const char *colname) +{ + register_impl(prop, name, type, 0, NULL, attr, + objset_types, NULL, colname, B_FALSE, B_FALSE, NULL); +} + + +/* + * A comparison function we can use to order indexes into property tables. + */ +static int +zprop_compare(const void *arg1, const void *arg2) +{ + const zprop_desc_t *p1 = *((zprop_desc_t **)arg1); + const zprop_desc_t *p2 = *((zprop_desc_t **)arg2); + boolean_t p1ro, p2ro; + + p1ro = (p1->pd_attr == PROP_READONLY); + p2ro = (p2->pd_attr == PROP_READONLY); + + if (p1ro == p2ro) + return (strcmp(p1->pd_name, p2->pd_name)); + + return (p1ro ? -1 : 1); +} + +/* + * Iterate over all properties in the given property table, calling back + * into the specified function for each property. We will continue to + * iterate until we either reach the end or the callback function returns + * something other than ZPROP_CONT. + */ +int +zprop_iter_common(zprop_func func, void *cb, boolean_t show_all, + boolean_t ordered, zfs_type_t type) +{ + int i, j, num_props, size, prop; + zprop_desc_t *prop_tbl; + zprop_desc_t **order; + + prop_tbl = zprop_get_proptable(type); + num_props = zprop_get_numprops(type); + size = num_props * sizeof (zprop_desc_t *); + +#if defined(_KERNEL) + order = kmem_alloc(size, KM_SLEEP); +#else + if ((order = malloc(size)) == NULL) + return (ZPROP_CONT); +#endif + + for (j = 0; j < num_props; j++) + order[j] = &prop_tbl[j]; + + if (ordered) { + qsort((void *)order, num_props, sizeof (zprop_desc_t *), + zprop_compare); + } + + prop = ZPROP_CONT; + for (i = 0; i < num_props; i++) { + if ((order[i]->pd_visible || show_all) && + (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) { + prop = order[i]->pd_propnum; + break; + } + } + +#if defined(_KERNEL) + kmem_free(order, size); +#else + free(order); +#endif + return (prop); +} + +static boolean_t +propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) +{ + const char *propname = prop_entry->pd_name; +#ifndef _KERNEL + const char *colname = prop_entry->pd_colname; + int c; + + if (colname == NULL) + return (B_FALSE); +#endif + + if (len == strlen(propname) && + strncmp(p, propname, len) == 0) + return (B_TRUE); + +#ifndef _KERNEL + if (len != strlen(colname)) + return (B_FALSE); + + for (c = 0; c < len; c++) + if (p[c] != tolower(colname[c])) + break; + + return (colname[c] == '\0'); +#else + return (B_FALSE); +#endif +} + +typedef struct name_to_prop_cb { + const char *propname; + zprop_desc_t *prop_tbl; +} name_to_prop_cb_t; + +static int +zprop_name_to_prop_cb(int prop, void *cb_data) +{ + name_to_prop_cb_t *data = cb_data; + + if (propname_match(data->propname, strlen(data->propname), + &data->prop_tbl[prop])) + return (prop); + + return (ZPROP_CONT); +} + +int +zprop_name_to_prop(const char *propname, zfs_type_t type) +{ + int prop; + name_to_prop_cb_t cb_data; + + cb_data.propname = propname; + cb_data.prop_tbl = zprop_get_proptable(type); + + prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data, + B_TRUE, B_FALSE, type); + + return (prop == ZPROP_CONT ? ZPROP_INVAL : prop); +} + +int +zprop_string_to_index(int prop, const char *string, uint64_t *index, + zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + const zprop_index_t *idx_tbl; + int i; + + if (prop == ZPROP_INVAL || prop == ZPROP_CONT) + return (-1); + + ASSERT(prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) + return (-1); + + for (i = 0; idx_tbl[i].pi_name != NULL; i++) { + if (strcmp(string, idx_tbl[i].pi_name) == 0) { + *index = idx_tbl[i].pi_value; + return (0); + } + } + + return (-1); +} + +int +zprop_index_to_string(int prop, uint64_t index, const char **string, + zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + const zprop_index_t *idx_tbl; + int i; + + if (prop == ZPROP_INVAL || prop == ZPROP_CONT) + return (-1); + + ASSERT(prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) + return (-1); + + for (i = 0; idx_tbl[i].pi_name != NULL; i++) { + if (idx_tbl[i].pi_value == index) { + *string = idx_tbl[i].pi_name; + return (0); + } + } + + return (-1); +} + +const char * +zprop_values(int prop, zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + + ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); + ASSERT(prop < zprop_get_numprops(type)); + + prop_tbl = zprop_get_proptable(type); + + return (prop_tbl[prop].pd_values); +} + +/* + * Returns TRUE if the property applies to any of the given dataset types. + */ +boolean_t +zprop_valid_for_type(int prop, zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + + if (prop == ZPROP_INVAL || prop == ZPROP_CONT) + return (B_FALSE); + + ASSERT(prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + return ((prop_tbl[prop].pd_types & type) != 0); +} + +#ifndef _KERNEL + +/* + * Determines the minimum width for the column, and indicates whether it's fixed + * or not. Only string columns are non-fixed. + */ +size_t +zprop_width(int prop, boolean_t *fixed, zfs_type_t type) +{ + zprop_desc_t *prop_tbl, *pd; + const zprop_index_t *idx; + size_t ret; + int i; + + ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); + ASSERT(prop < zprop_get_numprops(type)); + + prop_tbl = zprop_get_proptable(type); + pd = &prop_tbl[prop]; + + *fixed = B_TRUE; + + /* + * Start with the width of the column name. + */ + ret = strlen(pd->pd_colname); + + /* + * For fixed-width values, make sure the width is large enough to hold + * any possible value. + */ + switch (pd->pd_proptype) { + case PROP_TYPE_NUMBER: + /* + * The maximum length of a human-readable number is 5 characters + * ("20.4M", for example). + */ + if (ret < 5) + ret = 5; + /* + * 'creation' is handled specially because it's a number + * internally, but displayed as a date string. + */ + if (prop == ZFS_PROP_CREATION) + *fixed = B_FALSE; + break; + case PROP_TYPE_INDEX: + idx = prop_tbl[prop].pd_table; + for (i = 0; idx[i].pi_name != NULL; i++) { + if (strlen(idx[i].pi_name) > ret) + ret = strlen(idx[i].pi_name); + } + break; + + case PROP_TYPE_STRING: + *fixed = B_FALSE; + break; + } + + return (ret); +} + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 1800e792fa1e..cf49c78a5b0e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -20,11 +20,9 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This Makefile defines all file modules for the directory uts/common # and its children. These are the source files which may be considered # common to all SunOS systems. @@ -46,7 +44,9 @@ ZFS_COMMON_OBJS += \ dsl_pool.o \ dsl_synctask.o \ dmu_zfetch.o \ + dsl_deleg.o \ dsl_prop.o \ + dsl_scrub.o \ fletcher.o \ gzip.o \ lzjb.o \ @@ -64,6 +64,7 @@ ZFS_COMMON_OBJS += \ unique.o \ vdev.o \ vdev_cache.o \ + vdev_file.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ @@ -75,6 +76,7 @@ ZFS_COMMON_OBJS += \ zap_micro.o \ zfs_byteswap.o \ zfs_fm.o \ + zfs_fuid.o \ zfs_znode.o \ zil.o \ zio.o \ @@ -84,7 +86,11 @@ ZFS_COMMON_OBJS += \ ZFS_SHARED_OBJS += \ zfs_namecheck.o \ - zfs_prop.o + zfs_deleg.o \ + zfs_prop.o \ + zfs_comutil.o \ + zpool_prop.o \ + zprop_common.o ZFS_OBJS += \ $(ZFS_COMMON_OBJS) \ @@ -96,6 +102,7 @@ ZFS_OBJS += \ zfs_log.o \ zfs_replay.o \ zfs_rlock.o \ + rrwlock.o \ zfs_vfsops.o \ zfs_vnops.o \ zvol.o diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c index dd2aa82304ab..d9eb88a40202 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c @@ -20,7 +20,7 @@ */ /* Portions Copyright 2007 Shivakumar GN */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/mutex.h> #include <sys/sysmacros.h> #include <sys/systm.h> +#include <sys/sunddi.h> #include <sys/uio.h> #include <sys/vfs.h> #include <sys/vnode.h> @@ -60,7 +61,7 @@ * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, - * without enforcing the filesystem to hand over management to GFS. The + * without forcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() @@ -116,6 +117,42 @@ */ /* + * gfs_get_parent_ino: used to obtain a parent inode number and the + * inode number of the given vnode in preparation for calling gfs_readdir_init. + */ +int +gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + ino64_t *pino, ino64_t *ino) +{ + vnode_t *parent; + gfs_dir_t *dp = dvp->v_data; + int error; + + *ino = dp->gfsd_file.gfs_ino; + parent = dp->gfsd_file.gfs_parent; + + if (parent == NULL) { + *pino = *ino; /* root of filesystem */ + } else if (dvp->v_flag & V_XATTRDIR) { +#ifdef TODO + vattr_t va; + + va.va_mask = AT_NODEID; + error = VOP_GETATTR(parent, &va, 0, cr, ct); + if (error) + return (error); + *pino = va.va_nodeid; +#else + panic("%s:%u: not implemented", __func__, __LINE__); +#endif + } else { + *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; + } + + return (0); +} + +/* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length @@ -123,6 +160,7 @@ * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode + * flags - flags from VOP_READDIR * * Returns 0 or a non-zero errno. * @@ -153,8 +191,10 @@ */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, - uio_t *uiop, ino64_t parent, ino64_t self) + uio_t *uiop, ino64_t parent, ino64_t self, int flags) { + size_t dirent_size; + if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); @@ -162,9 +202,14 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; - st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); + if (flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); st->grd_parent = parent; st->grd_self = self; + st->grd_flags = flags; return (0); } @@ -172,8 +217,8 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, /* * gfs_readdir_emit_int: internal routine to emit directory entry * - * st - the current readdir state, which must have d_ino and d_name - * set + * st - the current readdir state, which must have d_ino/ed_ino + * and d_name/ed_name set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ @@ -182,9 +227,18 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, int *ncookies, u_long **cookies) { int reclen, namlen; + dirent64_t *dp; + edirent_t *edp; - namlen = strlen(st->grd_dirent->d_name); - reclen = DIRENT64_RECLEN(namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp = st->grd_dirent; + namlen = strlen(edp->ed_name); + reclen = EDIRENT_RECLEN(namlen); + } else { + dp = st->grd_dirent; + namlen = strlen(dp->d_name); + reclen = DIRENT64_RECLEN(namlen); + } if (reclen > uiop->uio_resid) { /* @@ -195,10 +249,15 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, return (-1); } - /* XXX: This can change in the future. */ - st->grd_dirent->d_type = DT_DIR; - st->grd_dirent->d_reclen = (ushort_t)reclen; - st->grd_dirent->d_namlen = namlen; + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp->ed_off = next; + edp->ed_reclen = (ushort_t)reclen; + } else { + /* XXX: This can change in the future. */ + dp->d_reclen = (ushort_t)reclen; + dp->d_type = DT_DIR; + dp->d_namlen = namlen; + } if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); @@ -219,6 +278,7 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name + * eflags - value for ed_eflags (if processing edirent_t) * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or @@ -227,12 +287,22 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, - ino64_t ino, const char *name, int *ncookies, u_long **cookies) + ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies) { offset_t off = (voff + 2) * st->grd_ureclen; - st->grd_dirent->d_ino = ino; - (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edirent_t *edp = st->grd_dirent; + + edp->ed_ino = ino; + (void) strncpy(edp->ed_name, name, st->grd_namlen); + edp->ed_eflags = eflags; + } else { + dirent64_t *dp = st->grd_dirent; + + dp->d_ino = ino; + (void) strncpy(dp->d_name, name, st->grd_namlen); + } /* * Inter-entry offsets are invalid, so we assume a record size of @@ -266,11 +336,11 @@ top: voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, - ".", ncookies, cookies)) == 0) + ".", 0, ncookies, cookies)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, - "..", ncookies, cookies)) == 0) + "..", 0, ncookies, cookies)) == 0) goto top; } else { *voffp = voff; @@ -292,7 +362,13 @@ top: int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { - kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); + size_t dirent_size; + + if (st->grd_flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + kmem_free(st->grd_dirent, dirent_size); if (error > 0) return (error); if (eofp) @@ -485,7 +561,7 @@ gfs_file_inactive(vnode_t *vp) gfs_dir_t *dp = NULL; void *data; - if (fp->gfs_parent == NULL) + if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) goto found; dp = fp->gfs_parent->v_data; @@ -511,6 +587,8 @@ gfs_file_inactive(vnode_t *vp) ge = NULL; found: + if (vp->v_flag & V_XATTRDIR) + VI_LOCK(fp->gfs_parent); VI_LOCK(vp); ASSERT(vp->v_count < 2); /* @@ -535,7 +613,8 @@ found: * Free vnode and release parent */ if (fp->gfs_parent) { - gfs_dir_unlock(dp); + if (dp) + gfs_dir_unlock(dp); VI_LOCK(fp->gfs_parent); fp->gfs_parent->v_usecount--; VI_UNLOCK(fp->gfs_parent); @@ -543,6 +622,8 @@ found: ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } + if (vp->v_flag & V_XATTRDIR) + VI_UNLOCK(fp->gfs_parent); return (data); } @@ -570,55 +651,119 @@ gfs_dir_inactive(vnode_t *vp) } /* - * gfs_dir_lookup() + * gfs_dir_lookup_dynamic() * - * Looks up the given name in the directory and returns the corresponding vnode, - * if found. + * This routine looks up the provided name amongst the dynamic entries + * in the gfs directory and returns the corresponding vnode, if found. * - * First, we search statically defined entries, if any. If a match is found, - * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the - * existing vnode. Otherwise, we call the static entry's callback routine, - * caching the result if necessary. + * The gfs directory is expected to be locked by the caller prior to + * calling this function. The directory will be unlocked during the + * execution of this function, but will be locked upon return from the + * function. This function returns 0 on success, non-zero on error. * - * If no static entry is found, we invoke the lookup callback, if any. The - * arguments to this callback are: + * The dynamic lookups are performed by invoking the lookup + * callback, which is passed to this function as the first argument. + * The arguments to the callback are: * - * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); + * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, + * int flags, int *deflgs, pathname_t *rpnp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode + * cr - pointer to cred + * flags - flags value from lookup request + * ignored here; currently only used to request + * insensitive lookups + * direntflgs - output parameter, directory entry flags + * ignored here; currently only used to indicate a lookup + * has more than one possible match when case is not considered + * realpnp - output parameter, real pathname + * ignored here; when lookup was performed case-insensitively, + * this field contains the "real" name of the file. * * Returns 0 on success, non-zero on error. */ -int -gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) +static int +gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, + const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, + int *direntflags, pathname_t *realpnp) { - int i; - gfs_dirent_t *ge; - vnode_t *vp; - gfs_dir_t *dp = dvp->v_data; - int ret = 0; - - ASSERT(dvp->v_type == VDIR); + gfs_file_t *fp; + ino64_t ino; + int ret; - if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) - return (0); + ASSERT(GFS_DIR_LOCKED(dp)); + /* + * Drop the directory lock, as the lookup routine + * will need to allocate memory, or otherwise deadlock on this + * directory. + */ + gfs_dir_unlock(dp); + ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); gfs_dir_lock(dp); /* + * The callback for extended attributes returns a vnode + * with v_data from an underlying fs. + */ + if (ret == 0 && !IS_XATTRDIR(dvp)) { + fp = (gfs_file_t *)((*vpp)->v_data); + fp->gfs_index = -1; + fp->gfs_ino = ino; + } + + return (ret); +} + +/* + * gfs_dir_lookup_static() + * + * This routine looks up the provided name amongst the static entries + * in the gfs directory and returns the corresponding vnode, if found. + * The first argument to the function is a pointer to the comparison + * function this function should use to decide if names are a match. + * + * If a match is found, and GFS_CACHE_VNODE is set and the vnode + * exists, we simply return the existing vnode. Otherwise, we call + * the static entry's callback routine, caching the result if + * necessary. If the idx pointer argument is non-NULL, we use it to + * return the index of the matching static entry. + * + * The gfs directory is expected to be locked by the caller prior to calling + * this function. The directory may be unlocked during the execution of + * this function, but will be locked upon return from the function. + * + * This function returns 0 if a match is found, ENOENT if not. + */ +static int +gfs_dir_lookup_static(int (*compare)(const char *, const char *), + gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, + vnode_t **vpp, pathname_t *rpnp) +{ + gfs_dirent_t *ge; + vnode_t *vp = NULL; + int i; + + ASSERT(GFS_DIR_LOCKED(dp)); + + /* * Search static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; - if (strcmp(ge->gfse_name, nm) == 0) { + if (compare(ge->gfse_name, nm) == 0) { + if (rpnp) + (void) strlcpy(rpnp->pn_buf, ge->gfse_name, + rpnp->pn_bufsize); + if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); - goto out; + break; } /* @@ -626,8 +771,8 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set - * for this entry, we discard the result in favor of the - * cached vnode. + * for this entry, we discard the result in favor of + * the cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); @@ -660,49 +805,94 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) gfs_dir_lock(dp); } } - - goto out; + break; } } - /* - * See if there is a dynamic constructor. - */ - if (dp->gfsd_lookup) { - ino64_t ino; - gfs_file_t *fp; + if (vp == NULL) + return (ENOENT); + else if (idx) + *idx = i; + *vpp = vp; + return (0); +} - /* - * Once again, drop the directory lock, as the lookup routine - * will need to allocate memory, or otherwise deadlock on this - * directory. - */ - gfs_dir_unlock(dp); - ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); - gfs_dir_lock(dp); - if (ret != 0) - goto out; +/* + * gfs_dir_lookup() + * + * Looks up the given name in the directory and returns the corresponding + * vnode, if found. + * + * First, we search statically defined entries, if any, with a call to + * gfs_dir_lookup_static(). If no static entry is found, and we have + * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). + * + * This function returns 0 on success, non-zero on error. + */ +int +gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, + int flags, int *direntflags, pathname_t *realpnp) +{ + gfs_dir_t *dp = dvp->v_data; + boolean_t casecheck; + vnode_t *dynvp = NULL; + vnode_t *vp = NULL; + int (*compare)(const char *, const char *); + int error, idx; - fp = (gfs_file_t *)vp->v_data; - fp->gfs_index = -1; - fp->gfs_ino = ino; - } else { - /* - * No static entry found, and there is no lookup callback, so - * return ENOENT. - */ - ret = ENOENT; + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) + return (0); + + casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; + if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || + (flags & FIGNORECASE)) + compare = strcasecmp; + else + compare = strcmp; + + gfs_dir_lock(dp); + + error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); + + if (vp && casecheck) { + gfs_dirent_t *ge; + int i; + + for (i = idx + 1; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (strcasecmp(ge->gfse_name, nm) == 0) { + *direntflags |= ED_CASE_CONFLICT; + goto out; + } + } + } + + if ((error || casecheck) && dp->gfsd_lookup) + error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, + &dynvp, cr, flags, direntflags, vp ? NULL : realpnp); + + if (vp && dynvp) { + /* static and dynamic entries are case-insensitive conflict */ + ASSERT(casecheck); + *direntflags |= ED_CASE_CONFLICT; + VN_RELE(dynvp); + } else if (vp == NULL) { + vp = dynvp; + } else if (error == ENOENT) { + error = 0; + } else if (error) { + VN_RELE(vp); + vp = NULL; } out: gfs_dir_unlock(dp); - if (ret == 0) - *vpp = vp; - else - *vpp = NULL; - - return (ret); + *vpp = vp; + return (error); } /* @@ -731,13 +921,15 @@ out: * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * - * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - * offset_t *off, offset_t *nextoff, void *data) + * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, + * offset_t *off, offset_t *nextoff, void *data, int flags) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and - * d_ino. + * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags + * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS + * is set in 'flags'. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left @@ -745,12 +937,13 @@ out: * nextoff - callback must set to offset of next entry. Typically * (off + 1) * data - caller-supplied data + * flags - VOP_READDIR flags * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, - u_long **cookies, void *data) + u_long **cookies, void *data, cred_t *cr, int flags) { gfs_readdir_state_t gstate; int error, eof = 0; @@ -758,16 +951,12 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, offset_t off, next; gfs_dir_t *dp = dvp->v_data; - ino = dp->gfsd_file.gfs_ino; - - if (dp->gfsd_file.gfs_parent == NULL) - pino = ino; /* root of filesystem */ - else - pino = ((gfs_file_t *) - (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; + error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino); + if (error) + return (error); if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, - pino, ino)) != 0) + pino, ino, flags)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, @@ -777,8 +966,8 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, - off, ino, dp->gfsd_static[off].gfse_name, ncookies, - cookies)) != 0) + off, ino, dp->gfsd_static[off].gfse_name, 0, + ncookies, cookies)) != 0) break; } else if (dp->gfsd_readdir) { @@ -786,7 +975,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, - data)) != 0 || eof) + data, flags)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; @@ -808,6 +997,21 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, } /* + * gfs_vop_lookup: VOP_LOOKUP() entry point + * + * For use directly in vnode ops table. Given a GFS directory, calls + * gfs_dir_lookup() as necessary. + */ +/* ARGSUSED */ +int +gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); +} + +/* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. Given a GFS directory, calls @@ -827,6 +1031,7 @@ gfs_vop_readdir(ap) { vnode_t *vp = ap->a_vp; uio_t *uiop = ap->a_uio; + cred_t *cr = ap->a_cred; int *eofp = ap->a_eofflag; int ncookies = 0; u_long *cookies = NULL; @@ -842,7 +1047,8 @@ gfs_vop_readdir(ap) *ap->a_ncookies = ncookies; } - error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); + error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL, + cr, 0); if (error == 0) { /* Subtract unused cookies */ @@ -882,6 +1088,9 @@ gfs_vop_inactive(ap) if (data != NULL) kmem_free(data, fp->gfs_size); + + VI_LOCK(vp); vp->v_data = NULL; + VI_UNLOCK(vp); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c new file mode 100644 index 000000000000..00a10aae8ec9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/vnode.h> + +/* Extensible attribute (xva) routines. */ + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = AT_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. + */ +xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 420f802f360d..7ca528033c4f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DVA-based Adjustable Replacement Cache * @@ -47,13 +45,13 @@ * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slowes the flow of new data - * into the cache until we can make space avaiable. + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with - * high use, but also tries to react to memory preasure from the + * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * @@ -75,7 +73,7 @@ * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() inerface + * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefor provide two * types of locks: 1) the hash table lock array, and 2) the @@ -109,6 +107,14 @@ * * Note that the majority of the performance stats are manipulated * with atomic operations. + * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists */ #include <sys/spa.h> @@ -117,6 +123,7 @@ #include <sys/zfs_context.h> #include <sys/arc.h> #include <sys/refcount.h> +#include <sys/vdev.h> #ifdef _KERNEL #include <sys/dnlc.h> #endif @@ -128,6 +135,10 @@ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; +extern int zfs_write_limit_shift; +extern uint64_t zfs_write_limit_max; +extern kmutex_t zfs_write_limit_lock; + #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; @@ -148,28 +159,45 @@ static int arc_min_prefetch_lifespan; static int arc_dead; /* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* * These tunables are for performance analysis. */ -u_long zfs_arc_max; -u_long zfs_arc_min; -TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); -TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +int zfs_mdcomp_disable = 0; + +TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); +TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); +TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); +TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); SYSCTL_DECL(_vfs_zfs); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* - * Note that buffers can be on one of 5 states: + * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache - * When there are no active references to the buffer, they - * are linked onto one of the lists in arc. These are the - * only buffers that can be evicted or deleted. + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies @@ -177,21 +205,30 @@ SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. */ typedef struct arc_state { - list_t arcs_list; /* linked list of evictable buffer in state */ - uint64_t arcs_lsize; /* total size of buffers in the linked list */ - uint64_t arcs_size; /* total size of all buffers in this state */ + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; } arc_state_t; -/* The 5 states: */ +/* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; @@ -222,6 +259,24 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_memory_throttle_count; } arc_stats_t; static arc_stats_t arc_stats = { @@ -252,7 +307,25 @@ static arc_stats_t arc_stats = { { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 } + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -299,6 +372,7 @@ static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- @@ -316,13 +390,21 @@ static arc_state_t *arc_mfu_ghost; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_meta_used; +static uint64_t arc_meta_limit; +static uint64_t arc_meta_max = 0; +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN, + &arc_meta_used, 0, "ARC metadata used"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN, + &arc_meta_limit, 0, "ARC metadata limit"); + +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; - arc_byteswap_func_t *acb_byteswap; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; @@ -368,6 +450,9 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; }; static arc_buf_t *arc_eviction_list; @@ -375,9 +460,12 @@ static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); +static int arc_evict_needed(arc_buf_contents_t type); +static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) /* * Private ARC flags. These flags are private ARC only flags that will show up @@ -393,12 +481,31 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ +#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) + +/* + * Other sizes + */ + +#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) /* * Hash table routines @@ -431,8 +538,90 @@ static buf_hash_table_t buf_hash_table; uint64_t zfs_crc64_table[256]; +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* desired write size, bytes */ + uint64_t l2ad_boost; /* warmup write boost, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + daddr_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + static uint64_t -buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) +buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) { uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; @@ -460,7 +649,7 @@ buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -579,6 +768,20 @@ hdr_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_hdr_t)); refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); return (0); } @@ -594,6 +797,18 @@ hdr_dest(void *vbuf, void *unused) refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); + mutex_destroy(&buf->b_freeze_lock); + + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + rw_destroy(&buf->b_lock); } /* @@ -639,7 +854,7 @@ retry: hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) @@ -673,10 +888,24 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_freeze_lock); } +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + static void -arc_cksum_compute(arc_buf_t *buf) +arc_cksum_compute(arc_buf_t *buf, boolean_t force) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); @@ -693,14 +922,14 @@ arc_cksum_compute(arc_buf_t *buf) void arc_buf_thaw(arc_buf_t *buf) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + } - if (buf->b_hdr->b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) - panic("modifying buffer while i/o in progress!"); - arc_cksum_verify(buf); mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); @@ -717,7 +946,7 @@ arc_buf_freeze(arc_buf_t *buf) ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); } static void @@ -728,21 +957,23 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); mutex_enter(&ab->b_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&ab->b_state->arcs_list, ab); + list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { ASSERT3U(ab->b_datacnt, ==, 0); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } ASSERT(delta > 0); - ASSERT3U(ab->b_state->arcs_lsize, >=, delta); - atomic_add_64(&ab->b_state->arcs_lsize, -delta); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); mutex_exit(&ab->b_state->arcs_mtx); - /* remove the prefetch flag is we get a reference */ + /* remove the prefetch flag if we get a reference */ if (ab->b_flags & ARC_PREFETCH) ab->b_flags &= ~ARC_PREFETCH; } @@ -759,13 +990,14 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list, ab); + list_insert_head(&state->arcs_list[ab->b_type], ab); ASSERT(ab->b_datacnt > 0); - atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); - ASSERT3U(state->arcs_size, >=, state->arcs_lsize); + atomic_add_64(size, ab->b_size * ab->b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -796,12 +1028,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + uint64_t *size = &old_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list, ab); + list_remove(&old_state->arcs_list[ab->b_type], ab); /* * If prefetching out of the ghost cache, @@ -812,19 +1045,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); from_delta = ab->b_size; } - ASSERT3U(old_state->arcs_lsize, >=, from_delta); - atomic_add_64(&old_state->arcs_lsize, -from_delta); + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); if (use_mutex) mutex_exit(&old_state->arcs_mtx); } if (new_state != arc_anon) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list, ab); + list_insert_head(&new_state->arcs_list[ab->b_type], ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -832,9 +1066,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); to_delta = ab->b_size; } - atomic_add_64(&new_state->arcs_lsize, to_delta); - ASSERT3U(new_state->arcs_size + to_delta, >=, - new_state->arcs_lsize); + atomic_add_64(size, to_delta); if (use_mutex) mutex_exit(&new_state->arcs_mtx); @@ -842,7 +1074,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && old_state != arc_anon) { + if (new_state == arc_anon) { buf_hash_remove(ab); } @@ -854,6 +1086,47 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) atomic_add_64(&old_state->arcs_size, -from_delta); } ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); +} + +void +arc_space_consume(uint64_t space) +{ + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space) +{ + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); } arc_buf_t * @@ -863,15 +1136,14 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; hdr->b_spa = spa; hdr->b_state = arc_anon; hdr->b_arc_access = 0; - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -894,7 +1166,7 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -914,28 +1186,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) kmutex_t *hash_lock; /* - * Check to see if this buffer is currently being evicted via - * arc_do_user_evicts(). + * Check to see if this buffer is evicted. Callers + * must verify b_data != NULL to know if the add_ref + * was successful. */ - mutex_enter(&arc_eviction_mtx); - hdr = buf->b_hdr; - if (hdr == NULL) { - mutex_exit(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_READER); + if (buf->b_data == NULL) { + rw_exit(&buf->b_lock); return; } + hdr = buf->b_hdr; + ASSERT(hdr != NULL); hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - mutex_enter(hash_lock); - if (buf->b_data == NULL) { - /* - * This buffer is evicted. - */ - mutex_exit(hash_lock); - return; - } + rw_exit(&buf->b_lock); - ASSERT(buf->b_hdr == hdr); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); arc_access(hdr, hash_lock); @@ -946,6 +1211,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. + */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) { @@ -960,18 +1248,24 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_cksum_verify(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); + arc_space_return(size); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); + atomic_add_64(&arc_size, -size); } - atomic_add_64(&arc_size, -size); } if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ASSERT(state != arc_anon); - ASSERT3U(state->arcs_lsize, >=, size); - atomic_add_64(&state->arcs_lsize, -size); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); } ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); @@ -1002,6 +1296,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!(hdr->b_flags & ARC_STORED)); + + if (hdr->b_l2hdr != NULL) { + if (!MUTEX_HELD(&l2arc_buflist_mtx)) { + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. + */ + mutex_enter(&l2arc_buflist_mtx); + if (hdr->b_l2hdr != NULL) { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, + hdr); + } + mutex_exit(&l2arc_buflist_mtx); + } else { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + } + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); @@ -1014,12 +1337,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1029,7 +1354,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_destroy(&hdr->b_freeze_lock); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1124,14 +1448,19 @@ arc_buf_size(arc_buf_t *buf) * - return the data block from this buffer rather than freeing it. * This flag is used by callers that are trying to make space for a * new buffer in a full arc cache. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; @@ -1143,10 +1472,11 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || + (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) { skipped++; @@ -1163,10 +1493,15 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; + if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + missed += 1; + break; + } if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && - ab->b_size == bytes) { + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { stolen = buf->b_data; recycle = FALSE; } @@ -1180,16 +1515,20 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); } else { + rw_exit(&buf->b_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } - ASSERT(ab->b_datacnt == 0); - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags = ARC_IN_HASH_TABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (ab->b_datacnt == 0) { + arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + } if (!have_lock) mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) @@ -1212,6 +1551,27 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, if (missed) ARCSTAT_INCR(arcstat_mutex_miss, missed); + /* + * We have just evicted some date into the ghost state, make + * sure we also adjust the ghost state size if necessary. + */ + if (arc_no_grow && + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { + int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + int64_t todelete = + MIN(arc_mru_ghost->arcs_lsize[type], mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { + int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } + return (stolen); } @@ -1220,9 +1580,10 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, int64_t bytes) +arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; @@ -1230,17 +1591,30 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes) ASSERT(GHOST_STATE(state)); top: mutex_enter(&state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + if (spa && ab->b_spa != spa) + continue; hash_lock = HDR_LOCK(ab); if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); - arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += ab->b_size; - arc_hdr_destroy(ab); + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; @@ -1256,6 +1630,12 @@ top: } mutex_exit(&state->arcs_mtx); + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); @@ -1271,38 +1651,58 @@ arc_adjust(void) { int64_t top_sz, mru_over, arc_over, todelete; - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } - if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); - (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, + ARC_BUFC_METADATA); top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; if (mru_over > 0) { - if (arc_mru_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); - arc_evict_ghost(arc_mru_ghost, todelete); + if (arc_mru_ghost->arcs_size > 0) { + todelete = MIN(arc_mru_ghost->arcs_size, mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); } } if ((arc_over = arc_size - arc_c) > 0) { int64_t tbl_over; - if (arc_mfu->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); - (void) arc_evict(arc_mfu, toevict, FALSE, - ARC_BUFC_UNDEF); + if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_DATA); + arc_over = arc_size - arc_c; + } + + if (arc_over > 0 && + arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], + arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_METADATA); } - tbl_over = arc_size + arc_mru_ghost->arcs_lsize + - arc_mfu_ghost->arcs_lsize - arc_c*2; + tbl_over = arc_size + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c * 2; - if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); - arc_evict_ghost(arc_mfu_ghost, todelete); + if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { + todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); } } } @@ -1314,7 +1714,9 @@ arc_do_user_evicts(void) while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; + rw_enter(&buf->b_lock, RW_WRITER); buf->b_hdr = NULL; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) @@ -1329,24 +1731,40 @@ arc_do_user_evicts(void) } /* - * Flush all *evictable* data from the cache. + * Flush all *evictable* data from the cache for the given spa. * NOTE: this will not touch "active" (i.e. referenced) data. */ void -arc_flush(void) +arc_flush(spa_t *spa) { - while (list_head(&arc_mru->arcs_list)) - (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); - while (list_head(&arc_mfu->arcs_list)) - (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } - arc_evict_ghost(arc_mru_ghost, -1); - arc_evict_ghost(arc_mfu_ghost, -1); + arc_evict_ghost(arc_mru_ghost, spa, -1); + arc_evict_ghost(arc_mfu_ghost, spa, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); mutex_exit(&arc_reclaim_thr_lock); - ASSERT(arc_eviction_list == NULL); + ASSERT(spa || arc_eviction_list == NULL); } int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ @@ -1380,7 +1798,7 @@ arc_shrink(void) arc_adjust(); } -static int zfs_needfree = 0; +static int needfree = 0; static int arc_reclaim_needed(void) @@ -1391,13 +1809,28 @@ arc_reclaim_needed(void) #ifdef _KERNEL - if (zfs_needfree) + if (needfree) return (1); #if 0 /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* * check to make sure that swapfs has enough space so that anon - * reservations can still succeeed. anon_resvmem() checks that the + * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. @@ -1405,23 +1838,6 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); - /* - * If zio data pages are being allocated out of a separate heap segment, - * then check that the size of available vmem for this area remains - * above 1/4th free. This needs to be done when the size of the - * non-default segment is smaller than physical memory, so we could - * conceivably run out of VA in that segment before running out of - * physical memory. - */ - if (zio_arena != NULL) { - size_t arc_ziosize = - btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); - - if ((physmem > arc_ziosize) && - (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) - return (1); - } - #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -1431,7 +1847,7 @@ arc_reclaim_needed(void) * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the caclulation, if less than 1/4th is + * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ if (btop(vmem_size(heap_arena, VMEM_FREE)) < @@ -1462,12 +1878,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif #ifdef _KERNEL - /* - * First purge some DNLC entries, in case the DNLC is using - * up too much memory. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. @@ -1477,7 +1894,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif /* - * An agressive reclamation will shrink the cache size as well as + * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) @@ -1526,11 +1943,10 @@ arc_reclaim_thread(void *dummy __unused) /* reset the growth delay for every reclaim */ growtime = LBOLT + (arc_grow_retry * hz); - ASSERT(growtime > 0); - if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { + if (needfree && last_reclaim == ARC_RECLAIM_CONS) { /* - * If zfs_needfree is TRUE our vm_lowmem hook + * If needfree is TRUE our vm_lowmem hook * was called and in that case we must free some * memory, so switch to aggressive mode. */ @@ -1538,11 +1954,13 @@ arc_reclaim_thread(void *dummy __unused) last_reclaim = ARC_RECLAIM_AGGR; } arc_kmem_reap_now(last_reclaim); - } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) { + arc_warm = B_TRUE; + + } else if (arc_no_grow && LBOLT >= growtime) { arc_no_grow = FALSE; } - if (zfs_needfree || + if (needfree || (2 * arc_c < arc_size + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) arc_adjust(); @@ -1551,9 +1969,9 @@ arc_reclaim_thread(void *dummy __unused) arc_do_user_evicts(); if (arc_reclaim_needed()) { - zfs_needfree = 0; + needfree = 0; #ifdef _KERNEL - wakeup(&zfs_needfree); + wakeup(&needfree); #endif } @@ -1580,6 +1998,9 @@ arc_adapt(int bytes, arc_state_t *state) { int mult; + if (state == arc_l2c_only) + return; + ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: @@ -1634,8 +2055,25 @@ arc_adapt(int bytes, arc_state_t *state) * prior to insert. */ static int -arc_evict_needed() +arc_evict_needed(arc_buf_contents_t type) { + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#if 0 +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif +#endif + if (arc_reclaim_needed()) return (1); @@ -1678,14 +2116,15 @@ arc_get_data_buf(arc_buf_t *buf) * We have not yet reached cache maximum size, * just allocate a new buffer. */ - if (!arc_evict_needed()) { + if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); goto out; } @@ -1700,20 +2139,23 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_p > mru_used) ? arc_mfu : arc_mru; + state = (arc_mfu->arcs_lsize[type] > 0 && + arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + state = (arc_mru->arcs_lsize[type] > 0 && + mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); @@ -1728,7 +2170,7 @@ out: atomic_add_64(&hdr->b_state->arcs_size, size); if (list_link_active(&hdr->b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize, size); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous @@ -1773,10 +2215,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { if (refcount_count(&buf->b_refcnt) == 0) { ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mru->arcs_mtx); - list_remove(&arc_mru->arcs_list, buf); - list_insert_head(&arc_mru->arcs_list, buf); - mutex_exit(&arc_mru->arcs_mtx); } else { buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); @@ -1836,10 +2274,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mfu->arcs_mtx); - list_remove(&arc_mfu->arcs_list, buf); - list_insert_head(&arc_mfu->arcs_list, buf); - mutex_exit(&arc_mfu->arcs_mtx); } ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = LBOLT; @@ -1865,6 +2299,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. + */ + + buf->b_arc_access = LBOLT; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); } else { ASSERT(!"invalid arc state"); } @@ -1879,7 +2321,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) VERIFY(arc_buf_remove_ref(buf, arg) == 1); } -/* a generic arc_done_func_t which you can use */ +/* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { @@ -1917,15 +2359,24 @@ arc_read_done(zio_t *zio) &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || - (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + + hdr->b_flags &= ~ARC_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) + hdr->b_flags &= ~ARC_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) - callback_list->acb_byteswap(buf->b_data, hdr->b_size); + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? + byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + func(buf->b_data, hdr->b_size); + } - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); /* create copies of the data buffer for the callers */ abuf = buf; @@ -1952,9 +2403,6 @@ arc_read_done(zio_t *zio) if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_refcnt); - /* convert checksum errors into IO errors */ - if (zio->io_error == ECKSUM) - zio->io_error = EIO; } /* @@ -2020,16 +2468,40 @@ arc_read_done(zio_t *zio) * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. + * + * Normal callers should use arc_read and pass the arc buffer and offset + * for the bp. But if you know you don't need locking, you can use + * arc_read_bp. */ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + int err; + arc_buf_hdr_t *hdr = pbuf->b_hdr; + + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); + ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); + rw_enter(&pbuf->b_lock, RW_READER); + + err = arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb); + + ASSERT3P(hdr, ==, pbuf->b_hdr); + rw_exit(&pbuf->b_lock); + return (err); +} + +int +arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; - zio_t *rzio; + zio_t *rzio; top: hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); @@ -2053,10 +2525,9 @@ top: KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, flags); + spa, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2093,6 +2564,8 @@ top: } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), @@ -2104,6 +2577,8 @@ top: } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; + vdev_t *vd = NULL; + daddr_t addr; if (hdr == NULL) { /* this block is not in the cache */ @@ -2130,6 +2605,8 @@ top: private); hdr->b_flags |= ARC_PREFETCH; } + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_INDIRECT; } else { @@ -2144,7 +2621,9 @@ top: hdr->b_flags |= ARC_PREFETCH; else add_reference(hdr, hash_lock, private); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -2160,7 +2639,6 @@ top: acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; @@ -2176,6 +2654,18 @@ top: if (GHOST_STATE(hdr->b_state)) arc_access(hdr, hash_lock); + + if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && + (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + addr = hdr->b_l2hdr->b_daddr; + /* + * Lock out device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); @@ -2186,8 +2676,65 @@ top: demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); + if (vd != NULL) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. + */ + if (hdr->b_l2hdr != NULL && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, zio_flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + + if (*arc_flags & ARC_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } + rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, flags, zb); + arc_read_done, buf, priority, zio_flags, zb); if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -2254,45 +2801,28 @@ arc_buf_evict(arc_buf_t *buf) kmutex_t *hash_lock; arc_buf_t **bufp; - mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); return (0); - } - hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - - mutex_enter(hash_lock); - - if (buf->b_data == NULL) { + } else if (buf->b_data == NULL) { + arc_buf_t copy = *buf; /* structure assignment */ /* - * We are on the eviction list. + * We are on the eviction list; process this buffer now + * but let arc_do_user_evicts() do the reaping. */ - mutex_exit(hash_lock); - mutex_enter(&arc_eviction_mtx); - if (buf->b_hdr == NULL) { - /* - * We are already in arc_do_user_evicts(). - */ - mutex_exit(&arc_eviction_mtx); - return (0); - } else { - arc_buf_t copy = *buf; /* structure assignment */ - /* - * Process this buffer now - * but let arc_do_user_evicts() do the reaping. - */ - buf->b_efunc = NULL; - mutex_exit(&arc_eviction_mtx); - VERIFY(copy.b_efunc(©) == 0); - return (1); - } + buf->b_efunc = NULL; + rw_exit(&buf->b_lock); + VERIFY(copy.b_efunc(©) == 0); + return (1); } + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); @@ -2323,12 +2853,14 @@ arc_buf_evict(arc_buf_t *buf) arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags = ARC_IN_HASH_TABLE; + hdr->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_BUF_AVAILABLE; mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); + rw_exit(&buf->b_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; @@ -2342,16 +2874,22 @@ arc_buf_evict(arc_buf_t *buf) * Release this buffer from the cache. This must be done * after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make - * make a new hdr for the buffer. + * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + l2arc_buf_hdr_t *l2hdr; + uint64_t buf_size; + + rw_enter(&buf->b_lock, RW_WRITER); + hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); + ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ @@ -2359,22 +2897,32 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(BUF_EMPTY(hdr)); ASSERT(buf->b_efunc == NULL); arc_buf_thaw(buf); + rw_exit(&buf->b_lock); return; } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + l2hdr = hdr->b_l2hdr; + if (l2hdr) { + mutex_enter(&l2arc_buflist_mtx); + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } + /* * Do we have more than one buf? */ - if (hdr->b_buf != buf || buf->b_next != NULL) { + if (hdr->b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; spa_t *spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; + uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_datacnt > 1); + ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this buf and attach it to * a new anonymous buf. @@ -2389,37 +2937,39 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); if (refcount_is_zero(&hdr->b_refcnt)) { - ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); } hdr->b_datacnt -= 1; arc_cksum_verify(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; nhdr->b_type = type; nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; - nhdr->b_flags = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; - mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; + rw_exit(&buf->b_lock); atomic_add_64(&arc_anon->arcs_size, blksz); - - hdr = nhdr; } else { + rw_exit(&buf->b_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_cksum0 = 0; @@ -2427,25 +2977,47 @@ arc_release(arc_buf_t *buf, void *tag) } buf->b_efunc = NULL; buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + mutex_exit(&l2arc_buflist_mtx); + } } int arc_released(arc_buf_t *buf) { - return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + int released; + + rw_enter(&buf->b_lock, RW_READER); + released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + rw_exit(&buf->b_lock); + return (released); } int arc_has_callback(arc_buf_t *buf) { - return (buf->b_efunc != NULL); + int callback; + + rw_enter(&buf->b_lock, RW_READER); + callback = (buf->b_efunc != NULL); + rw_exit(&buf->b_lock); + return (callback); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { - return (refcount_count(&buf->b_hdr->b_refcnt)); + int referenced; + + rw_enter(&buf->b_lock, RW_READER); + referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + rw_exit(&buf->b_lock); + return (referenced); } #endif @@ -2454,12 +3026,27 @@ arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; - if (callback->awcb_ready) { - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. + * It is the responsibility of the callback to handle the + * accounting for any re-write attempt. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); } - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); + hdr->b_flags |= ARC_IO_IN_PROGRESS; } static void @@ -2471,9 +3058,6 @@ arc_write_done(zio_t *zio) hdr->b_acb = NULL; - /* this buffer is on no lists and is not in the hash table */ - ASSERT3P(hdr->b_state, ==, arc_anon); - hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = zio->io_bp->blk_birth; hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; @@ -2496,6 +3080,7 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ + ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), BP_IDENTITY(zio->io_bp))); ASSERT3U(zio->io_bp_orig.blk_birth, ==, @@ -2509,7 +3094,9 @@ arc_write_done(zio_t *zio) ASSERT3P(exists, ==, NULL); } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - arc_access(hdr, hash_lock); + /* if it's not anon, we are doing a scrub */ + if (hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else if (callback->awcb_done == NULL) { int destroy_hdr; @@ -2526,6 +3113,7 @@ arc_write_done(zio_t *zio) } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } + hdr->b_flags &= ~ARC_STORED; if (callback->awcb_done) { ASSERT(!refcount_is_zero(&hdr->b_refcnt)); @@ -2535,31 +3123,74 @@ arc_write_done(zio_t *zio) kmem_free(callback, sizeof (arc_write_callback_t)); } +static void +write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) +{ + boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); + + /* Determine checksum setting */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && + !zio_checksum_table[wp->wp_oschecksum].ci_zbt) + zp->zp_checksum = wp->wp_oschecksum; + else + zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, + wp->wp_oschecksum); + } + + /* Determine compression setting */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + zp->zp_compress = zio_compress_select(wp->wp_dncompress, + wp->wp_oscompress); + } + + zp->zp_type = wp->wp_type; + zp->zp_level = wp->wp_level; + zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); +} + zio_t * -arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, arc_buf_t *buf, +arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) + int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; - zio_t *zio; + zio_t *zio; + zio_prop_t zp; - /* this is a private buffer - no locking required */ - ASSERT3P(hdr->b_state, ==, arc_anon); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(ready != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == 0); + if (l2arc) + hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; - hdr->b_flags |= ARC_IO_IN_PROGRESS; - zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, - buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, - priority, flags, zb); + + write_policy(spa, wp, &zp); + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } @@ -2584,7 +3215,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * nonzero, it should match what we have in the cache. */ ASSERT(bp->blk_cksum.zc_word[0] == 0 || - ab->b_cksum0 == bp->blk_cksum.zc_word[0]); + bp->blk_cksum.zc_word[0] == ab->b_cksum0 || + bp->blk_fill == BLK_FILL_ALREADY_FREED); + if (ab->b_state != arc_anon) arc_change_state(arc_anon, ab, hash_lock); if (HDR_IO_IN_PROGRESS(ab)) { @@ -2604,6 +3237,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ab->b_buf->b_private = NULL; mutex_exit(hash_lock); } else if (refcount_is_zero(&ab->b_refcnt)) { + ab->b_flags |= ARC_FREE_IN_PROGRESS; mutex_exit(hash_lock); arc_hdr_destroy(ab); ARCSTAT_BUMP(arcstat_deleted); @@ -2624,7 +3258,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } } - zio = zio_free(pio, spa, txg, bp, done, private); + zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); if (arc_flags & ARC_WAIT) return (zio_wait(zio)); @@ -2635,16 +3269,75 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, return (0); } +static int +arc_memory_throttle(uint64_t reserve, uint64_t txg) +{ +#ifdef _KERNEL + uint64_t inflight_data = arc_anon->arcs_size; + uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count); + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if 0 +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif +#endif + if (available_memory >= zfs_write_limit_max) + return (0); + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. + */ + if (curproc == pageproc) { + if (page_load > available_memory / 4) + return (ERESTART); + /* Note: reserve is inflated, so we deflate */ + page_load += reserve / 8; + return (0); + } else if (page_load > 0 && arc_reclaim_needed()) { + /* memory is low, delay before restarting */ + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (EAGAIN); + } + page_load = 0; + + if (arc_size > arc_c_min) { + uint64_t evictable_memory = + arc_mru->arcs_lsize[ARC_BUFC_DATA] + + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + available_memory += MIN(evictable_memory, arc_size - arc_c_min); + } + + if (inflight_data > available_memory / 4) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (ERESTART); + } +#endif + return (0); +} + void -arc_tempreserve_clear(uint64_t tempreserve) +arc_tempreserve_clear(uint64_t reserve) { - atomic_add_64(&arc_tempreserve, -tempreserve); + atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int -arc_tempreserve_space(uint64_t tempreserve) +arc_tempreserve_space(uint64_t reserve, uint64_t txg) { + int error; + #ifdef ZFS_DEBUG /* * Once in a while, fail for no reason. Everything should cope. @@ -2654,31 +3347,37 @@ arc_tempreserve_space(uint64_t tempreserve) return (ERESTART); } #endif - if (tempreserve > arc_c/4 && !arc_no_grow) - arc_c = MIN(arc_c_max, tempreserve * 4); - if (tempreserve > arc_c) + if (reserve > arc_c/4 && !arc_no_grow) + arc_c = MIN(arc_c_max, reserve * 4); + if (reserve > arc_c) return (ENOMEM); /* + * Writes will, almost always, require additional memory allocations + * in order to compress/encrypt/etc the data. We therefor need to + * make sure that there is sufficient available memory for this. + */ + if (error = arc_memory_throttle(reserve, txg)) + return (error); + + /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. - * - * XXX The limit should be adjusted dynamically to keep the time - * to sync a dataset fixed (around 1-5 seconds?). */ - - if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { - dprintf("failing, arc_tempreserve=%lluK anon=%lluK " - "tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, arc_anon->arcs_lsize>>10, - tempreserve>>10, arc_c>>10); + if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && + arc_anon->arcs_size > arc_c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, + arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, + arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, + reserve>>10, arc_c>>10); return (ERESTART); } - atomic_add_64(&arc_tempreserve, tempreserve); + atomic_add_64(&arc_tempreserve, reserve); return (0); } @@ -2692,10 +3391,10 @@ arc_lowmem(void *arg __unused, int howto __unused) /* Serialize access via arc_lowmem_lock. */ mutex_enter(&arc_lowmem_lock); - zfs_needfree = 1; + needfree = 1; cv_signal(&arc_reclaim_thr_cv); - while (zfs_needfree) - tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); + while (needfree) + tsleep(&needfree, 0, "zfs:lowmem", hz / 5); mutex_exit(&arc_lowmem_lock); } #endif @@ -2743,6 +3442,16 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -2757,6 +3466,7 @@ arc_init(void) arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; arc_size = 0; mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -2764,15 +3474,28 @@ arc_init(void) mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); buf_init(); @@ -2798,6 +3521,13 @@ arc_init(void) #endif arc_dead = FALSE; + arc_warm = B_FALSE; + + if (zfs_write_limit_max == 0) + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + else + zfs_write_limit_shift = 0; + mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef _KERNEL /* Warn about ZFS memory and address space requirements. */ @@ -2808,9 +3538,9 @@ arc_init(void) if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " + printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); + printf(" in /boot/loader.conf.\n"); } #endif } @@ -2818,6 +3548,7 @@ arc_init(void) void arc_fini(void) { + mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; cv_signal(&arc_reclaim_thr_cv); @@ -2825,7 +3556,7 @@ arc_fini(void) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); mutex_exit(&arc_reclaim_thr_lock); - arc_flush(); + arc_flush(NULL); arc_dead = TRUE; @@ -2838,10 +3569,14 @@ arc_fini(void) mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); - list_destroy(&arc_mru->arcs_list); - list_destroy(&arc_mru_ghost->arcs_list); - list_destroy(&arc_mfu->arcs_list); - list_destroy(&arc_mfu_ghost->arcs_list); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); mutex_destroy(&arc_anon->arcs_mtx); mutex_destroy(&arc_mru->arcs_mtx); @@ -2849,6 +3584,8 @@ arc_fini(void) mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&zfs_write_limit_lock); + buf_fini(); mutex_destroy(&arc_lowmem_lock); @@ -2857,3 +3594,985 @@ arc_fini(void) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. + * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. + */ + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. + */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *ab, *ab_prev; + l2arc_buf_hdr_t *abl2; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. + */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, ab); + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + |