diff options
Diffstat (limited to 'sys/ufs')
| -rw-r--r-- | sys/ufs/dinode.h | 104 | ||||
| -rw-r--r-- | sys/ufs/dir.h | 99 | ||||
| -rw-r--r-- | sys/ufs/fs.h | 441 | ||||
| -rw-r--r-- | sys/ufs/inode.h | 234 | ||||
| -rw-r--r-- | sys/ufs/lockf.h | 72 | ||||
| -rw-r--r-- | sys/ufs/mfs_vfsops.c | 198 | ||||
| -rw-r--r-- | sys/ufs/mfs_vnops.c | 273 | ||||
| -rw-r--r-- | sys/ufs/mfsiom.h | 38 | ||||
| -rw-r--r-- | sys/ufs/mfsnode.h | 197 | ||||
| -rw-r--r-- | sys/ufs/quota.h | 183 | ||||
| -rw-r--r-- | sys/ufs/ufs_alloc.c | 1101 | ||||
| -rw-r--r-- | sys/ufs/ufs_bmap.c | 361 | ||||
| -rw-r--r-- | sys/ufs/ufs_disksubr.c | 593 | ||||
| -rw-r--r-- | sys/ufs/ufs_inode.c | 707 | ||||
| -rw-r--r-- | sys/ufs/ufs_lockf.c | 794 | ||||
| -rw-r--r-- | sys/ufs/ufs_lookup.c | 915 | ||||
| -rw-r--r-- | sys/ufs/ufs_quota.c | 936 | ||||
| -rw-r--r-- | sys/ufs/ufs_subr.c | 211 | ||||
| -rw-r--r-- | sys/ufs/ufs_tables.c | 141 | ||||
| -rw-r--r-- | sys/ufs/ufs_vfsops.c | 758 | ||||
| -rw-r--r-- | sys/ufs/ufs_vnops.c | 1808 | ||||
| -rw-r--r-- | sys/ufs/ufsmount.h | 78 |
22 files changed, 10242 insertions, 0 deletions
diff --git a/sys/ufs/dinode.h b/sys/ufs/dinode.h new file mode 100644 index 000000000000..51ea7c7e813a --- /dev/null +++ b/sys/ufs/dinode.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 1982, 1989 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)dinode.h 7.10 (Berkeley) 5/8/91 + * $Id: dinode.h,v 1.3 1993/10/16 18:17:35 rgrimes Exp $ + */ + +/* + * A dinode contains all the meta-data associated with a UFS file. + * This structure defines the on-disk format of a dinode. + */ + +#define NDADDR 12 /* direct addresses in inode */ +#define NIADDR 3 /* indirect addresses in inode */ + +#define MAXFASTLINK (((NDADDR+NIADDR) * sizeof(daddr_t)) - 1) + +struct dinode { + u_short di_mode; /* 0: mode and type of file */ + short di_nlink; /* 2: number of links to file */ + uid_t di_uid; /* 4: owner's user id */ + gid_t di_gid; /* 6: owner's group id */ + u_quad di_qsize; /* 8: number of bytes in file */ + time_t di_atime; /* 16: time last accessed */ + long di_atspare; + time_t di_mtime; /* 24: time last modified */ + long di_mtspare; + time_t di_ctime; /* 32: last time inode changed */ + long di_ctspare; + union { + struct { + daddr_t di_udb[NDADDR]; /* 40: disk block addresses */ + daddr_t di_uib[NIADDR]; /* 88: indirect blocks */ + } di_addr; + char di_usymlink[MAXFASTLINK+1]; + } di_un; + long di_flags; /* 100: status, currently unused */ + long di_blocks; /* 104: blocks actually held */ + long di_gen; /* 108: generation number */ +#define DI_SPARE_SZ 4 /* 112: spare for 4 longs */ + u_long di_spare[DI_SPARE_SZ]; /* reserved (unused) */ +}; + +#define di_db di_un.di_addr.di_udb +#define di_ib di_un.di_addr.di_uib +#define di_symlink di_un.di_usymlink + +#if BYTE_ORDER == LITTLE_ENDIAN || defined(tahoe) /* ugh! 
-- must be fixed */ +#define di_size di_qsize.val[0] +#else /* BYTE_ORDER == BIG_ENDIAN */ +#define di_size di_qsize.val[1] +#endif +#define di_rdev di_db[0] + +/* file modes */ +#define IFMT 0170000 /* mask of file type */ +#define IFIFO 0010000 /* named pipe (fifo) */ +#define IFCHR 0020000 /* character special device */ +#define IFDIR 0040000 /* directory */ +#define IFBLK 0060000 /* block special device */ +#define IFREG 0100000 /* regular file */ +#define IFLNK 0120000 /* symbolic link */ +#define IFSOCK 0140000 /* UNIX domain socket */ + +#define ISUID 04000 /* set user identifier when exec'ing */ +#define ISGID 02000 /* set group identifier when exec'ing */ +#define ISVTX 01000 /* save execution information on exit */ +#define IREAD 0400 /* read permission */ +#define IWRITE 0200 /* write permission */ +#define IEXEC 0100 /* execute permission */ + +#define DFASTLINK(di) \ + ((((di).di_mode & IFMT) == IFLNK) && \ + ((di).di_size <= MAXFASTLINK) && \ + ((di).di_size == (di).di_spare[0])) diff --git a/sys/ufs/dir.h b/sys/ufs/dir.h new file mode 100644 index 000000000000..f98d206be7ca --- /dev/null +++ b/sys/ufs/dir.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 1982, 1986, 1989 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)dir.h 7.10 (Berkeley) 3/25/91 + * $Id: dir.h,v 1.2 1993/10/16 18:17:37 rgrimes Exp $ + */ + +#ifndef _DIR_H_ +#define _DIR_H_ + +/* + * A directory consists of some number of blocks of DIRBLKSIZ + * bytes, where DIRBLKSIZ is chosen such that it can be transferred + * to disk in a single atomic operation (e.g. 512 bytes on most machines). + * + * Each DIRBLKSIZ byte block contains some number of directory entry + * structures, which are of variable length. Each directory entry has + * a struct direct at the front of it, containing its inode number, + * the length of the entry, and the length of the name contained in + * the entry. These are followed by the name padded to a 4 byte boundary + * with null bytes. All names are guaranteed null terminated. 
+ * The maximum length of a name in a directory is MAXNAMLEN. + * + * The macro DIRSIZ(dp) gives the amount of space required to represent + * a directory entry. Free space in a directory is represented by + * entries which have dp->d_reclen > DIRSIZ(dp). All DIRBLKSIZ bytes + * in a directory block are claimed by the directory entries. This + * usually results in the last entry in a directory having a large + * dp->d_reclen. When entries are deleted from a directory, the + * space is returned to the previous entry in the same directory + * block by increasing its dp->d_reclen. If the first entry of + * a directory block is free, then its dp->d_ino is set to 0. + * Entries other than the first in a directory do not normally have + * dp->d_ino set to 0. + */ +#define DIRBLKSIZ DEV_BSIZE +#define MAXNAMLEN 255 + +struct direct { + u_long d_ino; /* inode number of entry */ + u_short d_reclen; /* length of this record */ + u_short d_namlen; /* length of string in d_name */ + char d_name[MAXNAMLEN + 1]; /* name with length <= MAXNAMLEN */ +}; + +/* + * The DIRSIZ macro gives the minimum record length which will hold + * the directory entry. This requires the amount of space in struct direct + * without the d_name field, plus enough space for the name with a terminating + * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary. + */ +#define DIRSIZ(dp) \ + ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3)) + +/* + * Template for manipulating directories. + * Should use struct direct's, but the name field + * is MAXNAMLEN - 1, and this just won't do. 
+ */ +struct dirtemplate { + u_long dot_ino; + short dot_reclen; + short dot_namlen; + char dot_name[4]; /* must be multiple of 4 */ + u_long dotdot_ino; + short dotdot_reclen; + short dotdot_namlen; + char dotdot_name[4]; /* ditto */ +}; +#endif /* !_DIR_H_ */ diff --git a/sys/ufs/fs.h b/sys/ufs/fs.h new file mode 100644 index 000000000000..477699cb4a00 --- /dev/null +++ b/sys/ufs/fs.h @@ -0,0 +1,441 @@ +/* + * Copyright (c) 1982, 1986 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)fs.h 7.12 (Berkeley) 5/8/91 + * $Id: fs.h,v 1.2 1993/10/16 18:17:38 rgrimes Exp $ + */ + +/* + * Each disk drive contains some number of file systems. + * A file system consists of a number of cylinder groups. + * Each cylinder group has inodes and data. + * + * A file system is described by its super-block, which in turn + * describes the cylinder groups. The super-block is critical + * data and is replicated in each cylinder group to protect against + * catastrophic loss. This is done at `newfs' time and the critical + * super-block data does not change, so the copies need not be + * referenced further unless disaster strikes. + * + * For file system fs, the offsets of the various blocks of interest + * are given in the super block as: + * [fs->fs_sblkno] Super-block + * [fs->fs_cblkno] Cylinder group block + * [fs->fs_iblkno] Inode blocks + * [fs->fs_dblkno] Data blocks + * The beginning of cylinder group cg in fs, is given by + * the ``cgbase(fs, cg)'' macro. + * + * The first boot and super blocks are given in absolute disk addresses. + * The byte-offset forms are preferred, as they don't imply a sector size. + */ +#define BBSIZE 8192 +#define SBSIZE 8192 +#define BBOFF ((off_t)(0)) +#define SBOFF ((off_t)(BBOFF + BBSIZE)) +#define BBLOCK ((daddr_t)(0)) +#define SBLOCK ((daddr_t)(BBLOCK + BBSIZE / DEV_BSIZE)) + +/* + * Addresses stored in inodes are capable of addressing fragments + * of `blocks'. 
File system blocks of at most size MAXBSIZE can + * be optionally broken into 2, 4, or 8 pieces, each of which is + * addressible; these pieces may be DEV_BSIZE, or some multiple of + * a DEV_BSIZE unit. + * + * Large files consist of exclusively large data blocks. To avoid + * undue wasted disk space, the last data block of a small file may be + * allocated as only as many fragments of a large block as are + * necessary. The file system format retains only a single pointer + * to such a fragment, which is a piece of a single large block that + * has been divided. The size of such a fragment is determinable from + * information in the inode, using the ``blksize(fs, ip, lbn)'' macro. + * + * The file system records space availability at the fragment level; + * to determine block availability, aligned fragments are examined. + * + * The root inode is the root of the file system. + * Inode 0 can't be used for normal purposes and + * historically bad blocks were linked to inode 1, + * thus the root inode is 2. (inode 1 is no longer used for + * this purpose, however numerous dump tapes make this + * assumption, so we are stuck with it) + */ +#define ROOTINO ((ino_t)2) + +/* + * MINBSIZE is the smallest allowable block size. + * In order to insure that it is possible to create files of size + * 2^32 with only two levels of indirection, MINBSIZE is set to 4096. + * MINBSIZE must be big enough to hold a cylinder group block, + * thus changes to (struct cg) must keep its size within MINBSIZE. + * Note that super blocks are always of size SBSIZE, + * and that both SBSIZE and MAXBSIZE must be >= MINBSIZE. + */ +#define MINBSIZE 4096 + +/* + * The path name on which the file system is mounted is maintained + * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in + * the super block for this name. + * The limit on the amount of summary information per file system + * is defined by MAXCSBUFS. It is currently parameterized for a + * maximum of two million cylinders. 
+ */ +#define MAXMNTLEN 512 +#define MAXCSBUFS 32 + +/* + * Per cylinder group information; summarized in blocks allocated + * from first cylinder group data blocks. These blocks have to be + * read in from fs_csaddr (size fs_cssize) in addition to the + * super block. + * + * N.B. sizeof(struct csum) must be a power of two in order for + * the ``fs_cs'' macro to work (see below). + */ +struct csum { + long cs_ndir; /* number of directories */ + long cs_nbfree; /* number of free blocks */ + long cs_nifree; /* number of free inodes */ + long cs_nffree; /* number of free frags */ +}; + +/* + * Super block for a file system. + */ +#define FS_MAGIC 0x011954 +#define FSOKAY 0x7c269d38 +struct fs +{ + struct fs *fs_link; /* linked list of file systems */ + struct fs *fs_rlink; /* used for incore super blocks */ + daddr_t fs_sblkno; /* addr of super-block in filesys */ + daddr_t fs_cblkno; /* offset of cyl-block in filesys */ + daddr_t fs_iblkno; /* offset of inode-blocks in filesys */ + daddr_t fs_dblkno; /* offset of first data after cg */ + long fs_cgoffset; /* cylinder group offset in cylinder */ + long fs_cgmask; /* used to calc mod fs_ntrak */ + time_t fs_time; /* last time written */ + long fs_size; /* number of blocks in fs */ + long fs_dsize; /* number of data blocks in fs */ + long fs_ncg; /* number of cylinder groups */ + long fs_bsize; /* size of basic blocks in fs */ + long fs_fsize; /* size of frag blocks in fs */ + long fs_frag; /* number of frags in a block in fs */ +/* these are configuration parameters */ + long fs_minfree; /* minimum percentage of free blocks */ + long fs_rotdelay; /* num of ms for optimal next block */ + long fs_rps; /* disk revolutions per second */ +/* these fields can be computed from the others */ + long fs_bmask; /* ``blkoff'' calc of blk offsets */ + long fs_fmask; /* ``fragoff'' calc of frag offsets */ + long fs_bshift; /* ``lblkno'' calc of logical blkno */ + long fs_fshift; /* ``numfrags'' calc number of frags */ +/* these are 
configuration parameters */ + long fs_maxcontig; /* max number of contiguous blks */ + long fs_maxbpg; /* max number of blks per cyl group */ +/* these fields can be computed from the others */ + long fs_fragshift; /* block to frag shift */ + long fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + long fs_sbsize; /* actual size of super block */ + long fs_csmask; /* csum block offset */ + long fs_csshift; /* csum block number */ + long fs_nindir; /* value of NINDIR */ + long fs_inopb; /* value of INOPB */ + long fs_nspf; /* value of NSPF */ +/* yet another configuration parameter */ + long fs_optim; /* optimization preference, see below */ +/* these fields are derived from the hardware */ + long fs_npsect; /* # sectors/track including spares */ + long fs_interleave; /* hardware sector interleave */ + long fs_trackskew; /* sector 0 skew, per track */ + long fs_headswitch; /* head switch time, usec */ + long fs_trkseek; /* track-to-track seek, usec */ +/* sizes determined by number of cylinder groups and their sizes */ + daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ + long fs_cssize; /* size of cyl grp summary area */ + long fs_cgsize; /* cylinder group size */ +/* these fields are derived from the hardware */ + long fs_ntrak; /* tracks per cylinder */ + long fs_nsect; /* sectors per track */ + long fs_spc; /* sectors per cylinder */ +/* this comes from the disk driver partitioning */ + long fs_ncyl; /* cylinders in file system */ +/* these fields can be computed from the others */ + long fs_cpg; /* cylinders per group */ + long fs_ipg; /* inodes per group */ + long fs_fpg; /* blocks per group * fs_frag */ +/* this data must be re-computed after crashes */ + struct csum fs_cstotal; /* cylinder summary information */ +/* these fields are cleared at mount time */ + char fs_fmod; /* super block modified flag */ + char fs_clean; /* file system is clean flag */ + char fs_ronly; /* mounted read-only flag */ + char fs_flags; /* currently unused flag */ + 
char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ +/* these fields retain the current block allocation info */ + long fs_cgrotor; /* last cg searched */ + struct csum *fs_csp[MAXCSBUFS];/* list of fs_cs info buffers */ + long fs_cpc; /* cyl per cycle in postbl */ + short fs_opostbl[16][8]; /* old rotation block list head */ + long fs_sparecon[55]; /* reserved for future constants */ + long fs_state; /* validate fs_clean field */ + quad fs_qbmask; /* ~fs_bmask - for use with quad size */ + quad fs_qfmask; /* ~fs_fmask - for use with quad size */ + long fs_postblformat; /* format of positional layout tables */ + long fs_nrpos; /* number of rotaional positions */ + long fs_postbloff; /* (short) rotation block list head */ + long fs_rotbloff; /* (u_char) blocks for each rotation */ + long fs_magic; /* magic number */ + u_char fs_space[1]; /* list of blocks for each rotation */ +/* actually longer */ +}; +/* + * Preference for optimization. + */ +#define FS_OPTTIME 0 /* minimize allocation time */ +#define FS_OPTSPACE 1 /* minimize disk fragmentation */ + +/* + * Rotational layout table format types + */ +#define FS_42POSTBLFMT -1 /* 4.2BSD rotational table format */ +#define FS_DYNAMICPOSTBLFMT 1 /* dynamic rotational table format */ +/* + * Macros for access to superblock array structures + */ +#define fs_postbl(fs, cylno) \ + (((fs)->fs_postblformat == FS_42POSTBLFMT) \ + ? ((fs)->fs_opostbl[cylno]) \ + : ((short *)((char *)(fs) + (fs)->fs_postbloff) + (cylno) * (fs)->fs_nrpos)) +#define fs_rotbl(fs) \ + (((fs)->fs_postblformat == FS_42POSTBLFMT) \ + ? ((fs)->fs_space) \ + : ((u_char *)((char *)(fs) + (fs)->fs_rotbloff))) + +/* + * Convert cylinder group to base address of its global summary info. + * + * N.B. This macro assumes that sizeof(struct csum) is a power of two. + */ +#define fs_cs(fs, indx) \ + fs_csp[(indx) >> (fs)->fs_csshift][(indx) & ~(fs)->fs_csmask] + +/* + * Cylinder group block for a file system. 
+ */ +#define CG_MAGIC 0x090255 +struct cg { + struct cg *cg_link; /* linked list of cyl groups */ + long cg_magic; /* magic number */ + time_t cg_time; /* time last written */ + long cg_cgx; /* we are the cgx'th cylinder group */ + short cg_ncyl; /* number of cyl's this cg */ + short cg_niblk; /* number of inode blocks this cg */ + long cg_ndblk; /* number of data blocks this cg */ + struct csum cg_cs; /* cylinder summary information */ + long cg_rotor; /* position of last used block */ + long cg_frotor; /* position of last used frag */ + long cg_irotor; /* position of last used inode */ + long cg_frsum[MAXFRAG]; /* counts of available frags */ + long cg_btotoff; /* (long) block totals per cylinder */ + long cg_boff; /* (short) free block positions */ + long cg_iusedoff; /* (char) used inode map */ + long cg_freeoff; /* (u_char) free block map */ + long cg_nextfreeoff; /* (u_char) next available space */ + long cg_sparecon[16]; /* reserved for future use */ + u_char cg_space[1]; /* space for cylinder group maps */ +/* actually longer */ +}; +/* + * Macros for access to cylinder group array structures + */ +#define cg_blktot(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_btot) \ + : ((long *)((char *)(cgp) + (cgp)->cg_btotoff))) +#define cg_blks(fs, cgp, cylno) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_b[cylno]) \ + : ((short *)((char *)(cgp) + (cgp)->cg_boff) + (cylno) * (fs)->fs_nrpos)) +#define cg_inosused(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_iused) \ + : ((char *)((char *)(cgp) + (cgp)->cg_iusedoff))) +#define cg_blksfree(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_free) \ + : ((u_char *)((char *)(cgp) + (cgp)->cg_freeoff))) +#define cg_chkmagic(cgp) \ + ((cgp)->cg_magic == CG_MAGIC || ((struct ocg *)(cgp))->cg_magic == CG_MAGIC) + +/* + * The following structure is defined + * for compatibility with old file systems. 
+ */ +struct ocg { + struct ocg *cg_link; /* linked list of cyl groups */ + struct ocg *cg_rlink; /* used for incore cyl groups */ + time_t cg_time; /* time last written */ + long cg_cgx; /* we are the cgx'th cylinder group */ + short cg_ncyl; /* number of cyl's this cg */ + short cg_niblk; /* number of inode blocks this cg */ + long cg_ndblk; /* number of data blocks this cg */ + struct csum cg_cs; /* cylinder summary information */ + long cg_rotor; /* position of last used block */ + long cg_frotor; /* position of last used frag */ + long cg_irotor; /* position of last used inode */ + long cg_frsum[8]; /* counts of available frags */ + long cg_btot[32]; /* block totals per cylinder */ + short cg_b[32][8]; /* positions of free blocks */ + char cg_iused[256]; /* used inode map */ + long cg_magic; /* magic number */ + u_char cg_free[1]; /* free block map */ +/* actually longer */ +}; + +/* + * Turn file system block numbers into disk block addresses. + * This maps file system blocks to device size blocks. + */ +#define fsbtodb(fs, b) ((b) << (fs)->fs_fsbtodb) +#define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb) + +/* + * Cylinder group macros to locate things in cylinder groups. + * They calc file system addresses of cylinder group data structures. + */ +#define cgbase(fs, c) ((daddr_t)((fs)->fs_fpg * (c))) +#define cgstart(fs, c) \ + (cgbase(fs, c) + (fs)->fs_cgoffset * ((c) & ~((fs)->fs_cgmask))) +#define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */ +#define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */ +#define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */ +#define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */ + +/* + * Macros for handling inode numbers: + * inode number to file system block offset. + * inode number to cylinder group number. + * inode number to file system block address. 
+ */ +#define itoo(fs, x) ((x) % INOPB(fs)) +#define itog(fs, x) ((x) / (fs)->fs_ipg) +#define itod(fs, x) \ + ((daddr_t)(cgimin(fs, itog(fs, x)) + \ + (blkstofrags((fs), (((x) % (fs)->fs_ipg) / INOPB(fs)))))) + +/* + * Give cylinder group number for a file system block. + * Give cylinder group block number for a file system block. + */ +#define dtog(fs, d) ((d) / (fs)->fs_fpg) +#define dtogd(fs, d) ((d) % (fs)->fs_fpg) + +/* + * Extract the bits for a block from a map. + * Compute the cylinder and rotational position of a cyl block addr. + */ +#define blkmap(fs, map, loc) \ + (((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag))) +#define cbtocylno(fs, bno) \ + ((bno) * NSPF(fs) / (fs)->fs_spc) +#define cbtorpos(fs, bno) \ + (((bno) * NSPF(fs) % (fs)->fs_spc / (fs)->fs_nsect * (fs)->fs_trackskew + \ + (bno) * NSPF(fs) % (fs)->fs_spc % (fs)->fs_nsect * (fs)->fs_interleave) % \ + (fs)->fs_nsect * (fs)->fs_nrpos / (fs)->fs_npsect) + +/* + * The following macros optimize certain frequently calculated + * quantities by using shifts and masks in place of divisions + * modulos and multiplications. 
+ */ +#define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ + ((loc) & ~(fs)->fs_bmask) +#define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \ + ((loc) & ~(fs)->fs_fmask) +#define lblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ + ((blk) << (fs)->fs_bshift) +#define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ + ((loc) >> (fs)->fs_bshift) +#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ + ((loc) >> (fs)->fs_fshift) +#define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \ + (((size) + (fs)->fs_bsize - 1) & (fs)->fs_bmask) +#define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ + (((size) + (fs)->fs_fsize - 1) & (fs)->fs_fmask) +#define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \ + ((frags) >> (fs)->fs_fragshift) +#define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \ + ((blks) << (fs)->fs_fragshift) +#define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \ + ((fsb) & ((fs)->fs_frag - 1)) +#define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \ + ((fsb) &~ ((fs)->fs_frag - 1)) + +/* + * Determine the number of available frags given a + * percentage to hold in reserve + */ +#define freespace(fs, percentreserved) \ + (blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \ + (fs)->fs_cstotal.cs_nffree - ((fs)->fs_dsize * (percentreserved) / 100)) + +/* + * Determining the size of a file block in the file system. + */ +#define blksize(fs, ip, lbn) \ + (((lbn) >= NDADDR || (ip)->i_size >= ((lbn) + 1) << (fs)->fs_bshift) \ + ? (fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (ip)->i_size)))) +#define dblksize(fs, dip, lbn) \ + (((lbn) >= NDADDR || (dip)->di_size >= ((lbn) + 1) << (fs)->fs_bshift) \ + ? (fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (dip)->di_size)))) + +/* + * Number of disk sectors per block; assumes DEV_BSIZE byte sector size. 
+ */ +#define NSPB(fs) ((fs)->fs_nspf << (fs)->fs_fragshift) +#define NSPF(fs) ((fs)->fs_nspf) + +/* + * INOPB is the number of inodes in a secondary storage block. + */ +#define INOPB(fs) ((fs)->fs_inopb) +#define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift) + +/* + * NINDIR is the number of indirects in a file system block. + */ +#define NINDIR(fs) ((fs)->fs_nindir) diff --git a/sys/ufs/inode.h b/sys/ufs/inode.h new file mode 100644 index 000000000000..2f6da627e075 --- /dev/null +++ b/sys/ufs/inode.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 1982, 1989 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)inode.h 7.17 (Berkeley) 5/8/91 + * $Id: inode.h,v 1.3 1993/10/16 18:17:40 rgrimes Exp $ + */ + +#ifdef KERNEL +#include "../ufs/dinode.h" +#else +#include <ufs/dinode.h> +#endif + +/* + * The inode is used to describe each active (or recently active) + * file in the UFS filesystem. It is composed of two types of + * information. The first part is the information that is needed + * only while the file is active (such as the identity of the file + * and linkage to speed its lookup). The second part is the + * permanent meta-data associated with the file which is read + * in from the permanent dinode from long term storage when the + * file becomes active, and is put back when the file is no longer + * being used. 
+ */ +struct inode { + struct inode *i_chain[2]; /* hash chain, MUST be first */ + struct vnode *i_vnode; /* vnode associated with this inode */ + struct vnode *i_devvp; /* vnode for block I/O */ + u_long i_flag; /* see below */ + dev_t i_dev; /* device where inode resides */ + ino_t i_number; /* the identity of the inode */ + struct fs *i_fs; /* filesystem associated with this inode */ + struct dquot *i_dquot[MAXQUOTAS]; /* pointer to dquot structures */ + struct lockf *i_lockf; /* head of byte-level lock list */ + long i_diroff; /* offset in dir, where we found last entry */ + off_t i_endoff; /* end of useful stuff in directory */ + long i_spare0; + long i_spare1; + struct dinode i_din; /* the on-disk dinode */ +}; + +#define FASTLINK(ip) (DFASTLINK((ip)->i_din)) +#define i_symlink i_din.di_symlink +#define i_mode i_din.di_mode +#define i_nlink i_din.di_nlink +#define i_uid i_din.di_uid +#define i_gid i_din.di_gid +#if BYTE_ORDER == LITTLE_ENDIAN || defined(tahoe) /* ugh! -- must be fixed */ +#define i_size i_din.di_qsize.val[0] +#else /* BYTE_ORDER == BIG_ENDIAN */ +#define i_size i_din.di_qsize.val[1] +#endif +#define i_db i_din.di_db +#define i_ib i_din.di_ib +#define i_atime i_din.di_atime +#define i_mtime i_din.di_mtime +#define i_ctime i_din.di_ctime +#define i_blocks i_din.di_blocks +#define i_rdev i_din.di_db[0] +#define i_flags i_din.di_flags +#define i_gen i_din.di_gen +#define i_forw i_chain[0] +#define i_back i_chain[1] +#define i_di_spare i_din.di_spare + +/* flags */ +#define ILOCKED 0x0001 /* inode is locked */ +#define IWANT 0x0002 /* some process waiting on lock */ +#define IRENAME 0x0004 /* inode is being renamed */ +#define IUPD 0x0010 /* file has been modified */ +#define IACC 0x0020 /* inode access time to be updated */ +#define ICHG 0x0040 /* inode has been changed */ +#define IMOD 0x0080 /* inode has been modified */ +#define ISHLOCK 0x0100 /* file has shared lock */ +#define IEXLOCK 0x0200 /* file has exclusive lock */ +#define ILWAIT 
0x0400 /* someone waiting on file lock */ + +#ifdef KERNEL +/* + * Convert between inode pointers and vnode pointers + */ +#define VTOI(vp) ((struct inode *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) + +/* + * Convert between vnode types and inode formats + */ +extern enum vtype iftovt_tab[]; +extern int vttoif_tab[]; +#define IFTOVT(mode) (iftovt_tab[((mode) & IFMT) >> 12]) +#define VTTOIF(indx) (vttoif_tab[(int)(indx)]) + +#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) + +u_long nextgennumber; /* next generation number to assign */ + +extern ino_t dirpref(); + +/* + * Lock and unlock inodes. + */ +#ifdef notdef +#define ILOCK(ip) { \ + while ((ip)->i_flag & ILOCKED) { \ + (ip)->i_flag |= IWANT; \ + (void) sleep((caddr_t)(ip), PINOD); \ + } \ + (ip)->i_flag |= ILOCKED; \ +} + +#define IUNLOCK(ip) { \ + (ip)->i_flag &= ~ILOCKED; \ + if ((ip)->i_flag&IWANT) { \ + (ip)->i_flag &= ~IWANT; \ + wakeup((caddr_t)(ip)); \ + } \ +} +#else +#define ILOCK(ip) ilock(ip) +#define IUNLOCK(ip) iunlock(ip) +#endif + +#define IUPDAT(ip, t1, t2, waitfor) { \ + if (ip->i_flag&(IUPD|IACC|ICHG|IMOD)) \ + (void) iupdat(ip, t1, t2, waitfor); \ +} + +#define ITIMES(ip, t1, t2) { \ + if ((ip)->i_flag&(IUPD|IACC|ICHG)) { \ + (ip)->i_flag |= IMOD; \ + if ((ip)->i_flag&IACC) \ + (ip)->i_atime = (t1)->tv_sec; \ + if ((ip)->i_flag&IUPD) \ + (ip)->i_mtime = (t2)->tv_sec; \ + if ((ip)->i_flag&ICHG) \ + (ip)->i_ctime = time.tv_sec; \ + (ip)->i_flag &= ~(IACC|IUPD|ICHG); \ + } \ +} + +/* + * This overlays the fid sturcture (see mount.h) + */ +struct ufid { + u_short ufid_len; /* length of structure */ + u_short ufid_pad; /* force long alignment */ + ino_t ufid_ino; /* file number (ino) */ + long ufid_gen; /* generation number */ +}; + +/* + * Prototypes for UFS vnode operations + */ +int ufs_lookup __P((struct vnode *vp, struct nameidata *ndp, struct proc *p)); +int ufs_create __P((struct nameidata *ndp, struct vattr *vap, struct proc *p)); +int ufs_mknod __P((struct nameidata 
*ndp, struct vattr *vap, struct ucred *cred, + struct proc *p)); +int ufs_open __P((struct vnode *vp, int mode, struct ucred *cred, + struct proc *p)); +int ufs_close __P((struct vnode *vp, int fflag, struct ucred *cred, + struct proc *p)); +int ufs_access __P((struct vnode *vp, int mode, struct ucred *cred, + struct proc *p)); +int ufs_getattr __P((struct vnode *vp, struct vattr *vap, struct ucred *cred, + struct proc *p)); +int ufs_setattr __P((struct vnode *vp, struct vattr *vap, struct ucred *cred, + struct proc *p)); +int ufs_read __P((struct vnode *vp, struct uio *uio, int ioflag, + struct ucred *cred)); +int ufs_write __P((struct vnode *vp, struct uio *uio, int ioflag, + struct ucred *cred)); +int ufs_ioctl __P((struct vnode *vp, int command, caddr_t data, int fflag, + struct ucred *cred, struct proc *p)); +int ufs_select __P((struct vnode *vp, int which, int fflags, struct ucred *cred, + struct proc *p)); +int ufs_mmap __P((struct vnode *vp, int fflags, struct ucred *cred, + struct proc *p)); +int ufs_fsync __P((struct vnode *vp, int fflags, struct ucred *cred, + int waitfor, struct proc *p)); +int ufs_seek __P((struct vnode *vp, off_t oldoff, off_t newoff, + struct ucred *cred)); +int ufs_remove __P((struct nameidata *ndp, struct proc *p)); +int ufs_link __P((struct vnode *vp, struct nameidata *ndp, struct proc *p)); +int ufs_rename __P((struct nameidata *fndp, struct nameidata *tdnp, + struct proc *p)); +int ufs_mkdir __P((struct nameidata *ndp, struct vattr *vap, struct proc *p)); +int ufs_rmdir __P((struct nameidata *ndp, struct proc *p)); +int ufs_symlink __P((struct nameidata *ndp, struct vattr *vap, char *target, + struct proc *p)); +int ufs_readdir __P((struct vnode *vp, struct uio *uio, struct ucred *cred, + int *eofflagp)); +int ufs_readlink __P((struct vnode *vp, struct uio *uio, struct ucred *cred)); +int ufs_abortop __P((struct nameidata *ndp)); +int ufs_inactive __P((struct vnode *vp, struct proc *p)); +int ufs_reclaim __P((struct vnode *vp)); 
+int ufs_lock __P((struct vnode *vp)); +int ufs_unlock __P((struct vnode *vp)); +int ufs_bmap __P((struct vnode *vp, daddr_t bn, struct vnode **vpp, + daddr_t *bnp)); +int ufs_strategy __P((struct buf *bp)); +int ufs_print __P((struct vnode *vp)); +int ufs_islocked __P((struct vnode *vp)); +int ufs_advlock __P((struct vnode *vp, caddr_t id, int op, struct flock *fl, + int flags)); +#endif /* KERNEL */ diff --git a/sys/ufs/lockf.h b/sys/ufs/lockf.h new file mode 100644 index 000000000000..dc8b99d1df26 --- /dev/null +++ b/sys/ufs/lockf.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)lockf.h 7.1 (Berkeley) 2/1/91 + * $Id: lockf.h,v 1.3 1993/10/20 07:31:36 davidg Exp $ + */ + +/* + * The lockf structure is a kernel structure which contains all the + * information associated with a byte range lock. The lockf structures + * are linked into the inode structure. Locks are sorted by the starting + * byte of the lock for efficiency. + */ +struct lockf { + short lf_flags; /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */ + short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ + off_t lf_start; /* The byte # of the start of the lock */ + off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/ + caddr_t lf_id; /* The id of the resource holding the lock */ + struct lockf **lf_head; /* Back pointer to the head of lockf list */ + struct lockf *lf_next; /* A pointer to the next lock on this inode */ + struct lockf *lf_block; /* The list of blocked locks */ +}; + +/* + * Maximum length of sleep chains to traverse to try and detect deadlock. 
+ */ +#define MAXDEPTH 50 + +#ifdef KERNEL +/* + * Public lock manipulation routines + */ +extern struct lockf *lf_remove(); /* Remove a lock */ +extern struct lockf *lf_getblock(); /* Return the first blocking lock */ + +#ifdef LOCKF_DEBUG +extern int lockf_debug; +#endif LOCKF_DEBUG +#endif KERNEL diff --git a/sys/ufs/mfs_vfsops.c b/sys/ufs/mfs_vfsops.c new file mode 100644 index 000000000000..5a28439a55c0 --- /dev/null +++ b/sys/ufs/mfs_vfsops.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 1989, 1990 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)mfs_vfsops.c 7.19 (Berkeley) 4/16/91 + * $Id: mfs_vfsops.c,v 1.3 1993/10/16 18:17:42 rgrimes Exp $ + */ + +#include "param.h" +#include "time.h" +#include "kernel.h" +#include "proc.h" +#include "buf.h" +#include "mount.h" +#include "signalvar.h" +#include "vnode.h" + +#include "quota.h" +#include "inode.h" +#include "ufsmount.h" +#include "mfsnode.h" +#include "fs.h" + +extern struct vnodeops mfs_vnodeops; + +/* + * mfs vfs operations. + */ +int mfs_mount(); +int mfs_start(); +int ufs_unmount(); +int ufs_root(); +int ufs_quotactl(); +int mfs_statfs(); +int ufs_sync(); +int ufs_fhtovp(); +int ufs_vptofh(); +int mfs_init(); + +struct vfsops mfs_vfsops = { + mfs_mount, + mfs_start, + ufs_unmount, + ufs_root, + ufs_quotactl, + mfs_statfs, + ufs_sync, + ufs_fhtovp, + ufs_vptofh, + mfs_init, +}; + +/* + * VFS Operations. 
+ * + * mount system call + */ +/* ARGSUSED */ +mfs_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct mfs_args args; + struct ufsmount *ump; + register struct fs *fs; + register struct mfsnode *mfsp; + static int mfs_minor; + u_int size; + int error; + + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly && (mp->mnt_flag & MNT_RDONLY) == 0) + fs->fs_ronly = 0; + return (0); + } + if (error = copyin(data, (caddr_t)&args, sizeof (struct mfs_args))) + return (error); + error = getnewvnode(VT_MFS, (struct mount *)0, &mfs_vnodeops, &devvp); + if (error) + return (error); + devvp->v_type = VBLK; + if (checkalias(devvp, makedev(255, mfs_minor++), (struct mount *)0)) + panic("mfs_mount: dup dev"); + mfsp = VTOMFS(devvp); + mfsp->mfs_baseoff = args.base; + mfsp->mfs_size = args.size; + mfsp->mfs_vnode = devvp; + mfsp->mfs_pid = p->p_pid; + mfsp->mfs_buflist = (struct buf *)0; + if (error = mountfs(devvp, mp)) { + mfsp->mfs_buflist = (struct buf *)-1; + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); + bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + mp->mnt_stat.f_mntonname[MNAMELEN-1] = '\0'; + (void) copyinstr(args.name, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) mfs_statfs(mp, &mp->mnt_stat); + return (0); +} + +int mfs_pri = PWAIT | PCATCH; /* XXX prob. temp */ + +/* + * Used to grab the process and keep it in the kernel to service + * memory filesystem I/O requests. + * + * Loop servicing I/O requests. + * Copy the requested data into or out of the memory filesystem + * address space. 
+ */ +/* ARGSUSED */ +mfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + register struct vnode *vp = VFSTOUFS(mp)->um_devvp; + register struct mfsnode *mfsp = VTOMFS(vp); + register struct buf *bp; + register caddr_t base; + int error = 0; + + base = mfsp->mfs_baseoff; + while (mfsp->mfs_buflist != (struct buf *)(-1)) { + while (bp = mfsp->mfs_buflist) { + mfsp->mfs_buflist = bp->av_forw; + mfs_doio(bp, base); + wakeup((caddr_t)bp); + } + /* + * If a non-ignored signal is received, try to unmount. + * If that fails, clear the signal (it has been "processed"), + * otherwise we will loop here, as tsleep will always return + * EINTR/ERESTART. + */ + if (error = tsleep((caddr_t)vp, mfs_pri, "mfsidl", 0)) + if (dounmount(mp, MNT_NOFORCE, p) != 0) + CLRSIG(p, CURSIG(p)); + } + return (error); +} + +/* + * Get file system statistics. + */ +mfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + + error = ufs_statfs(mp, sbp, p); + sbp->f_type = MOUNT_MFS; + return (error); +} diff --git a/sys/ufs/mfs_vnops.c b/sys/ufs/mfs_vnops.c new file mode 100644 index 000000000000..d0c5ff8ed5dd --- /dev/null +++ b/sys/ufs/mfs_vnops.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 1989 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)mfs_vnops.c 7.22 (Berkeley) 4/16/91 + * $Id: mfs_vnops.c,v 1.2 1993/10/16 18:17:44 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "time.h" +#include "kernel.h" +#include "proc.h" +#include "buf.h" +#include "vnode.h" + +#include "mfsnode.h" +#include "mfsiom.h" + +#include "machine/vmparam.h" + +/* + * mfs vnode operations. 
+ */ +struct vnodeops mfs_vnodeops = { + mfs_lookup, /* lookup */ + mfs_create, /* create */ + mfs_mknod, /* mknod */ + mfs_open, /* open */ + mfs_close, /* close */ + mfs_access, /* access */ + mfs_getattr, /* getattr */ + mfs_setattr, /* setattr */ + mfs_read, /* read */ + mfs_write, /* write */ + mfs_ioctl, /* ioctl */ + mfs_select, /* select */ + mfs_mmap, /* mmap */ + mfs_fsync, /* fsync */ + mfs_seek, /* seek */ + mfs_remove, /* remove */ + mfs_link, /* link */ + mfs_rename, /* rename */ + mfs_mkdir, /* mkdir */ + mfs_rmdir, /* rmdir */ + mfs_symlink, /* symlink */ + mfs_readdir, /* readdir */ + mfs_readlink, /* readlink */ + mfs_abortop, /* abortop */ + mfs_inactive, /* inactive */ + mfs_reclaim, /* reclaim */ + mfs_lock, /* lock */ + mfs_unlock, /* unlock */ + mfs_bmap, /* bmap */ + mfs_strategy, /* strategy */ + mfs_print, /* print */ + mfs_islocked, /* islocked */ + mfs_advlock, /* advlock */ +}; + +/* + * Vnode Operations. + * + * Open called to allow memory filesystem to initialize and + * validate before actual IO. Record our process identifier + * so we can tell when we are doing I/O to ourself. + */ +/* ARGSUSED */ +mfs_open(vp, mode, cred, p) + register struct vnode *vp; + int mode; + struct ucred *cred; + struct proc *p; +{ + + if (vp->v_type != VBLK) { + panic("mfs_ioctl not VBLK"); + /* NOTREACHED */ + } + return (0); +} + +/* + * Ioctl operation. + */ +/* ARGSUSED */ +mfs_ioctl(vp, com, data, fflag, cred, p) + struct vnode *vp; + int com; + caddr_t data; + int fflag; + struct ucred *cred; + struct proc *p; +{ + + return (-1); +} + +/* + * Pass I/O requests to the memory filesystem process. 
+ */ +mfs_strategy(bp) + register struct buf *bp; +{ + register struct mfsnode *mfsp; + struct vnode *vp; + struct proc *p = curproc; /* XXX */ + + if (vfinddev(bp->b_dev, VBLK, &vp) || vp->v_usecount == 0) + panic("mfs_strategy: bad dev"); + mfsp = VTOMFS(vp); + if (mfsp->mfs_pid == p->p_pid) { + mfs_doio(bp, mfsp->mfs_baseoff); + } else { + bp->av_forw = mfsp->mfs_buflist; + mfsp->mfs_buflist = bp; + wakeup((caddr_t)vp); + } + return (0); +} + +/* + * Memory file system I/O. + * + * Trivial since buffer has already been mapping into KVA space. + */ +mfs_doio(bp, base) + register struct buf *bp; + caddr_t base; +{ + base += (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bp->b_error = copyin(base, bp->b_un.b_addr, bp->b_bcount); + else + bp->b_error = copyout(bp->b_un.b_addr, base, bp->b_bcount); + if (bp->b_error) + bp->b_flags |= B_ERROR; + biodone(bp); +} + +/* + * This is a noop, simply returning what one has been given. + */ +mfs_bmap(vp, bn, vpp, bnp) + struct vnode *vp; + daddr_t bn; + struct vnode **vpp; + daddr_t *bnp; +{ + + if (vpp != NULL) + *vpp = vp; + if (bnp != NULL) + *bnp = bn; + return (0); +} + +/* + * Memory filesystem close routine + */ +/* ARGSUSED */ +mfs_close(vp, flag, cred, p) + register struct vnode *vp; + int flag; + struct ucred *cred; + struct proc *p; +{ + register struct mfsnode *mfsp = VTOMFS(vp); + register struct buf *bp; + + /* + * Finish any pending I/O requests. + */ + while (bp = mfsp->mfs_buflist) { + mfsp->mfs_buflist = bp->av_forw; + mfs_doio(bp, mfsp->mfs_baseoff); + wakeup((caddr_t)bp); + } + /* + * On last close of a memory filesystem + * we must invalidate any in core blocks, so that + * we can, free up its vnode. + */ + vflushbuf(vp, 0); + if (vinvalbuf(vp, 1)) + return (0); + /* + * There should be no way to have any more uses of this + * vnode, so if we find any other uses, it is a panic. 
+ */ + if (vp->v_usecount > 1) + printf("mfs_close: ref count %d > 1\n", vp->v_usecount); + if (vp->v_usecount > 1 || mfsp->mfs_buflist) + panic("mfs_close"); + /* + * Send a request to the filesystem server to exit. + */ + mfsp->mfs_buflist = (struct buf *)(-1); + wakeup((caddr_t)vp); + return (0); +} + +/* + * Memory filesystem inactive routine + */ +/* ARGSUSED */ +mfs_inactive(vp, p) + struct vnode *vp; + struct proc *p; +{ + + if (VTOMFS(vp)->mfs_buflist != (struct buf *)(-1)) + panic("mfs_inactive: not inactive"); + return (0); +} + +/* + * Print out the contents of an mfsnode. + */ +mfs_print(vp) + struct vnode *vp; +{ + register struct mfsnode *mfsp = VTOMFS(vp); + + printf("tag VT_MFS, pid %d, base %d, size %d\n", mfsp->mfs_pid, + mfsp->mfs_baseoff, mfsp->mfs_size); +} + +/* + * Block device bad operation + */ +mfs_badop() +{ + + panic("mfs_badop called\n"); + /* NOTREACHED */ +} + +/* + * Memory based filesystem initialization. + */ +mfs_init() +{ + +} diff --git a/sys/ufs/mfsiom.h b/sys/ufs/mfsiom.h new file mode 100644 index 000000000000..0980bb286812 --- /dev/null +++ b/sys/ufs/mfsiom.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 1989 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)mfsiom.h 7.2 (Berkeley) 6/28/90 + * $Id: mfsiom.h,v 1.2 1993/10/16 18:17:45 rgrimes Exp $ + */ + +#define MFS_MAPREG (MAXPHYS/NBPG + 2) /* Kernel mapping pte's */ +#define MFS_MAPSIZE 10 /* Size of alloc map for pte's */ diff --git a/sys/ufs/mfsnode.h b/sys/ufs/mfsnode.h new file mode 100644 index 000000000000..9cd1c29e4845 --- /dev/null +++ b/sys/ufs/mfsnode.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 1989 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)mfsnode.h 7.3 (Berkeley) 4/16/91 + * $Id: mfsnode.h,v 1.2 1993/10/16 18:17:46 rgrimes Exp $ + */ + +/* + * This structure defines the control data for the memory + * based file system. + */ + +struct mfsnode { + struct vnode *mfs_vnode; /* vnode associated with this mfsnode */ + caddr_t mfs_baseoff; /* base of file system in memory */ + long mfs_size; /* size of memory file system */ + pid_t mfs_pid; /* supporting process pid */ + struct buf *mfs_buflist; /* list of I/O requests */ + long mfs_spare[4]; +}; + +/* + * Convert between mfsnode pointers and vnode pointers + */ +#define VTOMFS(vp) ((struct mfsnode *)(vp)->v_data) +#define MFSTOV(mfsp) ((mfsp)->mfs_vnode) + +/* + * Prototypes for MFS operations on vnodes. 
+ */ +int mfs_badop(); +#define mfs_lookup ((int (*) __P(( \ + struct vnode *vp, \ + struct nameidata *ndp, \ + struct proc *p))) mfs_badop) +#define mfs_create ((int (*) __P(( \ + struct nameidata *ndp, \ + struct vattr *vap, \ + struct proc *p))) mfs_badop) +#define mfs_mknod ((int (*) __P(( \ + struct nameidata *ndp, \ + struct vattr *vap, \ + struct ucred *cred, \ + struct proc *p))) mfs_badop) +int mfs_open __P(( + struct vnode *vp, + int mode, + struct ucred *cred, + struct proc *p)); +int mfs_close __P(( + struct vnode *vp, + int fflag, + struct ucred *cred, + struct proc *p)); +#define mfs_access ((int (*) __P(( \ + struct vnode *vp, \ + int mode, \ + struct ucred *cred, \ + struct proc *p))) mfs_badop) +#define mfs_getattr ((int (*) __P(( \ + struct vnode *vp, \ + struct vattr *vap, \ + struct ucred *cred, \ + struct proc *p))) mfs_badop) +#define mfs_setattr ((int (*) __P(( \ + struct vnode *vp, \ + struct vattr *vap, \ + struct ucred *cred, \ + struct proc *p))) mfs_badop) +#define mfs_read ((int (*) __P(( \ + struct vnode *vp, \ + struct uio *uio, \ + int ioflag, \ + struct ucred *cred))) mfs_badop) +#define mfs_write ((int (*) __P(( \ + struct vnode *vp, \ + struct uio *uio, \ + int ioflag, \ + struct ucred *cred))) mfs_badop) +int mfs_ioctl __P(( + struct vnode *vp, + int command, + caddr_t data, + int fflag, + struct ucred *cred, + struct proc *p)); +#define mfs_select ((int (*) __P(( \ + struct vnode *vp, \ + int which, \ + int fflags, \ + struct ucred *cred, \ + struct proc *p))) mfs_badop) +#define mfs_mmap ((int (*) __P(( \ + struct vnode *vp, \ + int fflags, \ + struct ucred *cred, \ + struct proc *p))) mfs_badop) +#define mfs_fsync ((int (*) __P(( \ + struct vnode *vp, \ + int fflags, \ + struct ucred *cred, \ + int waitfor, \ + struct proc *p))) mfs_badop) +#define mfs_seek ((int (*) __P(( \ + struct vnode *vp, \ + off_t oldoff, \ + off_t newoff, \ + struct ucred *cred))) mfs_badop) +#define mfs_remove ((int (*) __P(( \ + struct nameidata 
*ndp, \ + struct proc *p))) mfs_badop) +#define mfs_link ((int (*) __P(( \ + struct vnode *vp, \ + struct nameidata *ndp, \ + struct proc *p))) mfs_badop) +#define mfs_rename ((int (*) __P(( \ + struct nameidata *fndp, \ + struct nameidata *tdnp, \ + struct proc *p))) mfs_badop) +#define mfs_mkdir ((int (*) __P(( \ + struct nameidata *ndp, \ + struct vattr *vap, \ + struct proc *p))) mfs_badop) +#define mfs_rmdir ((int (*) __P(( \ + struct nameidata *ndp, \ + struct proc *p))) mfs_badop) +#define mfs_symlink ((int (*) __P(( \ + struct nameidata *ndp, \ + struct vattr *vap, \ + char *target, \ + struct proc *p))) mfs_badop) +#define mfs_readdir ((int (*) __P(( \ + struct vnode *vp, \ + struct uio *uio, \ + struct ucred *cred, \ + int *eofflagp))) mfs_badop) +#define mfs_readlink ((int (*) __P(( \ + struct vnode *vp, \ + struct uio *uio, \ + struct ucred *cred))) mfs_badop) +#define mfs_abortop ((int (*) __P(( \ + struct nameidata *ndp))) mfs_badop) +int mfs_inactive __P(( + struct vnode *vp, + struct proc *p)); +#define mfs_reclaim ((int (*) __P(( \ + struct vnode *vp))) nullop) +#define mfs_lock ((int (*) __P(( \ + struct vnode *vp))) nullop) +#define mfs_unlock ((int (*) __P(( \ + struct vnode *vp))) nullop) +int mfs_bmap __P(( + struct vnode *vp, + daddr_t bn, + struct vnode **vpp, + daddr_t *bnp)); +int mfs_strategy __P(( + struct buf *bp)); +int mfs_print __P(( + struct vnode *vp)); +#define mfs_islocked ((int (*) __P(( \ + struct vnode *vp))) nullop) +#define mfs_advlock ((int (*) __P(( \ + struct vnode *vp, \ + caddr_t id, \ + int op, \ + struct flock *fl, \ + int flags))) mfs_badop) diff --git a/sys/ufs/quota.h b/sys/ufs/quota.h new file mode 100644 index 000000000000..afcb865b13d3 --- /dev/null +++ b/sys/ufs/quota.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 1982, 1986 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)quota.h 7.9 (Berkeley) 2/22/91 + * $Id: quota.h,v 1.2 1993/10/16 18:17:47 rgrimes Exp $ + */ + +#ifndef _QUOTA_ +#define _QUOTA_ + +/* + * Definitions for disk quotas imposed on the average user + * (big brother finally hits UNIX). 
+ * + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME (7*24*60*60) /* 1 week */ +#define MAX_DQ_TIME (7*24*60*60) /* 1 week */ + +/* + * The following constants define the usage of the quota file array + * in the ufsmount structure and dquot array in the inode structure. + * The semantics of the elements of these arrays are defined in the + * routine getinoquota; the remainder of the quota code treats them + * generically and need not be inspected when changing the size of + * the array. + */ +#define MAXQUOTAS 2 +#define USRQUOTA 0 /* element used for user quotas */ +#define GRPQUOTA 1 /* element used for group quotas */ + +/* + * Definitions for the default names of the quotas files. + */ +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "undefined", \ +}; +#define QUOTAFILENAME "quota" +#define QUOTAGROUP "operator" + +/* + * Command definitions for the 'quotactl' system call. + * The commands are broken into a main command defined below + * and a subcommand that is used to convey the type of + * quota that is being manipulated (see above). + */ +#define SUBCMDMASK 0x00ff +#define SUBCMDSHIFT 8 +#define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK)) + +#define Q_QUOTAON 0x0100 /* enable quotas */ +#define Q_QUOTAOFF 0x0200 /* disable quotas */ +#define Q_GETQUOTA 0x0300 /* get limits and usage */ +#define Q_SETQUOTA 0x0400 /* set limits and usage */ +#define Q_SETUSE 0x0500 /* set usage */ +#define Q_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. 
The setquota system call establishes + * the vnode for each quota file (a pointer is retained in the ufsmount + * structure). + */ +struct dqblk { + u_long dqb_bhardlimit; /* absolute limit on disk blks alloc */ + u_long dqb_bsoftlimit; /* preferred limit on disk blks */ + u_long dqb_curblocks; /* current block count */ + u_long dqb_ihardlimit; /* maximum # allocated inodes + 1 */ + u_long dqb_isoftlimit; /* preferred inode limit */ + u_long dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive files */ +}; + +#ifdef KERNEL +/* + * The following structure records disk usage for a user or group on a + * filesystem. There is one allocated for each quota that exists on any + * filesystem for the current user or group. A cache is kept of recently + * used entries. + */ +struct dquot { + struct dquot *dq_forw, *dq_back;/* MUST be first entry */ + struct dquot *dq_freef, **dq_freeb; /* free list */ + short dq_flags; /* flags, see below */ + short dq_cnt; /* count of active references */ + short dq_spare; /* unused spare padding */ + short dq_type; /* quota type of this dquot */ + u_long dq_id; /* identifier this applies to */ + struct ufsmount *dq_ump; /* filesystem that this is taken from */ + struct dqblk dq_dqb; /* actual usage & quotas */ +}; +/* + * Flag values. + */ +#define DQ_LOCK 0x01 /* this quota locked (no MODS) */ +#define DQ_WANT 0x02 /* wakeup on unlock */ +#define DQ_MOD 0x04 /* this quota modified since read */ +#define DQ_FAKE 0x08 /* no limits here, just usage */ +#define DQ_BLKS 0x10 /* has been warned about blk limit */ +#define DQ_INODS 0x20 /* has been warned about inode limit */ +/* + * Shorthand notation. 
+ */ +#define dq_bhardlimit dq_dqb.dqb_bhardlimit +#define dq_bsoftlimit dq_dqb.dqb_bsoftlimit +#define dq_curblocks dq_dqb.dqb_curblocks +#define dq_ihardlimit dq_dqb.dqb_ihardlimit +#define dq_isoftlimit dq_dqb.dqb_isoftlimit +#define dq_curinodes dq_dqb.dqb_curinodes +#define dq_btime dq_dqb.dqb_btime +#define dq_itime dq_dqb.dqb_itime + +/* + * If the system has never checked for a quota for this file, + * then it is set to NODQUOT. Once a write attempt is made + * the inode pointer is set to reference a dquot structure. + */ +#define NODQUOT ((struct dquot *) 0) + +/* + * Flags to chkdq() and chkiq() + */ +#define FORCE 0x01 /* force usage changes independent of limits */ +#define CHOWN 0x02 /* (advisory) change initiated by chown */ + +/* + * Macros to avoid subroutine calls to trivial functions. + */ +#ifndef DIAGNOSTIC +#define DQREF(dq) (dq)->dq_cnt++ +#else +#define DQREF(dq) dqref(dq) +#endif /* DIAGNOSTIC */ + +#else + +#include <sys/cdefs.h> + +__BEGIN_DECLS +int quotactl __P((const char *, int, int, void *)); +__END_DECLS + +#endif /* KERNEL */ +#endif /* _QUOTA_ */ diff --git a/sys/ufs/ufs_alloc.c b/sys/ufs/ufs_alloc.c new file mode 100644 index 000000000000..8c43640403aa --- /dev/null +++ b/sys/ufs/ufs_alloc.c @@ -0,0 +1,1101 @@ +/* + * Copyright (c) 1982, 1986, 1989 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_alloc.c 7.26 (Berkeley) 5/2/91 + * $Id: ufs_alloc.c,v 1.2 1993/10/16 18:17:49 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "buf.h" +#include "proc.h" +#include "vnode.h" +#include "kernel.h" +#include "syslog.h" + +#include "quota.h" +#include "inode.h" +#include "fs.h" + +extern u_long hashalloc(); +extern ino_t ialloccg(); +extern daddr_t alloccg(); +extern daddr_t alloccgblk(); +extern daddr_t fragextend(); +extern daddr_t blkpref(); +extern daddr_t mapsearch(); +extern int inside[], around[]; +extern unsigned char *fragtbl[]; + +/* + * Allocate a block in the file system. + * + * The size of the requested block is given, which must be some + * multiple of fs_fsize and <= fs_bsize. 
+ * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate a block: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate a block in the same cylinder group. + * 4) quadradically rehash into other cylinder groups, until an + * available block is located. + * If no block preference is given the following heirarchy is used + * to allocate a block: + * 1) allocate a block in the cylinder group that contains the + * inode for the file. + * 2) quadradically rehash into other cylinder groups, until an + * available block is located. + */ +alloc(ip, lbn, bpref, size, bnp) + register struct inode *ip; + daddr_t lbn, bpref; + int size; + daddr_t *bnp; +{ + daddr_t bno; + register struct fs *fs; + register struct buf *bp; + int cg, error; + struct ucred *cred = curproc->p_ucred; /* XXX */ + + *bnp = 0; + fs = ip->i_fs; + if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); + panic("alloc: bad size"); + } + if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) + goto nospace; + if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + goto nospace; +#ifdef QUOTA + if (error = chkdq(ip, (long)btodb(size), cred, 0)) + return (error); +#endif + if (bpref >= fs->fs_size) + bpref = 0; + if (bpref == 0) + cg = itog(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size, + (u_long (*)())alloccg); + if (bno > 0) { + ip->i_blocks += btodb(size); + ip->i_flag |= IUPD|ICHG; + *bnp = bno; + return (0); + } +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed. 
+ */ + (void) chkdq(ip, (long)-btodb(size), cred, FORCE); +#endif +nospace: + fserr(fs, cred->cr_uid, "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Reallocate a fragment to a bigger size + * + * The number and size of the old block is given, and a preference + * and new size is also specified. The allocator attempts to extend + * the original block. Failing that, the regular block allocator is + * invoked to get an appropriate block. + */ +realloccg(ip, lbprev, bpref, osize, nsize, bpp) + register struct inode *ip; + off_t lbprev; + daddr_t bpref; + int osize, nsize; + struct buf **bpp; +{ + register struct fs *fs; + struct buf *bp, *obp; + int cg, request, error; + daddr_t bprev, bno; + struct ucred *cred = curproc->p_ucred; /* XXX */ + + *bpp = 0; + fs = ip->i_fs; + if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || + (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { + printf("dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); + panic("realloccg: bad size"); + } + if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + goto nospace; + if ((bprev = ip->i_db[lbprev]) == 0) { + printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt); + panic("realloccg: bad bprev"); + } + /* + * Allocate the extra space in the buffer. + */ + if (error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) { + brelse(bp); + return (error); + } +#ifdef QUOTA + if (error = chkdq(ip, (long)btodb(nsize - osize), cred, 0)) { + brelse(bp); + return (error); + } +#endif + /* + * Check for extension in the existing location. 
+ */ + cg = dtog(fs, bprev); + if (bno = fragextend(ip, cg, (long)bprev, osize, nsize)) { + if (bp->b_blkno != fsbtodb(fs, bno)) + panic("bad blockno"); + ip->i_blocks += btodb(nsize - osize); + ip->i_flag |= IUPD|ICHG; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + bzero(bp->b_un.b_addr + osize, (unsigned)nsize - osize); + *bpp = bp; + return (0); + } + /* + * Allocate a new disk location. + */ + if (bpref >= fs->fs_size) + bpref = 0; + switch ((int)fs->fs_optim) { + case FS_OPTSPACE: + /* + * Allocate an exact sized fragment. Although this makes + * best use of space, we will waste time relocating it if + * the file continues to grow. If the fragmentation is + * less than half of the minimum free reserve, we choose + * to begin optimizing for time. + */ + request = nsize; + if (fs->fs_minfree < 5 || + fs->fs_cstotal.cs_nffree > + fs->fs_dsize * fs->fs_minfree / (2 * 100)) + break; + log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTTIME; + break; + case FS_OPTTIME: + /* + * At this point we have discovered a file that is trying + * to grow a small fragment to a larger fragment. To save + * time, we allocate a full sized block, then free the + * unused portion. If the file continues to grow, the + * `fragextend' call above will be able to grow it in place + * without further copying. If aberrant programs cause + * disk fragmentation to grow within 2% of the free reserve, + * we choose to begin optimizing for space. 
+ */ + request = fs->fs_bsize; + if (fs->fs_cstotal.cs_nffree < + fs->fs_dsize * (fs->fs_minfree - 2) / 100) + break; + log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTSPACE; + break; + default: + printf("dev = 0x%x, optim = %d, fs = %s\n", + ip->i_dev, fs->fs_optim, fs->fs_fsmnt); + panic("realloccg: bad optim"); + /* NOTREACHED */ + } + bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request, + (u_long (*)())alloccg); + if (bno > 0) { + bp->b_blkno = fsbtodb(fs, bno); + (void) vnode_pager_uncache(ITOV(ip)); + blkfree(ip, bprev, (off_t)osize); + if (nsize < request) + blkfree(ip, bno + numfrags(fs, nsize), + (off_t)(request - nsize)); + ip->i_blocks += btodb(nsize - osize); + ip->i_flag |= IUPD|ICHG; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + bzero(bp->b_un.b_addr + osize, (unsigned)nsize - osize); + *bpp = bp; + return (0); + } +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE); +#endif + brelse(bp); +nospace: + /* + * no space available + */ + fserr(fs, cred->cr_uid, "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Allocate an inode in the file system. + * + * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate an inode: + * 1) allocate the requested inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following heirarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. 
+ */ +ialloc(pip, ipref, mode, cred, ipp) + register struct inode *pip; + ino_t ipref; + int mode; + struct ucred *cred; + struct inode **ipp; +{ + ino_t ino; + register struct fs *fs; + register struct inode *ip; + int cg, error; + + *ipp = 0; + fs = pip->i_fs; + if (fs->fs_cstotal.cs_nifree == 0) + goto noinodes; + if (ipref >= fs->fs_ncg * fs->fs_ipg) + ipref = 0; + cg = itog(fs, ipref); + ino = (ino_t)hashalloc(pip, cg, (long)ipref, mode, ialloccg); + if (ino == 0) + goto noinodes; + error = iget(pip, ino, ipp); + if (error) { + ifree(pip, ino, mode); + return (error); + } + ip = *ipp; + if (ip->i_mode) { + printf("mode = 0%o, inum = %d, fs = %s\n", + ip->i_mode, ip->i_number, fs->fs_fsmnt); + panic("ialloc: dup alloc"); + } + if (ip->i_blocks) { /* XXX */ + printf("free inode %s/%d had %d blocks\n", + fs->fs_fsmnt, ino, ip->i_blocks); + ip->i_blocks = 0; + } + ip->i_flags = 0; + /* + * Set up a new generation number for this inode. + */ + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + return (0); +noinodes: + fserr(fs, cred->cr_uid, "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Find a cylinder to place a directory. + * + * The policy implemented by this algorithm is to select from + * among those cylinder groups with above the average number of + * free inodes, the one with the smallest number of directories. + */ +ino_t +dirpref(fs) + register struct fs *fs; +{ + int cg, minndir, mincg, avgifree; + + avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; + minndir = fs->fs_ipg; + mincg = 0; + for (cg = 0; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + return ((ino_t)(fs->fs_ipg * mincg)); +} + +/* + * Select the desired position for the next block in a file. The file is + * logically divided into sections. 
The first section is composed of the + * direct blocks. Each additional section contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. If no blocks have been allocated in any other section, the + * policy is to place the section in a cylinder group with a greater than + * average number of free blocks. An appropriate cylinder group is found + * by using a rotor that sweeps the cylinder groups. When a new group of + * blocks is needed, the sweep begins in the cylinder group following the + * cylinder group from which the previous allocation was made. The sweep + * continues until a cylinder group with greater than the average number + * of free blocks is found. If the allocation is for the first block in an + * indirect block, the information on the previous allocation is unavailable; + * here a best guess is made based upon the logical block number being + * allocated. + * + * If a section is already partially allocated, the policy is to + * contiguously allocate fs_maxcontig blocks. The end of one of these + * contiguous blocks and the beginning of the next is physically separated + * so that the disk head will be in transit between them for at least + * fs_rotdelay milliseconds. This is to allow time for the processor to + * schedule another I/O transfer. + */ +daddr_t +blkpref(ip, lbn, indx, bap) + struct inode *ip; + daddr_t lbn; + int indx; + daddr_t *bap; +{ + register struct fs *fs; + register int cg; + int avgbfree, startcg; + daddr_t nextblk; + + fs = ip->i_fs; + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + if (lbn < NDADDR) { + cg = itog(fs, ip->i_number); + return (fs->fs_fpg * cg + fs->fs_frag); + } + /* + * Find a cylinder with greater than average number of + * unused data blocks. 
+ */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, bap[indx - 1]) + 1; + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (fs->fs_fpg * cg + fs->fs_frag); + } + for (cg = 0; cg <= startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (fs->fs_fpg * cg + fs->fs_frag); + } + return (NULL); + } + /* + * One or more previous blocks have been laid out. If less + * than fs_maxcontig previous blocks are contiguous, the + * next block is requested contiguously, otherwise it is + * requested rotationally delayed by fs_rotdelay milliseconds. + */ + nextblk = bap[indx - 1] + fs->fs_frag; + if (indx < fs->fs_maxcontig || bap[indx - fs->fs_maxcontig] + + blkstofrags(fs, fs->fs_maxcontig) != nextblk) + return (nextblk); + if (fs->fs_rotdelay != 0) + /* + * Here we convert ms of delay to frags as: + * (frags) = (ms) * (rev/sec) * (sect/rev) / + * ((sect/frag) * (ms/sec)) + * then round up to the next block. + */ + nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / + (NSPF(fs) * 1000), fs->fs_frag); + return (nextblk); +} + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. + * 2) quadradically rehash on the cylinder group number. + * 3) brute force search for a free block. 
+ */
+/*VARARGS5*/
+u_long
+hashalloc(ip, cg, pref, size, allocator)
+	struct inode *ip;
+	int cg;
+	long pref;
+	int size;	/* size for data blocks, mode for inodes */
+	u_long (*allocator)();
+{
+	register struct fs *fs;
+	long result;	/* value handed back by the per-cg allocator */
+	int i, icg = cg;	/* icg remembers the starting cylinder group */
+
+	fs = ip->i_fs;
+	/*
+	 * 1: preferred cylinder group
+	 */
+	result = (*allocator)(ip, cg, pref, size);
+	if (result)
+		return (result);
+	/*
+	 * 2: quadratic rehash
+	 */
+	for (i = 1; i < fs->fs_ncg; i *= 2) {
+		cg += i;
+		if (cg >= fs->fs_ncg)
+			cg -= fs->fs_ncg;
+		result = (*allocator)(ip, cg, 0, size);
+		if (result)
+			return (result);
+	}
+	/*
+	 * 3: brute force search
+	 *	Note that we start at i == 2, since 0 was checked initially,
+	 *	and 1 is always checked in the quadratic rehash.
+	 */
+	cg = (icg + 2) % fs->fs_ncg;
+	for (i = 2; i < fs->fs_ncg; i++) {
+		result = (*allocator)(ip, cg, 0, size);
+		if (result)
+			return (result);
+		cg++;
+		if (cg == fs->fs_ncg)
+			cg = 0;
+	}
+	/* every cylinder group was tried without success */
+	return (NULL);
+}
+
+/*
+ * Determine whether a fragment can be extended.
+ *
+ * Check to see if the necessary fragments are available, and
+ * if they are, allocate them.
+ */ +daddr_t +fragextend(ip, cg, bprev, osize, nsize) + struct inode *ip; + int cg; + long bprev; + int osize, nsize; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + long bno; + int frags, bbase; + int i, error; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) + return (NULL); + frags = numfrags(fs, nsize); + bbase = fragnum(fs, bprev); + if (bbase > fragnum(fs, (bprev + frags - 1))) { + /* cannot extend across a block boundary */ + return (NULL); + } + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = bp->b_un.b_cg; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + bno = dtogd(fs, bprev); + for (i = numfrags(fs, osize); i < frags; i++) + if (isclr(cg_blksfree(cgp), bno + i)) { + brelse(bp); + return (NULL); + } + /* + * the current fragment can be extended + * deduct the count on fragment being extended into + * increase the count on the remaining fragment (if any) + * allocate the extended piece + */ + for (i = frags; i < fs->fs_frag - bbase; i++) + if (isclr(cg_blksfree(cgp), bno + i)) + break; + cgp->cg_frsum[i - numfrags(fs, osize)]--; + if (i != frags) + cgp->cg_frsum[i - frags]++; + for (i = numfrags(fs, osize); i < frags; i++) { + clrbit(cg_blksfree(cgp), bno + i); + cgp->cg_cs.cs_nffree--; + fs->fs_cstotal.cs_nffree--; + fs->fs_cs(fs, cg).cs_nffree--; + } + fs->fs_fmod++; + bdwrite(bp); + return (bprev); +} + +/* + * Determine whether a block can be allocated. + * + * Check to see if a block of the apprpriate size is available, + * and if it is, allocate it. 
+ */ +daddr_t +alloccg(ip, cg, bpref, size) + struct inode *ip; + int cg; + daddr_t bpref; + int size; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + register int i; + int error, bno, frags, allocsiz; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) + return (NULL); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = bp->b_un.b_cg; + if (!cg_chkmagic(cgp) || + (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + if (size == fs->fs_bsize) { + bno = alloccgblk(fs, cgp, bpref); + bdwrite(bp); + return (bno); + } + /* + * check to see if any fragments are already available + * allocsiz is the size which will be allocated, hacking + * it down to a smaller size if necessary + */ + frags = numfrags(fs, size); + for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) + if (cgp->cg_frsum[allocsiz] != 0) + break; + if (allocsiz == fs->fs_frag) { + /* + * no fragments were available, so a block will be + * allocated, and hacked up + */ + if (cgp->cg_cs.cs_nbfree == 0) { + brelse(bp); + return (NULL); + } + bno = alloccgblk(fs, cgp, bpref); + bpref = dtogd(fs, bno); + for (i = frags; i < fs->fs_frag; i++) + setbit(cg_blksfree(cgp), bpref + i); + i = fs->fs_frag - frags; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + fs->fs_fmod++; + cgp->cg_frsum[i]++; + bdwrite(bp); + return (bno); + } + bno = mapsearch(fs, cgp, bpref, allocsiz); + if (bno < 0) { + brelse(bp); + return (NULL); + } + for (i = 0; i < frags; i++) + clrbit(cg_blksfree(cgp), bno + i); + cgp->cg_cs.cs_nffree -= frags; + fs->fs_cstotal.cs_nffree -= frags; + fs->fs_cs(fs, cg).cs_nffree -= frags; + fs->fs_fmod++; + cgp->cg_frsum[allocsiz]--; + if (frags != allocsiz) + cgp->cg_frsum[allocsiz - frags]++; + bdwrite(bp); + return (cg * 
fs->fs_fpg + bno); +} + +/* + * Allocate a block in a cylinder group. + * + * This algorithm implements the following policy: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate the next available block on the block rotor for the + * specified cylinder group. + * Note that this routine only allocates fs_bsize blocks; these + * blocks may be fragmented by the routine that allocates them. + */ +daddr_t +alloccgblk(fs, cgp, bpref) + register struct fs *fs; + register struct cg *cgp; + daddr_t bpref; +{ + daddr_t bno; + int cylno, pos, delta; + short *cylbp; + register int i; + + if (bpref == 0) { + bpref = cgp->cg_rotor; + goto norot; + } + bpref = blknum(fs, bpref); + bpref = dtogd(fs, bpref); + /* + * if the requested block is available, use it + */ + if (isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bpref))) { + bno = bpref; + goto gotit; + } + /* + * check for a block available on the same cylinder + */ + cylno = cbtocylno(fs, bpref); + if (cg_blktot(cgp)[cylno] == 0) + goto norot; + if (fs->fs_cpc == 0) { + /* + * block layout info is not available, so just have + * to take any block in this cylinder. + */ + bpref = howmany(fs->fs_spc * cylno, NSPF(fs)); + goto norot; + } + /* + * check the summary information to see if a block is + * available in the requested cylinder starting at the + * requested rotational position and proceeding around. + */ + cylbp = cg_blks(fs, cgp, cylno); + pos = cbtorpos(fs, bpref); + for (i = pos; i < fs->fs_nrpos; i++) + if (cylbp[i] > 0) + break; + if (i == fs->fs_nrpos) + for (i = 0; i < pos; i++) + if (cylbp[i] > 0) + break; + if (cylbp[i] > 0) { + /* + * found a rotational position, now find the actual + * block. A panic if none is actually there. 
+ */ + pos = cylno % fs->fs_cpc; + bno = (cylno - pos) * fs->fs_spc / NSPB(fs); + if (fs_postbl(fs, pos)[i] == -1) { + printf("pos = %d, i = %d, fs = %s\n", + pos, i, fs->fs_fsmnt); + panic("alloccgblk: cyl groups corrupted"); + } + for (i = fs_postbl(fs, pos)[i];; ) { + if (isblock(fs, cg_blksfree(cgp), bno + i)) { + bno = blkstofrags(fs, (bno + i)); + goto gotit; + } + delta = fs_rotbl(fs)[i]; + if (delta <= 0 || + delta + i > fragstoblks(fs, fs->fs_fpg)) + break; + i += delta; + } + printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); + panic("alloccgblk: can't find blk in cyl"); + } +norot: + /* + * no blocks in the requested cylinder, so take next + * available one in this cylinder group. + */ + bno = mapsearch(fs, cgp, bpref, (int)fs->fs_frag); + if (bno < 0) + return (NULL); + cgp->cg_rotor = bno; +gotit: + clrblock(fs, cg_blksfree(cgp), (long)fragstoblks(fs, bno)); + cgp->cg_cs.cs_nbfree--; + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; + cylno = cbtocylno(fs, bno); + cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--; + cg_blktot(cgp)[cylno]--; + fs->fs_fmod++; + return (cgp->cg_cgx * fs->fs_fpg + bno); +} + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. 
+ */ +ino_t +ialloccg(ip, cg, ipref, mode) + struct inode *ip; + int cg; + daddr_t ipref; + int mode; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + int error, start, len, loc, map, i; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nifree == 0) + return (NULL); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = bp->b_un.b_cg; + if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + if (ipref) { + ipref %= fs->fs_ipg; + if (isclr(cg_inosused(cgp), ipref)) + goto gotit; + } + start = cgp->cg_irotor / NBBY; + len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); + loc = skpc(0xff, len, &cg_inosused(cgp)[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &cg_inosused(cgp)[0]); + if (loc == 0) { + printf("cg = %s, irotor = %d, fs = %s\n", + cg, cgp->cg_irotor, fs->fs_fsmnt); + panic("ialloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = cg_inosused(cgp)[i]; + ipref = i * NBBY; + for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { + if ((map & i) == 0) { + cgp->cg_irotor = ipref; + goto gotit; + } + } + printf("fs = %s\n", fs->fs_fsmnt); + panic("ialloccg: block not in map"); + /* NOTREACHED */ +gotit: + setbit(cg_inosused(cgp), ipref); + cgp->cg_cs.cs_nifree--; + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cg).cs_nifree--; + fs->fs_fmod++; + if ((mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir++; + fs->fs_cstotal.cs_ndir++; + fs->fs_cs(fs, cg).cs_ndir++; + } + bdwrite(bp); + return (cg * fs->fs_ipg + ipref); +} + +/* + * Free a block or fragment. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. 
+ */ +blkfree(ip, bno, size) + register struct inode *ip; + daddr_t bno; + off_t size; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + int error, cg, blk, frags, bbase; + register int i; + struct ucred *cred = curproc->p_ucred; /* XXX */ + + fs = ip->i_fs; + if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); + panic("blkfree: bad size"); + } + cg = dtog(fs, bno); + if ((unsigned)bno >= fs->fs_size) { + printf("bad block %d, ino %d\n", bno, ip->i_number); + fserr(fs, cred->cr_uid, "bad block"); + return; + } + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return; + } + cgp = bp->b_un.b_cg; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return; + } + cgp->cg_time = time.tv_sec; + bno = dtogd(fs, bno); + if (size == fs->fs_bsize) { + if (isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bno))) { + printf("dev = 0x%x, block = %d, fs = %s\n", + ip->i_dev, bno, fs->fs_fsmnt); + panic("blkfree: freeing free block"); + } + setblock(fs, cg_blksfree(cgp), fragstoblks(fs, bno)); + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + i = cbtocylno(fs, bno); + cg_blks(fs, cgp, i)[cbtorpos(fs, bno)]++; + cg_blktot(cgp)[i]++; + } else { + bbase = bno - fragnum(fs, bno); + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, cg_blksfree(cgp), bbase); + fragacct(fs, blk, cgp->cg_frsum, -1); + /* + * deallocate the fragment + */ + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isset(cg_blksfree(cgp), bno + i)) { + printf("dev = 0x%x, block = %d, fs = %s\n", + ip->i_dev, bno + i, fs->fs_fsmnt); + panic("blkfree: freeing free frag"); + } + setbit(cg_blksfree(cgp), bno + i); + } + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + /* + * add back 
in counts associated with the new frags + */ + blk = blkmap(fs, cg_blksfree(cgp), bbase); + fragacct(fs, blk, cgp->cg_frsum, 1); + /* + * if a complete block has been reassembled, account for it + */ + if (isblock(fs, cg_blksfree(cgp), + (daddr_t)fragstoblks(fs, bbase))) { + cgp->cg_cs.cs_nffree -= fs->fs_frag; + fs->fs_cstotal.cs_nffree -= fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + i = cbtocylno(fs, bbase); + cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++; + cg_blktot(cgp)[i]++; + } + } + fs->fs_fmod++; + bdwrite(bp); +} + +/* + * Free an inode. + * + * The specified inode is placed back in the free map. + */ +ifree(ip, ino, mode) + struct inode *ip; + ino_t ino; + int mode; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + int error, cg; + + fs = ip->i_fs; + if ((unsigned)ino >= fs->fs_ipg*fs->fs_ncg) { + printf("dev = 0x%x, ino = %d, fs = %s\n", + ip->i_dev, ino, fs->fs_fsmnt); + panic("ifree: range"); + } + cg = itog(fs, ino); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return; + } + cgp = bp->b_un.b_cg; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return; + } + cgp->cg_time = time.tv_sec; + ino %= fs->fs_ipg; + if (isclr(cg_inosused(cgp), ino)) { + printf("dev = 0x%x, ino = %d, fs = %s\n", + ip->i_dev, ino, fs->fs_fsmnt); + if (fs->fs_ronly == 0) + panic("ifree: freeing free inode"); + } + clrbit(cg_inosused(cgp), ino); + if (ino < cgp->cg_irotor) + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree++; + fs->fs_cstotal.cs_nifree++; + fs->fs_cs(fs, cg).cs_nifree++; + if ((mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir--; + fs->fs_cstotal.cs_ndir--; + fs->fs_cs(fs, cg).cs_ndir--; + } + fs->fs_fmod++; + bdwrite(bp); +} + +/* + * Find a block of the specified size in the specified cylinder group. 
+ * + * It is a panic if a request is made to find a block if none are + * available. + */ +daddr_t +mapsearch(fs, cgp, bpref, allocsiz) + register struct fs *fs; + register struct cg *cgp; + daddr_t bpref; + int allocsiz; +{ + daddr_t bno; + int start, len, loc, i; + int blk, field, subfield, pos; + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = cgp->cg_frotor / NBBY; + len = howmany(fs->fs_fpg, NBBY) - start; + loc = scanc((unsigned)len, (u_char *)&cg_blksfree(cgp)[start], + (u_char *)fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + len = start + 1; + start = 0; + loc = scanc((unsigned)len, (u_char *)&cg_blksfree(cgp)[0], + (u_char *)fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + start, len, fs->fs_fsmnt); + panic("alloccg: map corrupted"); + /* NOTREACHED */ + } + } + bno = (start + len - loc) * NBBY; + cgp->cg_frotor = bno; + /* + * found the byte in the map + * sift through the bits to find the selected frag + */ + for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { + blk = blkmap(fs, cg_blksfree(cgp), bno); + blk <<= 1; + field = around[allocsiz]; + subfield = inside[allocsiz]; + for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { + if ((blk & field) == subfield) + return (bno + pos); + field <<= 1; + subfield <<= 1; + } + } + printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt); + panic("alloccg: block not in map"); + return (-1); +} + +/* + * Fserr prints the name of a file system with an error diagnostic. 
+ * + * The form of the error message is: + * fs: error message + */ +fserr(fs, uid, cp) + struct fs *fs; + uid_t uid; + char *cp; +{ + + log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp); +} diff --git a/sys/ufs/ufs_bmap.c b/sys/ufs/ufs_bmap.c new file mode 100644 index 000000000000..09f2bc6afca3 --- /dev/null +++ b/sys/ufs/ufs_bmap.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 1982, 1986, 1989 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_bmap.c 7.13 (Berkeley) 5/8/91 + * $Id: ufs_bmap.c,v 1.2 1993/10/16 18:17:51 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "buf.h" +#include "proc.h" +#include "file.h" +#include "vnode.h" + +#include "quota.h" +#include "inode.h" +#include "fs.h" + +/* + * Bmap converts a the logical block number of a file + * to its physical block number on the disk. The conversion + * is done by using the logical block number to index into + * the array of block pointers described by the dinode. + */ +bmap(ip, bn, bnp) + register struct inode *ip; + register daddr_t bn; + daddr_t *bnp; +{ + register struct fs *fs; + register daddr_t nb; + struct buf *bp; + daddr_t *bap; + int i, j, sh; + int error; + + if (bn < 0) + return (EFBIG); + fs = ip->i_fs; + + /* + * The first NDADDR blocks are direct blocks + */ + if (bn < NDADDR) { + nb = ip->i_db[bn]; + if (nb == 0) { + *bnp = (daddr_t)-1; + return (0); + } + *bnp = fsbtodb(fs, nb); + return (0); + } + /* + * Determine the number of levels of indirection. + */ + sh = 1; + bn -= NDADDR; + for (j = NIADDR; j > 0; j--) { + sh *= NINDIR(fs); + if (bn < sh) + break; + bn -= sh; + } + if (j == 0) + return (EFBIG); + /* + * Fetch through the indirect blocks. 
+ */ + nb = ip->i_ib[NIADDR - j]; + if (nb == 0) { + *bnp = (daddr_t)-1; + return (0); + } + for (; j <= NIADDR; j++) { + if (error = bread(ip->i_devvp, fsbtodb(fs, nb), + (int)fs->fs_bsize, NOCRED, &bp)) { + brelse(bp); + return (error); + } + bap = bp->b_un.b_daddr; + sh /= NINDIR(fs); + i = (bn / sh) % NINDIR(fs); + nb = bap[i]; + if (nb == 0) { + *bnp = (daddr_t)-1; + brelse(bp); + return (0); + } + brelse(bp); + } + *bnp = fsbtodb(fs, nb); + return (0); +} + +/* + * Balloc defines the structure of file system storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. + */ +balloc(ip, bn, size, bpp, flags) + register struct inode *ip; + register daddr_t bn; + int size; + struct buf **bpp; + int flags; +{ + register struct fs *fs; + register daddr_t nb; + struct buf *bp, *nbp; + struct vnode *vp = ITOV(ip); + int osize, nsize, i, j, sh, error; + daddr_t newb, lbn, *bap, pref, blkpref(); + + *bpp = (struct buf *)0; + if (bn < 0) + return (EFBIG); + fs = ip->i_fs; + + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + nb = lblkno(fs, ip->i_size); + if (nb < NDADDR && nb < bn) { + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + error = realloccg(ip, nb, + blkpref(ip, nb, (int)nb, &ip->i_db[0]), + osize, (int)fs->fs_bsize, &bp); + if (error) + return (error); + ip->i_size = (nb + 1) * fs->fs_bsize; + vnode_pager_setsize(ITOV(ip), (u_long)ip->i_size); + ip->i_db[nb] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IUPD|ICHG; + if (flags & B_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * The first NDADDR blocks are direct blocks + */ + if (bn < NDADDR) { + nb = ip->i_db[bn]; + if (nb != 0 && ip->i_size >= (bn + 1) * fs->fs_bsize) { + error = bread(vp, bn, fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. + */ + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread(vp, bn, osize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + } else { + error = realloccg(ip, bn, + blkpref(ip, bn, (int)bn, &ip->i_db[0]), + osize, nsize, &bp); + if (error) + return (error); + } + } else { + if (ip->i_size < (bn + 1) * fs->fs_bsize) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + error = alloc(ip, bn, + blkpref(ip, bn, (int)bn, &ip->i_db[0]), + nsize, &newb); + if (error) + return (error); + bp = getblk(vp, bn, nsize); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & B_CLRBUF) + clrbuf(bp); + } + ip->i_db[bn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IUPD|ICHG; + *bpp = bp; + return (0); + } + /* + * Determine the number of levels of indirection. + */ + pref = 0; + sh = 1; + lbn = bn; + bn -= NDADDR; + for (j = NIADDR; j > 0; j--) { + sh *= NINDIR(fs); + if (bn < sh) + break; + bn -= sh; + } + if (j == 0) + return (EFBIG); + /* + * Fetch the first indirect block allocating if necessary. 
+ */ + nb = ip->i_ib[NIADDR - j]; + if (nb == 0) { + pref = blkpref(ip, lbn, 0, (daddr_t *)0); + if (error = alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb)) + return (error); + nb = newb; + bp = getblk(ip->i_devvp, fsbtodb(fs, nb), fs->fs_bsize); + clrbuf(bp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if (error = bwrite(bp)) { + blkfree(ip, nb, fs->fs_bsize); + return (error); + } + ip->i_ib[NIADDR - j] = nb; + ip->i_flag |= IUPD|ICHG; + } + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + for (; ; j++) { + error = bread(ip->i_devvp, fsbtodb(fs, nb), + (int)fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + bap = bp->b_un.b_daddr; + sh /= NINDIR(fs); + i = (bn / sh) % NINDIR(fs); + nb = bap[i]; + if (j == NIADDR) + break; + if (nb != 0) { + brelse(bp); + continue; + } + if (pref == 0) + pref = blkpref(ip, lbn, 0, (daddr_t *)0); + if (error = alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb)) { + brelse(bp); + return (error); + } + nb = newb; + nbp = getblk(ip->i_devvp, fsbtodb(fs, nb), fs->fs_bsize); + clrbuf(nbp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if (error = bwrite(nbp)) { + blkfree(ip, nb, fs->fs_bsize); + brelse(bp); + return (error); + } + bap[i] = nb; + /* + * If required, write synchronously, otherwise use + * delayed write. If this is the first instance of + * the delayed write, reassociate the buffer with the + * file so it will be written if the file is sync'ed. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else if (bp->b_flags & B_DELWRI) { + bdwrite(bp); + } else { + bdwrite(bp); + reassignbuf(bp, vp); + } + } + /* + * Get the data block, allocating if necessary. 
+ */ + if (nb == 0) { + pref = blkpref(ip, lbn, i, &bap[0]); + if (error = alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb)) { + brelse(bp); + return (error); + } + nb = newb; + nbp = getblk(vp, lbn, fs->fs_bsize); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & B_CLRBUF) + clrbuf(nbp); + bap[i] = nb; + /* + * If required, write synchronously, otherwise use + * delayed write. If this is the first instance of + * the delayed write, reassociate the buffer with the + * file so it will be written if the file is sync'ed. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else if (bp->b_flags & B_DELWRI) { + bdwrite(bp); + } else { + bdwrite(bp); + reassignbuf(bp, vp); + } + *bpp = nbp; + return (0); + } + brelse(bp); + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); + if (error) { + brelse(nbp); + return (error); + } + } else { + nbp = getblk(vp, lbn, fs->fs_bsize); + nbp->b_blkno = fsbtodb(fs, nb); + } + *bpp = nbp; + return (0); +} diff --git a/sys/ufs/ufs_disksubr.c b/sys/ufs/ufs_disksubr.c new file mode 100644 index 000000000000..09cb4d96c042 --- /dev/null +++ b/sys/ufs/ufs_disksubr.c @@ -0,0 +1,593 @@ +/* + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * $Id: ufs_disksubr.c,v 1.3 1993/10/08 21:00:37 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "buf.h" +#include "dkbad.h" +#include "disklabel.h" +#include "syslog.h" + +/* + * Seek sort for disks. We depend on the driver + * which calls us using b_resid as the current cylinder number. + * + * The argument dp structure holds a b_actf activity chain pointer + * on which we keep two queues, sorted in ascending cylinder order. + * The first queue holds those requests which are positioned after + * the current cylinder (in the first request); the second holds + * requests which came in after their cylinder number was passed. 
+ * Thus we implement a one way scan, retracting after reaching the + * end of the drive to the first request on the second queue, + * at which time it becomes the first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead + * blocks are allocated. + */ + +#define b_cylin b_resid + +void +disksort(dp, bp) + register struct buf *dp, *bp; +{ + register struct buf *ap; + + /* + * If nothing on the activity queue, then + * we become the only thing. + */ + ap = dp->b_actf; + if(ap == NULL) { + dp->b_actf = bp; + dp->b_actl = bp; + bp->av_forw = NULL; + return; + } + /* + * If we lie after the first (currently active) + * request, then we must locate the second request list + * and add ourselves to it. + */ + if (bp->b_cylin < ap->b_cylin) { + while (ap->av_forw) { + /* + * Check for an ``inversion'' in the + * normally ascending cylinder numbers, + * indicating the start of the second request list. + */ + if (ap->av_forw->b_cylin < ap->b_cylin) { + /* + * Search the second request list + * for the first request at a larger + * cylinder number. We go before that; + * if there is no such request, we go at end. + */ + do { + if (bp->b_cylin < ap->av_forw->b_cylin) + goto insert; + if (bp->b_cylin == ap->av_forw->b_cylin && + bp->b_blkno < ap->av_forw->b_blkno) + goto insert; + ap = ap->av_forw; + } while (ap->av_forw); + goto insert; /* after last */ + } + ap = ap->av_forw; + } + /* + * No inversions... we will go after the last, and + * be the first request in the second request list. + */ + goto insert; + } + /* + * Request is at/after the current request... + * sort in the first request list. + */ + while (ap->av_forw) { + /* + * We want to go after the current request + * if there is an inversion after it (i.e. it is + * the end of the first request list), or if + * the next request is a larger cylinder than our request. 
+ */ + if (ap->av_forw->b_cylin < ap->b_cylin || + bp->b_cylin < ap->av_forw->b_cylin || + (bp->b_cylin == ap->av_forw->b_cylin && + bp->b_blkno < ap->av_forw->b_blkno)) + goto insert; + ap = ap->av_forw; + } + /* + * Neither a second list nor a larger + * request... we go at the end of the first list, + * which is the same as the end of the whole schebang. + */ +insert: + bp->av_forw = ap->av_forw; + ap->av_forw = bp; + if (ap == dp->b_actl) + dp->b_actl = bp; +} + +/* encoding of disk minor numbers, should be elsewhere... */ +#define dkunit(dev) (minor(dev) >> 3) +#define dkpart(dev) (minor(dev) & 7) +#define dkminor(unit, part) (((unit) << 3) | (part)) + +/* + * Attempt to read a disk label from a device + * using the indicated stategy routine. + * The label must be partly set up before this: + * secpercyl, secsize and anything required for a block i/o read + * operation in the driver's strategy/start routines + * must be filled in before calling us. + * + * If dos partition table requested, attempt to load it and + * find disklabel inside a DOS partition. Also, if bad block + * table needed, attempt to extract it as well. Return buffer + * for use in signalling errors if requested. + * + * Returns null on success and an error string on failure. 
+ */ +char * +readdisklabel(dev, strat, lp, dp, bdp, bpp) + dev_t dev; + int (*strat)(); + register struct disklabel *lp; + struct dos_partition *dp; + struct dkbad *bdp; + struct buf **bpp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + int cyl, dospartoff, i; + + /* minimal requirements for archtypal disk label */ + if (lp->d_secperunit == 0) + lp->d_secperunit = 0x1fffffff; + lp->d_npartitions = 1; + if (lp->d_partitions[0].p_size == 0) + lp->d_partitions[0].p_size = 0x1fffffff; + lp->d_partitions[0].p_offset = 0; + + /* obtain buffer to probe drive with */ + bp = geteblk((int)lp->d_secsize); + + /* request no partition relocation by driver on I/O operations */ + bp->b_dev = makedev(major(dev), dkminor((dkunit(dev)), 3)); + + /* do dos partitions in the process of getting disklabel? */ + dospartoff = 0; + cyl = LABELSECTOR / lp->d_secpercyl; + if (dp) { + struct dos_partition *ap; + + /* read master boot record */ + bp->b_blkno = DOSBBSECTOR; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_BUSY | B_READ; + bp->b_cylin = DOSBBSECTOR / lp->d_secpercyl; + (*strat)(bp); + + /* if successful, wander through dos partition table */ + if (biowait(bp)) { + msg = "dos partition I/O error"; + goto done; + } else { + /* XXX how do we check veracity/bounds of this? */ + bcopy(bp->b_un.b_addr + DOSPARTOFF, dp, + NDOSPART * sizeof(*dp)); + for (i = 0; i < NDOSPART; i++, dp++) + /* is this ours? 
*/ + if (dp->dp_size && + dp->dp_typ == DOSPTYP_386BSD + && dospartoff == 0) { + + /* need sector address for SCSI/IDE, + cylinder for ESDI/ST506/RLL */ + dospartoff = dp->dp_start; + cyl = DPCYL(dp->dp_scyl, dp->dp_ssect); + + /* update disklabel with details */ + lp->d_partitions[0].p_size = + dp->dp_size; + lp->d_partitions[0].p_offset = + dp->dp_start; + lp->d_ntracks = dp->dp_ehd + 1; + lp->d_nsectors = DPSECT(dp->dp_esect); + lp->d_subtype |= (lp->d_subtype & 3) + + i | DSTYPE_INDOSPART; + lp->d_secpercyl = lp->d_ntracks * + lp->d_nsectors; + } + } + + } + + /* next, dig out disk label */ + bp->b_blkno = dospartoff + LABELSECTOR; + bp->b_cylin = cyl; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_BUSY | B_READ; + (*strat)(bp); + + /* if successful, locate disk label within block and validate */ + if (biowait(bp)) { + msg = "disk label I/O error"; + goto done; + } else for (dlp = (struct disklabel *)bp->b_un.b_addr; + dlp <= (struct disklabel *)(bp->b_un.b_addr+DEV_BSIZE-sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + + if (msg) + goto done; + + /* obtain bad sector table if requested and present */ + if (bdp && (lp->d_flags & D_BADSECT)) { + struct dkbad *db; + + i = 0; + do { + /* read a bad sector table */ + bp->b_flags = B_BUSY | B_READ; + bp->b_blkno = lp->d_secperunit - lp->d_nsectors + i; + if (lp->d_secsize > DEV_BSIZE) + bp->b_blkno *= lp->d_secsize / DEV_BSIZE; + else + bp->b_blkno /= DEV_BSIZE / lp->d_secsize; + bp->b_bcount = lp->d_secsize; + bp->b_cylin = lp->d_ncylinders - 1; + (*strat)(bp); + + /* if successful, validate, otherwise try another */ + if (biowait(bp)) { + msg = "bad sector table I/O error"; + } else { + db = (struct dkbad 
*)(bp->b_un.b_addr); +#define DKBAD_MAGIC 0x4321 + if (db->bt_mbz == 0 + && db->bt_flag == DKBAD_MAGIC) { + msg = NULL; + *bdp = *db; + break; + } else + msg = "bad sector table corrupted"; + } + } while ((bp->b_flags & B_ERROR) && (i += 2) < 10 && + i < lp->d_nsectors); + } + +done: + bp->b_flags = B_INVAL | B_AGE | B_READ; +#ifndef old + /* if desired, pass back allocated block so caller can use */ + if (bpp) + *bpp = bp; + else +#endif + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility + * before setting it. + */ +setdisklabel(olp, nlp, openmask, dp) + register struct disklabel *olp, *nlp; + u_long openmask; + struct dos_partition *dp; +{ + register i; + register struct partition *opp, *npp; + + /* sanity clause */ + if (nlp->d_secpercyl == 0 || nlp->d_secsize == 0 + || (nlp->d_secsize % DEV_BSIZE) != 0) + return(EINVAL); + + /* special case to allow disklabel to be invalidated */ + if (nlp->d_magic == 0xffffffff) { + *olp = *nlp; + return (0); + } + + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + + /* XXX missing check if other dos partitions will be overwritten */ + + while ((i = ffs((long)openmask)) != 0) { + i--; + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. XXX + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + + +/* + * Write disk label back to device after modification. 
+ */ +writedisklabel(dev, strat, lp, dp) + dev_t dev; + int (*strat)(); + register struct disklabel *lp; + struct dos_partition *dp; +{ + struct buf *bp; + struct disklabel *dlp; + int labelpart, error = 0, dospartoff, cyl, i; + + labelpart = dkpart(dev); +#ifdef nope + if (lp->d_partitions[labelpart].p_offset != 0) { + if (lp->d_partitions[0].p_offset != 0) + return (EXDEV); /* not quite right */ + labelpart = 0; + } +#else + labelpart = 3; +#endif + + bp = geteblk((int)lp->d_secsize); + /* request no partition relocation by driver on I/O operations */ + bp->b_dev = makedev(major(dev), dkminor((dkunit(dev)), 3)); + + /* do dos partitions in the process of getting disklabel? */ + dospartoff = 0; + cyl = LABELSECTOR / lp->d_secpercyl; + if (dp) { + bp->b_blkno = DOSBBSECTOR; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_BUSY | B_READ; + bp->b_cylin = DOSBBSECTOR / lp->d_secpercyl; + (*strat)(bp); + if ((error = biowait(bp)) == 0) { + bcopy(bp->b_un.b_addr + DOSPARTOFF, dp, + NDOSPART * sizeof(*dp)); + for (i = 0; i < NDOSPART; i++, dp++) + if(dp->dp_size && dp->dp_typ == DOSPTYP_386BSD + && dospartoff == 0) { + /* need sector address for SCSI/IDE, + cylinder for ESDI/ST506/RLL */ + dospartoff = dp->dp_start; + cyl = dp->dp_scyl | + ((dp->dp_ssect & 0xc0) << 2); + } + } + + } + +#ifdef maybe + /* disklabel in appropriate location? 
*/ + if (lp->d_partitions[0].p_offset != 0 + && lp->d_partitions[0].p_offset != dospartoff) { + error = EXDEV; + goto done; + } +#endif + + bp->b_blkno = dospartoff + LABELSECTOR; + bp->b_cylin = cyl; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_READ; + (*strat)(bp); + if (error = biowait(bp)) + goto done; + for (dlp = (struct disklabel *)bp->b_un.b_addr; + dlp <= (struct disklabel *) + (bp->b_un.b_addr + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags = B_WRITE; + (*strat)(bp); + error = biowait(bp); + goto done; + } + } + error = ESRCH; +done: + brelse(bp); + return (error); +} + +/* + * Compute checksum for disk label. + */ +dkcksum(lp) + register struct disklabel *lp; +{ + register u_short *start, *end; + register u_short sum = 0; + + start = (u_short *)lp; + end = (u_short *)&lp->d_partitions[lp->d_npartitions]; + while (start < end) + sum ^= *start++; + return (sum); +} + +/* + * Determine the size of the transfer, and make sure it is + * within the boundaries of the partition. Adjust transfer + * if needed, and signal errors or early completion. + */ +int +bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) +{ + struct partition *p = lp->d_partitions + dkpart(bp->b_dev); + int labelsect = lp->d_partitions[0].p_offset; + int maxsz = p->p_size, + sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; + + /* overwriting disk label ? */ + /* XXX should also protect bootstrap in first 8K */ + if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && +#if LABELSECTOR != 0 + bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && +#endif + (bp->b_flags & B_READ) == 0 && wlabel == 0) { + bp->b_error = EROFS; + goto bad; + } + +#if defined(DOSBBSECTOR) && defined(notyet) + /* overwriting master boot record? 
*/ + if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && + (bp->b_flags & B_READ) == 0 && wlabel == 0) { + bp->b_error = EROFS; + goto bad; + } +#endif + + /* beyond partition? */ + if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { + /* if exactly at end of disk, return an EOF */ + if (bp->b_blkno == maxsz) { + bp->b_resid = bp->b_bcount; + return(0); + } + /* or truncate if part of it fits */ + sz = maxsz - bp->b_blkno; + if (sz <= 0) { + bp->b_error = EINVAL; + goto bad; + } + bp->b_bcount = sz << DEV_BSHIFT; + } + + /* calculate cylinder for disksort to order transfers with */ + bp->b_cylin = (bp->b_blkno + p->p_offset) / lp->d_secpercyl; + return(1); + +bad: + bp->b_flags |= B_ERROR; + return(-1); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf + * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. + * The message should be completed (with at least a newline) with printf + * or addlog, respectively. There is no trailing space. + */ +void +diskerr(bp, dname, what, pri, blkdone, lp) + register struct buf *bp; + char *dname, *what; + int pri, blkdone; + register struct disklabel *lp; +{ + int unit = dkunit(bp->b_dev), part = dkpart(bp->b_dev); + register int (*pr) __P((const char *, ...)); + char partname = 'a' + part; + int sn; + + if (pri != LOG_PRINTF) { + log(pri, ""); + pr = addlog; + } else + pr = printf; + (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, + bp->b_flags & B_READ ? 
"read" : "writ"); + sn = bp->b_blkno; + if (bp->b_bcount <= DEV_BSIZE) + (*pr)("%d", sn); + else { + if (blkdone >= 0) { + sn += blkdone; + (*pr)("%d of ", sn); + } + (*pr)("%d-%d", bp->b_blkno, + bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); + } + if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { +#ifdef tahoe + sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ +#endif + sn += lp->d_partitions[part].p_offset; + (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, + sn / lp->d_secpercyl); + sn %= lp->d_secpercyl; + (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); + } + (*pr)("\n"); +} diff --git a/sys/ufs/ufs_inode.c b/sys/ufs/ufs_inode.c new file mode 100644 index 000000000000..5f8a9f377ec6 --- /dev/null +++ b/sys/ufs/ufs_inode.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 1982, 1986, 1989 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_inode.c 7.40 (Berkeley) 5/8/91 + * $Id: ufs_inode.c,v 1.3 1993/10/16 18:17:52 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "mount.h" +#include "proc.h" +#include "file.h" +#include "buf.h" +#include "vnode.h" +#include "kernel.h" +#include "malloc.h" + +#include "quota.h" +#include "inode.h" +#include "fs.h" +#include "ufsmount.h" + +#define INOHSZ 512 +#if ((INOHSZ&(INOHSZ-1)) == 0) +#define INOHASH(dev,ino) (((dev)+(ino))&(INOHSZ-1)) +#else +#define INOHASH(dev,ino) (((unsigned)((dev)+(ino)))%INOHSZ) +#endif + +union ihead { + union ihead *ih_head[2]; + struct inode *ih_chain[2]; +} ihead[INOHSZ]; + +int prtactive; /* 1 => print out reclaim of active vnodes */ + +/* + * Initialize hash links for inodes. + */ +ufs_init() +{ + register int i; + register union ihead *ih = ihead; + +#ifndef lint + if (VN_MAXPRIVATE < sizeof(struct inode)) + panic("ihinit: too small"); +#endif /* not lint */ + for (i = INOHSZ; --i >= 0; ih++) { + ih->ih_head[0] = ih; + ih->ih_head[1] = ih; + } +#ifdef QUOTA + dqinit(); +#endif /* QUOTA */ +} + +/* + * Look up a UFS dinode number to find its incore vnode. 
+ * If it is not in core, read it in from the specified device. + * If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount + * points must be done by the calling routine. + */ +iget(xp, ino, ipp) + struct inode *xp; + ino_t ino; + struct inode **ipp; +{ + dev_t dev = xp->i_dev; + struct mount *mntp = ITOV(xp)->v_mount; + register struct fs *fs = VFSTOUFS(mntp)->um_fs; + extern struct vnodeops ufs_vnodeops, spec_inodeops; + register struct inode *ip, *iq; + register struct vnode *vp; + struct vnode *nvp; + struct buf *bp; + struct dinode *dp; + union ihead *ih; + int i, error; + + ih = &ihead[INOHASH(dev, ino)]; +loop: + for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) { + if (ino != ip->i_number || dev != ip->i_dev) + continue; + if ((ip->i_flag&ILOCKED) != 0) { + ip->i_flag |= IWANT; + sleep((caddr_t)ip, PINOD); + goto loop; + } + if (vget(ITOV(ip))) + goto loop; + *ipp = ip; + return(0); + } + /* + * Allocate a new inode. + */ + if (error = getnewvnode(VT_UFS, mntp, &ufs_vnodeops, &nvp)) { + *ipp = 0; + return (error); + } + ip = VTOI(nvp); + ip->i_vnode = nvp; + ip->i_flag = 0; + ip->i_devvp = 0; + ip->i_mode = 0; + ip->i_diroff = 0; + ip->i_lockf = 0; +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; +#endif + for (i=0; i < DI_SPARE_SZ; i++) + ip->i_di_spare[i] = (unsigned long)0L; + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip->i_dev = dev; + ip->i_number = ino; + insque(ip, ih); + ILOCK(ip); + /* + * Read in the disk contents for the inode. 
+ */ + if (error = bread(VFSTOUFS(mntp)->um_devvp, fsbtodb(fs, itod(fs, ino)), + (int)fs->fs_bsize, NOCRED, &bp)) { + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. + * Iput() will take care of putting it back on the free list. + */ + remque(ip); + ip->i_forw = ip; + ip->i_back = ip; + /* + * Unlock and discard unneeded inode. + */ + iput(ip); + brelse(bp); + *ipp = 0; + return (error); + } + dp = bp->b_un.b_dino; + dp += itoo(fs, ino); + ip->i_din = *dp; + brelse(bp); + /* + * Initialize the associated vnode + */ + vp = ITOV(ip); + vp->v_type = IFTOVT(ip->i_mode); + if (vp->v_type == VFIFO) { +#ifdef FIFO + extern struct vnodeops fifo_inodeops; + vp->v_op = &fifo_inodeops; +#else + iput(ip); + *ipp = 0; + return (EOPNOTSUPP); +#endif /* FIFO */ + } + if (vp->v_type == VCHR || vp->v_type == VBLK) { + vp->v_op = &spec_inodeops; + if (nvp = checkalias(vp, ip->i_rdev, mntp)) { + /* + * Reinitialize aliased inode. + */ + vp = nvp; + iq = VTOI(vp); + iq->i_vnode = vp; + iq->i_flag = 0; + ILOCK(iq); + iq->i_din = ip->i_din; + iq->i_dev = dev; + iq->i_number = ino; + insque(iq, ih); + /* + * Discard unneeded vnode + */ + ip->i_mode = 0; + iput(ip); + ip = iq; + } + } + if (ino == ROOTINO) + vp->v_flag |= VROOT; + /* + * Finish inode initialization. + */ + ip->i_fs = fs; + ip->i_devvp = VFSTOUFS(mntp)->um_devvp; + VREF(ip->i_devvp); + /* + * Set up a generation number for this inode if it does not + * already have one. This should only happen on old filesystems. + */ + if (ip->i_gen == 0) { + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ip->i_flag |= IMOD; + } + *ipp = ip; + return (0); +} + +/* + * Unlock and decrement the reference count of an inode structure. 
+ */ +iput(ip) + register struct inode *ip; +{ + + if ((ip->i_flag & ILOCKED) == 0) + panic("iput"); + IUNLOCK(ip); + vrele(ITOV(ip)); +} + +/* + * Last reference to an inode, write the inode out and if necessary, + * truncate and deallocate the file. + */ +ufs_inactive(vp, p) + struct vnode *vp; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + int mode, error = 0; + + if (prtactive && vp->v_usecount != 0) + vprint("ufs_inactive: pushing active", vp); + /* + * Get rid of inodes related to stale file handles. + */ + if (ip->i_mode == 0) { + if ((vp->v_flag & VXLOCK) == 0) + vgone(vp); + return (0); + } + ILOCK(ip); + if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { +#ifdef QUOTA + if (!getinoquota(ip)) + (void) chkiq(ip, -1, NOCRED, 0); +#endif + error = itrunc(ip, (u_long)0, 0); + mode = ip->i_mode; + ip->i_mode = 0; + ip->i_rdev = 0; + ip->i_flag |= IUPD|ICHG; + ifree(ip, ip->i_number, mode); + } + IUPDAT(ip, &time, &time, 0); + IUNLOCK(ip); + ip->i_flag = 0; + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + if (vp->v_usecount == 0 && ip->i_mode == 0) + vgone(vp); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +ufs_reclaim(vp) + register struct vnode *vp; +{ + register struct inode *ip = VTOI(vp); + int i; + + if (prtactive && vp->v_usecount != 0) + vprint("ufs_reclaim: pushing active", vp); + /* + * Remove the inode from its hash chain. + */ + remque(ip); + ip->i_forw = ip; + ip->i_back = ip; + /* + * Purge old data structures associated with the inode. 
+ */ + cache_purge(vp); + if (ip->i_devvp) { + vrele(ip->i_devvp); + ip->i_devvp = 0; + } +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) { + if (ip->i_dquot[i] != NODQUOT) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } +#endif + ip->i_flag = 0; + return (0); +} + +/* + * Update the access, modified, and inode change times as specified + * by the IACC, IMOD, and ICHG flags respectively. The IUPD flag + * is used to specify that the inode needs to be updated but that + * the times have already been set. The access and modified times + * are taken from the second and third parameters; the inode change + * time is always taken from the current time. If waitfor is set, + * then wait for the disk write of the inode to complete. + */ +iupdat(ip, ta, tm, waitfor) + register struct inode *ip; + struct timeval *ta, *tm; + int waitfor; +{ + struct buf *bp; + struct vnode *vp = ITOV(ip); + struct dinode *dp; + register struct fs *fs; + int error; + + fs = ip->i_fs; + if ((ip->i_flag & (IUPD|IACC|ICHG|IMOD)) == 0) + return (0); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + error = bread(ip->i_devvp, fsbtodb(fs, itod(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + if (ip->i_flag&IACC) + ip->i_atime = ta->tv_sec; + if (ip->i_flag&IUPD) + ip->i_mtime = tm->tv_sec; + if (ip->i_flag&ICHG) + ip->i_ctime = time.tv_sec; + ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD); + dp = bp->b_un.b_dino + itoo(fs, ip->i_number); + *dp = ip->i_din; + if (waitfor) { + return (bwrite(bp)); + } else { + bdwrite(bp); + return (0); + } +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode ip to at most length size. Free affected disk + * blocks -- the blocks of the file are removed in reverse order. + * + * NB: triple indirect blocks are untested. 
+ */ +itrunc(oip, length, flags) + register struct inode *oip; + u_long length; + int flags; +{ + register daddr_t lastblock; + daddr_t bn, lbn, lastiblock[NIADDR]; + register struct fs *fs; + register struct inode *ip; + struct buf *bp; + int offset, osize, size, level; + long count, nblocks, blocksreleased = 0; + register int i; + int aflags, error, allerror; + struct inode tip; + + vnode_pager_setsize(ITOV(oip), length); + if (FASTLINK(oip)) { + if (length != 0) + panic("itrunc fastlink to non-zero"); + bzero(oip->i_symlink, MAXFASTLINK); + oip->i_size = 0; + oip->i_din.di_spare[0] = 0; + } + if (oip->i_size <= length) { + oip->i_flag |= ICHG|IUPD; + error = iupdat(oip, &time, &time, 1); + return (error); + } + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + fs = oip->i_fs; + lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->fs_bsize); + /* + * Update the size of the file. If the file is not being + * truncated to a block boundry, the contents of the + * partial block following the end of the file must be + * zero'ed in case it ever become accessable again because + * of subsequent file growth. 
+ */ + osize = oip->i_size; + offset = blkoff(fs, length); + if (offset == 0) { + oip->i_size = length; + } else { + lbn = lblkno(fs, length); + aflags = B_CLRBUF; + if (flags & IO_SYNC) + aflags |= B_SYNC; +#ifdef QUOTA + if (error = getinoquota(oip)) + return (error); +#endif + if (error = balloc(oip, lbn, offset, &bp, aflags)) + return (error); + oip->i_size = length; + size = blksize(fs, oip, lbn); + (void) vnode_pager_uncache(ITOV(oip)); + bzero(bp->b_un.b_addr + offset, (unsigned)(size - offset)); + allocbuf(bp, size); + if (flags & IO_SYNC) + bwrite(bp); + else + bdwrite(bp); + } + /* + * Update file and block pointers + * on disk before we start freeing blocks. + * If we crash before free'ing blocks below, + * the blocks will be returned to the free list. + * lastiblock values are also normalized to -1 + * for calls to indirtrunc below. + */ + tip = *oip; + tip.i_size = osize; + for (level = TRIPLE; level >= SINGLE; level--) + if (lastiblock[level] < 0) { + oip->i_ib[level] = 0; + lastiblock[level] = -1; + } + for (i = NDADDR - 1; i > lastblock; i--) + oip->i_db[i] = 0; + oip->i_flag |= ICHG|IUPD; + vinvalbuf(ITOV(oip), (length > 0)); + allerror = iupdat(oip, &time, &time, MNT_WAIT); + + /* + * Indirect blocks first. + */ + ip = &tip; + for (level = TRIPLE; level >= SINGLE; level--) { + bn = ip->i_ib[level]; + if (bn != 0) { + error = indirtrunc(ip, bn, lastiblock[level], level, + &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + ip->i_ib[level] = 0; + blkfree(ip, bn, (off_t)fs->fs_bsize); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. 
+ */ + for (i = NDADDR - 1; i > lastblock; i--) { + register off_t bsize; + + bn = ip->i_db[i]; + if (bn == 0) + continue; + ip->i_db[i] = 0; + bsize = (off_t)blksize(fs, ip, i); + blkfree(ip, bn, bsize); + blocksreleased += btodb(bsize); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + bn = ip->i_db[lastblock]; + if (bn != 0) { + off_t oldspace, newspace; + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. + */ + oldspace = blksize(fs, ip, lastblock); + ip->i_size = length; + newspace = blksize(fs, ip, lastblock); + if (newspace == 0) + panic("itrunc: newspace"); + if (oldspace - newspace > 0) { + /* + * Block number of space to be free'd is + * the old block # plus the number of frags + * required for the storage we're keeping. + */ + bn += numfrags(fs, newspace); + blkfree(ip, bn, oldspace - newspace); + blocksreleased += btodb(oldspace - newspace); + } + } +done: +/* BEGIN PARANOIA */ + for (level = SINGLE; level <= TRIPLE; level++) + if (ip->i_ib[level] != oip->i_ib[level]) + panic("itrunc1"); + for (i = 0; i < NDADDR; i++) + if (ip->i_db[i] != oip->i_db[i]) + panic("itrunc2"); +/* END PARANOIA */ + oip->i_blocks -= blocksreleased; + if (oip->i_blocks < 0) /* sanity */ + oip->i_blocks = 0; + oip->i_flag |= ICHG; +#ifdef QUOTA + if (!getinoquota(oip)) + (void) chkdq(oip, -blocksreleased, NOCRED, 0); +#endif + return (allerror); +} + +/* + * Release blocks associated with the inode ip and + * stored in the indirect block bn. Blocks are free'd + * in LIFO order up to (but not including) lastbn. If + * level is greater than SINGLE, the block is an indirect + * block and recursive calls to indirtrunc must be used to + * cleanse other indirect blocks. + * + * NB: triple indirect blocks are untested. 
+ */ +indirtrunc(ip, bn, lastbn, level, countp) + register struct inode *ip; + daddr_t bn, lastbn; + int level; + long *countp; +{ + register int i; + struct buf *bp; + register struct fs *fs = ip->i_fs; + register daddr_t *bap; + daddr_t *copy, nb, last; + long blkcount, factor; + int nblocks, blocksreleased = 0; + int error, allerror = 0; + + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btodb(fs->fs_bsize); + /* + * Get buffer of block pointers, zero those + * entries corresponding to blocks to be free'd, + * and update on disk copy first. + */ + error = bread(ip->i_devvp, fsbtodb(fs, bn), (int)fs->fs_bsize, + NOCRED, &bp); + if (error) { + brelse(bp); + *countp = 0; + return (error); + } + bap = bp->b_un.b_daddr; + MALLOC(copy, daddr_t *, fs->fs_bsize, M_TEMP, M_WAITOK); + bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->fs_bsize); + bzero((caddr_t)&bap[last + 1], + (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t)); + if (last == -1) + bp->b_flags |= B_INVAL; + error = bwrite(bp); + if (error) + allerror = error; + bap = copy; + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1; i > last; i--) { + nb = bap[i]; + if (nb == 0) + continue; + if (level > SINGLE) { + error = indirtrunc(ip, nb, (daddr_t)-1, level - 1, + &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + blkfree(ip, nb, (off_t)fs->fs_bsize); + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. 
+ */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + nb = bap[i]; + if (nb != 0) { + error = indirtrunc(ip, nb, last, level - 1, &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + } + FREE(copy, M_TEMP); + *countp = blocksreleased; + return (allerror); +} + +/* + * Lock an inode. If its already locked, set the WANT bit and sleep. + */ +ilock(ip) + register struct inode *ip; +{ + + while (ip->i_flag & ILOCKED) { + ip->i_flag |= IWANT; + if (ip->i_spare0 == curproc->p_pid) + panic("locking against myself"); + ip->i_spare1 = curproc->p_pid; + (void) sleep((caddr_t)ip, PINOD); + } + ip->i_spare1 = 0; + ip->i_spare0 = curproc->p_pid; + ip->i_flag |= ILOCKED; +} + +/* + * Unlock an inode. If WANT bit is on, wakeup. + */ +iunlock(ip) + register struct inode *ip; +{ + + if ((ip->i_flag & ILOCKED) == 0) + vprint("iunlock: unlocked inode", ITOV(ip)); + ip->i_spare0 = 0; + ip->i_flag &= ~ILOCKED; + if (ip->i_flag&IWANT) { + ip->i_flag &= ~IWANT; + wakeup((caddr_t)ip); + } +} diff --git a/sys/ufs/ufs_lockf.c b/sys/ufs/ufs_lockf.c new file mode 100644 index 000000000000..98aeb8f9d714 --- /dev/null +++ b/sys/ufs/ufs_lockf.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 1982, 1986, 1989 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_lockf.c 7.7 (Berkeley) 7/2/91 + * $Id: ufs_lockf.c,v 1.5 1993/10/25 03:19:43 davidg Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "kernel.h" +#include "file.h" +#include "proc.h" +#include "vnode.h" +#include "malloc.h" +#include "fcntl.h" + +#include "lockf.h" +#include "quota.h" +#include "inode.h" + + + +/* + * Advisory record locking support + */ +lf_advlock(head, size, id, op, fl, flags) + struct lockf **head; + u_long size; + caddr_t id; + int op; + register struct flock *fl; + int flags; +{ + register struct lockf *lock; + off_t start, end; + int error; + + /* + * Avoid the common case of unlocking when inode has no locks. 
+ */ + if (*head == (struct lockf *)0) { + if (op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + start = size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len == 0) + end = -1; + else + end = start + fl->l_len - 1; + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = id; + lock->lf_head = head; + lock->lf_type = fl->l_type; + lock->lf_next = (struct lockf *)0; + lock->lf_block = (struct lockf *)0; + lock->lf_flags = flags; + /* + * Do the requested operation. + */ + switch(op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. + */ +int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +int lockf_debug = 0; +#endif /* LOCKF_DEBUG */ + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 + +/* + * Set a byte-range lock. 
+ */ +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct lockf **head = lock->lf_head; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while (block = lf_getblock(lock)) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + wproc = (struct proc *)block->lf_id; + while (wproc->p_wchan && + (wproc->p_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)wproc->p_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. 
+ */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + lf_addblock(block, lock); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + if (error = tsleep((caddr_t)lock, priority, lockstr, 0)) { + +#ifdef PK_LOCKF_FIX /* Paul Kranenburg's lockf fix (buggy!) */ + /* Don't leave a dangling pointer in block list */ + if (lf_getblock(lock) == block) { + struct lockf **prev; + + /* Still there, find us on list */ + prev = &block->lf_block; + while ((block = block->lf_block) != NOLOCKF) { + if (block == lock) { + *prev = block->lf_block; + break; + } + prev = &block->lf_block; + } + } + free(lock, M_LOCKF); +#else /* Mark Tinguely's fix instead */ + (void) lf_clearlock(lock); + return (error); +#endif +#if 0 /* ...and this is the original code -DLG */ + free(lock, M_LOCKF); +#endif + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. + * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. + */ + prev = head; + block = *head; + needtolink = 1; + for (;;) { + if (ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap)) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. 
+ */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + ltmp = lock->lf_block; + lock->lf_block = overlap->lf_block; + lf_addblock(lock, ltmp); + } + /* + * Add the new lock if necessary and delete the overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. 
+ * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct lockf **head = unlock->lf_head; + register struct lockf *lf = *head; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = head; + while (ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) { + /* + * Wakeup the list of locks to be retried. + */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. 
+ */ +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + off_t start, end; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if (block = lf_getblock(lock)) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. + */ +struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = *(lock->lf_head); + int ovcase; + + prev = lock->lf_head; + while (ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. 
+ */ +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || 
(lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Add a lock to the end of the blocked list. + */ +lf_addblock(lock, blocked) + struct lockf *lock; + struct lockf *blocked; +{ + register struct lockf *lf; + + if (blocked == NOLOCKF) + return; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("addblock: adding", blocked); + lf_print("to blocked list of", lock); + } +#endif /* LOCKF_DEBUG */ + if ((lf = lock->lf_block) == NOLOCKF) { + lock->lf_block = blocked; + return; + } + while (lf->lf_block != NOLOCKF) + lf = lf->lf_block; + lf->lf_block = blocked; + return; +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if spliting into only two pieces. 
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + splitlock->lf_block = NOLOCKF; + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *blocklist, *wakelock; + + blocklist = listhead->lf_block; + listhead->lf_block = NOLOCKF; + while (blocklist != NOLOCKF) { + wakelock = blocklist; + blocklist = blocklist->lf_block; + wakelock->lf_block = NOLOCKF; + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup((caddr_t)wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock 0x%lx for ", tag, lock); + if (lock->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); + else + printf("id 0x%x", lock->lf_id); + printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d", + lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? 
"unlock" : + "unknown", lock->lf_start, lock->lf_end); + if (lock->lf_block) + printf(" block 0x%x\n", lock->lf_block); + else + printf("\n"); +} + +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf; + + printf("%s: Lock list for ino %d on dev <%d, %d>:\n", + tag, lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock 0x%lx for ", lf); + if (lf->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); + else + printf("id 0x%x", lf->lf_id); + printf(", %s, start %d, end %d", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", lf->lf_start, lf->lf_end); + if (lf->lf_block) + printf(" block 0x%x\n", lf->lf_block); + else + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/ufs/ufs_lookup.c b/sys/ufs/ufs_lookup.c new file mode 100644 index 000000000000..c557e4e3bec3 --- /dev/null +++ b/sys/ufs/ufs_lookup.c @@ -0,0 +1,915 @@ +/* + * Copyright (c) 1989 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_lookup.c 7.33 (Berkeley) 5/19/91 + * $Id: ufs_lookup.c,v 1.4 1993/10/16 18:17:55 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "namei.h" +#include "buf.h" +#include "file.h" +#include "vnode.h" + +#include "quota.h" +#include "inode.h" +#include "dir.h" +#include "fs.h" + +struct nchstats nchstats; +#ifdef DIAGNOSTIC +int dirchk = 1; +#else +int dirchk = 0; +#endif + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. 
+ * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and iput + * instead of two iputs. + * + * Overall outline of ufs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + * + * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode unlocked. 
 */
/* K&R: implicit int return; 0 on success, else an errno (ENOTDIR, ENOENT, ...). */
ufs_lookup(vdp, ndp, p)
	register struct vnode *vdp;
	register struct nameidata *ndp;
	struct proc *p;
{
	register struct inode *dp;	/* the directory we are searching */
	register struct fs *fs;		/* file system that directory is in */
	struct buf *bp = 0;		/* a buffer of directory entries */
	register struct direct *ep;	/* the current directory entry */
	int entryoffsetinblock;		/* offset of ep in bp's buffer */
	enum {NONE, COMPACT, FOUND} slotstatus;
	int slotoffset = -1;		/* offset of area with free space */
	int slotsize;			/* size of area at slotoffset */
	int slotfreespace;		/* amount of space free in slot */
	int slotneeded;			/* size of the entry we're seeking */
	int numdirpasses;		/* strategy for directory search */
	int endsearch;			/* offset to end directory search */
	int prevoff;			/* ndp->ni_ufs.ufs_offset of previous entry */
	struct inode *pdp;		/* saved dp during symlink work */
	struct inode *tdp;		/* returned by iget */
	off_t enduseful;		/* pointer past last used dir slot */
	int flag;			/* LOOKUP, CREATE, RENAME, or DELETE */
	int lockparent;			/* 1 => lockparent flag is set */
	int wantparent;			/* 1 => wantparent or lockparent flag */
	int error;

	ndp->ni_dvp = vdp;
	ndp->ni_vp = NULL;
	dp = VTOI(vdp);
	fs = dp->i_fs;
	lockparent = ndp->ni_nameiop & LOCKPARENT;
	flag = ndp->ni_nameiop & OPMASK;
	wantparent = ndp->ni_nameiop & (LOCKPARENT|WANTPARENT);

	/*
	 * Check accessiblity of directory.
	 */
	if ((dp->i_mode&IFMT) != IFDIR)
		return (ENOTDIR);
	if (error = ufs_access(vdp, VEXEC, ndp->ni_cred, p))
		return (error);

	/*
	 * We now have a segment name to search for, and a directory to search.
	 *
	 * Before tediously performing a linear scan of the directory,
	 * check the name cache to see if the directory/name pair
	 * we are looking for is known already.
	 */
	if (error = cache_lookup(ndp)) {
		int vpid;	/* capability number of vnode */

		if (error == ENOENT)
			return (error);
#ifdef PARANOID
		if (vdp == ndp->ni_rootdir && ndp->ni_isdotdot)
			panic("ufs_lookup: .. through root");
#endif
		/*
		 * Get the next vnode in the path.
		 * See comment below starting `Step through' for
		 * an explaination of the locking protocol.
		 */
		pdp = dp;
		dp = VTOI(ndp->ni_vp);
		vdp = ndp->ni_vp;
		vpid = vdp->v_id;
		if (pdp == dp) {
			/* lookup of "." — same inode, just gain a reference */
			VREF(vdp);
			error = 0;
		} else if (ndp->ni_isdotdot) {
			/* unlock child before locking parent-ward, to avoid deadlock */
			IUNLOCK(pdp);
			error = vget(vdp);
			if (!error && lockparent && *ndp->ni_next == '\0')
				ILOCK(pdp);
		} else {
			error = vget(vdp);
			if (!lockparent || error || *ndp->ni_next != '\0')
				IUNLOCK(pdp);
		}
		/*
		 * Check that the capability number did not change
		 * while we were waiting for the lock.
		 */
		if (!error) {
			if (vpid == vdp->v_id)
				return (0);
			/* vnode was recycled while we slept; fall back to a real search */
			iput(dp);
			if (lockparent && pdp != dp && *ndp->ni_next == '\0')
				IUNLOCK(pdp);
		}
		ILOCK(pdp);
		dp = pdp;
		vdp = ITOV(dp);
		ndp->ni_vp = NULL;
	}

	/*
	 * Suppress search for slots unless creating
	 * file and at end of pathname, in which case
	 * we watch for a place to put the new file in
	 * case it doesn't already exist.
	 */
	slotstatus = FOUND;
	if ((flag == CREATE || flag == RENAME) && *ndp->ni_next == 0) {
		slotstatus = NONE;
		slotfreespace = 0;
		/* on-disk size the new entry will need (name rounded to 4 bytes) */
		slotneeded = ((sizeof (struct direct) - (MAXNAMLEN + 1)) +
			((ndp->ni_namelen + 1 + 3) &~ 3));
	}

	/*
	 * If there is cached information on a previous search of
	 * this directory, pick up where we last left off.
	 * We cache only lookups as these are the most common
	 * and have the greatest payoff. Caching CREATE has little
	 * benefit as it usually must search the entire directory
	 * to determine that the entry does not exist. Caching the
	 * location of the last DELETE or RENAME has not reduced
	 * profiling time and hence has been removed in the interest
	 * of simplicity.
	 */
	if (flag != LOOKUP || dp->i_diroff == 0 || dp->i_diroff > dp->i_size) {
		ndp->ni_ufs.ufs_offset = 0;
		numdirpasses = 1;
	} else {
		ndp->ni_ufs.ufs_offset = dp->i_diroff;
		entryoffsetinblock = blkoff(fs, ndp->ni_ufs.ufs_offset);
		if (entryoffsetinblock != 0) {
			if (error = blkatoff(dp, ndp->ni_ufs.ufs_offset,
			    (char **)0, &bp))
				return (error);
		}
		numdirpasses = 2;
		nchstats.ncs_2passes++;
	}
	endsearch = roundup(dp->i_size, DIRBLKSIZ);
	enduseful = 0;

searchloop:
	/* NB: bp is carried across iterations and released on block boundaries */
	while (ndp->ni_ufs.ufs_offset < endsearch) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(fs, ndp->ni_ufs.ufs_offset) == 0) {
			if (bp != NULL)
				brelse(bp);
			if (error = blkatoff(dp, ndp->ni_ufs.ufs_offset,
			    (char **)0, &bp))
				return (error);
			entryoffsetinblock = 0;
		}
		/*
		 * If still looking for a slot, and at a DIRBLKSIZE
		 * boundary, have to start looking for free space again.
		 */
		if (slotstatus == NONE &&
		    (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) {
			slotoffset = -1;
			slotfreespace = 0;
		}
		/*
		 * Get pointer to next entry.
		 * Full validation checks are slow, so we only check
		 * enough to insure forward progress through the
		 * directory. Complete checks can be run by patching
		 * "dirchk" to be true.
		 */
		ep = (struct direct *)(bp->b_un.b_addr + entryoffsetinblock);
		if (ep->d_reclen == 0 ||
		    dirchk && dirbadentry(ep, entryoffsetinblock)) {
			int i;

			/* corrupt entry: skip the rest of this DIRBLKSIZ block */
			dirbad(dp, ndp->ni_ufs.ufs_offset, "mangled entry");
			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
			ndp->ni_ufs.ufs_offset += i;
			entryoffsetinblock += i;
			continue;
		}

		/*
		 * If an appropriate sized slot has not yet been found,
		 * check to see if one is available. Also accumulate space
		 * in the current block so that we can determine if
		 * compaction is viable.
		 */
		if (slotstatus != FOUND) {
			int size = ep->d_reclen;

			if (ep->d_ino != 0)
				size -= DIRSIZ(ep);
			if (size > 0) {
				if (size >= slotneeded) {
					slotstatus = FOUND;
					slotoffset = ndp->ni_ufs.ufs_offset;
					slotsize = ep->d_reclen;
				} else if (slotstatus == NONE) {
					slotfreespace += size;
					if (slotoffset == -1)
						slotoffset =
						      ndp->ni_ufs.ufs_offset;
					if (slotfreespace >= slotneeded) {
						slotstatus = COMPACT;
						slotsize =
						      ndp->ni_ufs.ufs_offset +
						      ep->d_reclen - slotoffset;
					}
				}
			}
		}

		/*
		 * Check for a name match.
		 */
		if (ep->d_ino) {
			if (ep->d_namlen == ndp->ni_namelen &&
			    !bcmp(ndp->ni_ptr, ep->d_name,
				(unsigned)ep->d_namlen)) {
				/*
				 * Save directory entry's inode number and
				 * reclen in ndp->ni_ufs area, and release
				 * directory buffer.
				 */
				ndp->ni_ufs.ufs_ino = ep->d_ino;
				ndp->ni_ufs.ufs_reclen = ep->d_reclen;
				goto found;
			}
		}
		prevoff = ndp->ni_ufs.ufs_offset;
		ndp->ni_ufs.ufs_offset += ep->d_reclen;
		entryoffsetinblock += ep->d_reclen;
		if (ep->d_ino)
			enduseful = ndp->ni_ufs.ufs_offset;
	}
/* notfound: */
	/*
	 * If we started in the middle of the directory and failed
	 * to find our target, we must check the beginning as well.
	 */
	if (numdirpasses == 2) {
		numdirpasses--;
		ndp->ni_ufs.ufs_offset = 0;
		endsearch = dp->i_diroff;
		goto searchloop;
	}
	if (bp != NULL)
		brelse(bp);
	/*
	 * If creating, and at end of pathname and current
	 * directory has not been removed, then can consider
	 * allowing file to be created.
	 */
	if ((flag == CREATE || flag == RENAME) &&
	    *ndp->ni_next == 0 && dp->i_nlink != 0) {
		/*
		 * Access for write is interpreted as allowing
		 * creation of files in the directory.
		 */
		if (error = ufs_access(vdp, VWRITE, ndp->ni_cred, p))
			return (error);
		/*
		 * Return an indication of where the new directory
		 * entry should be put.  If we didn't find a slot,
		 * then set ndp->ni_ufs.ufs_count to 0 indicating
		 * that the new slot belongs at the end of the
		 * directory. If we found a slot, then the new entry
		 * can be put in the range from ndp->ni_ufs.ufs_offset
		 * to ndp->ni_ufs.ufs_offset + ndp->ni_ufs.ufs_count.
		 */
		if (slotstatus == NONE) {
			ndp->ni_ufs.ufs_offset = roundup(dp->i_size, DIRBLKSIZ);
			ndp->ni_ufs.ufs_count = 0;
			enduseful = ndp->ni_ufs.ufs_offset;
		} else {
			ndp->ni_ufs.ufs_offset = slotoffset;
			ndp->ni_ufs.ufs_count = slotsize;
			if (enduseful < slotoffset + slotsize)
				enduseful = slotoffset + slotsize;
		}
		/* ufs_endoff lets direnter() truncate trailing unused blocks */
		ndp->ni_ufs.ufs_endoff = roundup(enduseful, DIRBLKSIZ);
		dp->i_flag |= IUPD|ICHG;
		/*
		 * We return with the directory locked, so that
		 * the parameters we set up above will still be
		 * valid if we actually decide to do a direnter().
		 * We return ni_vp == NULL to indicate that the entry
		 * does not currently exist; we leave a pointer to
		 * the (locked) directory inode in ndp->ni_dvp.
		 * The pathname buffer is saved so that the name
		 * can be obtained later.
		 *
		 * NB - if the directory is unlocked, then this
		 * information cannot be used.
		 */
		ndp->ni_nameiop |= SAVENAME;
		if (!lockparent)
			IUNLOCK(dp);
	}
	/*
	 * Insert name into cache (as non-existent) if appropriate.
	 */
	if (ndp->ni_makeentry && flag != CREATE)
		cache_enter(ndp);
	return (ENOENT);

found:
	if (numdirpasses == 2)
		nchstats.ncs_pass2++;
	/*
	 * Check that directory length properly reflects presence
	 * of this entry.
	 */
	if (entryoffsetinblock + DIRSIZ(ep) > dp->i_size) {
		dirbad(dp, ndp->ni_ufs.ufs_offset, "i_size too small");
		dp->i_size = entryoffsetinblock + DIRSIZ(ep);
		dp->i_flag |= IUPD|ICHG;
	}

	brelse(bp);

	/*
	 * Found component in pathname.
	 * If the final component of path name, save information
	 * in the cache as to where the entry was found.
	 */
	if (*ndp->ni_next == '\0' && flag == LOOKUP)
		dp->i_diroff = ndp->ni_ufs.ufs_offset &~ (DIRBLKSIZ - 1);

	/*
	 * If deleting, and at end of pathname, return
	 * parameters which can be used to remove file.
	 * If the wantparent flag isn't set, we return only
	 * the directory (in ndp->ni_dvp), otherwise we go
	 * on and lock the inode, being careful with ".".
	 */
	if (flag == DELETE && *ndp->ni_next == 0) {
		/*
		 * Write access to directory required to delete files.
		 */
		if (error = ufs_access(vdp, VWRITE, ndp->ni_cred, p))
			return (error);
		/*
		 * Return pointer to current entry in ndp->ni_ufs.ufs_offset,
		 * and distance past previous entry (if there
		 * is a previous entry in this block) in ndp->ni_ufs.ufs_count.
		 * Save directory inode pointer in ndp->ni_dvp for dirremove().
		 */
		if ((ndp->ni_ufs.ufs_offset&(DIRBLKSIZ-1)) == 0)
			ndp->ni_ufs.ufs_count = 0;
		else
			ndp->ni_ufs.ufs_count = ndp->ni_ufs.ufs_offset - prevoff;
		if (dp->i_number == ndp->ni_ufs.ufs_ino) {
			VREF(vdp);
			ndp->ni_vp = vdp;
			return (0);
		}
		if (error = iget(dp, ndp->ni_ufs.ufs_ino, &tdp))
			return (error);
		/*
		 * If directory is "sticky", then user must own
		 * the directory, or the file in it, else she
		 * may not delete it (unless she's root). This
		 * implements append-only directories.
		 */
		if ((dp->i_mode & ISVTX) &&
		    ndp->ni_cred->cr_uid != 0 &&
		    ndp->ni_cred->cr_uid != dp->i_uid &&
		    tdp->i_uid != ndp->ni_cred->cr_uid) {
			iput(tdp);
			return (EPERM);
		}
		ndp->ni_vp = ITOV(tdp);
		if (!lockparent)
			IUNLOCK(dp);
		return (0);
	}

	/*
	 * If rewriting (RENAME), return the inode and the
	 * information required to rewrite the present directory
	 * Must get inode of directory entry to verify it's a
	 * regular file, or empty directory.
	 */
	if (flag == RENAME && wantparent && *ndp->ni_next == 0) {
		if (error = ufs_access(vdp, VWRITE, ndp->ni_cred, p))
			return (error);
		/*
		 * Careful about locking second inode.
		 * This can only occur if the target is ".".
		 */
		if (dp->i_number == ndp->ni_ufs.ufs_ino)
			return (EISDIR);
		if (error = iget(dp, ndp->ni_ufs.ufs_ino, &tdp))
			return (error);
		ndp->ni_vp = ITOV(tdp);
		ndp->ni_nameiop |= SAVENAME;
		if (!lockparent)
			IUNLOCK(dp);
		return (0);
	}

	/*
	 * Step through the translation in the name. We do not `iput' the
	 * directory because we may need it again if a symbolic link
	 * is relative to the current directory. Instead we save it
	 * unlocked as "pdp". We must get the target inode before unlocking
	 * the directory to insure that the inode will not be removed
	 * before we get it. We prevent deadlock by always fetching
	 * inodes from the root, moving down the directory tree. Thus
	 * when following backward pointers ".." we must unlock the
	 * parent directory before getting the requested directory.
	 * There is a potential race condition here if both the current
	 * and parent directories are removed before the `iget' for the
	 * inode associated with ".." returns. We hope that this occurs
	 * infrequently since we cannot avoid this race condition without
	 * implementing a sophisticated deadlock detection algorithm.
	 * Note also that this simple deadlock detection scheme will not
	 * work if the file system has any hard links other than ".."
	 * that point backwards in the directory structure.
	 */
	pdp = dp;
	if (ndp->ni_isdotdot) {
		IUNLOCK(pdp);	/* race to get the inode */
		if (error = iget(dp, ndp->ni_ufs.ufs_ino, &tdp)) {
			ILOCK(pdp);
			return (error);
		}
		if (lockparent && *ndp->ni_next == '\0')
			ILOCK(pdp);
		ndp->ni_vp = ITOV(tdp);
	} else if (dp->i_number == ndp->ni_ufs.ufs_ino) {
		VREF(vdp);	/* we want ourself, ie "." */
		ndp->ni_vp = vdp;
	} else {
		if (error = iget(dp, ndp->ni_ufs.ufs_ino, &tdp))
			return (error);
		if (!lockparent || *ndp->ni_next != '\0')
			IUNLOCK(pdp);
		ndp->ni_vp = ITOV(tdp);
	}

	/*
	 * Insert name into cache if appropriate.
	 */
	if (ndp->ni_makeentry)
		cache_enter(ndp);
	return (0);
}


/*
 * Report a corrupted directory entry on the console; panics unless the
 * filesystem is mounted read-only.
 * NOTE(review): "offset" is an off_t printed with %d — assumes off_t fits
 * an int on this platform; confirm before porting.
 */
dirbad(ip, offset, how)
	struct inode *ip;
	off_t offset;
	char *how;
{

	printf("%s: bad dir ino %d at offset %d: %s\n",
	    ip->i_fs->fs_fsmnt, ip->i_number, offset, how);
	if (ip->i_fs->fs_ronly == 0)
		panic("bad dir");
}

/*
 * Do consistency checking on a directory entry:
 *	record length must be multiple of 4
 *	entry must fit in rest of its DIRBLKSIZ block
 *	record must be large enough to contain entry
 *	name is not longer than MAXNAMLEN
 *	name must be as long as advertised, and null terminated
 *
 * Returns nonzero if the entry is bad, 0 if it passes all checks.
 */
dirbadentry(ep, entryoffsetinblock)
	register struct direct *ep;
	int entryoffsetinblock;
{
	register int i;

	if ((ep->d_reclen & 0x3) != 0 ||
	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
	    ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN)
		return (1);
	/* an embedded NUL inside the advertised name length is bad */
	for (i = 0; i < ep->d_namlen; i++)
		if (ep->d_name[i] == '\0')
			return (1);
	return (ep->d_name[i]);	/* nonzero (bad) if name not NUL-terminated */
}

/*
 * Write a directory entry after a call to namei, using the parameters
 * that it left in nameidata.  The argument ip is the inode which the new
 * directory entry will refer to.  The nameidata field ndp->ni_dvp is a
 * pointer to the directory to be written, which was left locked by namei.
 * Remaining parameters (ndp->ni_ufs.ufs_offset, ndp->ni_ufs.ufs_count)
 * indicate how the space for the new entry is to be obtained.
 */
/* Returns 0 on success or an errno from the directory write/truncate. */
direnter(ip, ndp)
	struct inode *ip;
	register struct nameidata *ndp;
{
	register struct direct *ep, *nep;
	register struct inode *dp = VTOI(ndp->ni_dvp);
	struct buf *bp;
	int loc, spacefree, error = 0;
	u_int dsize;
	int newentrysize;
	char *dirbuf;
	struct uio auio;
	struct iovec aiov;
	struct direct newdir;

#ifdef DIAGNOSTIC
	if ((ndp->ni_nameiop & SAVENAME) == 0)
		panic("direnter: missing name");
#endif
	/* build the new on-disk entry from the saved lookup name */
	newdir.d_ino = ip->i_number;
	newdir.d_namlen = ndp->ni_namelen;
	bcopy(ndp->ni_ptr, newdir.d_name, (unsigned)ndp->ni_namelen + 1);
	newentrysize = DIRSIZ(&newdir);
	if (ndp->ni_ufs.ufs_count == 0) {
		/*
		 * If ndp->ni_ufs.ufs_count is 0, then namei could find no
		 * space in the directory. Here, ndp->ni_ufs.ufs_offset will
		 * be on a directory block boundary and we will write the
		 * new entry into a fresh block.
		 */
		if (ndp->ni_ufs.ufs_offset & (DIRBLKSIZ - 1))
			panic("wdir: newblk");
		auio.uio_offset = ndp->ni_ufs.ufs_offset;
		/* new entry owns the whole fresh directory block */
		newdir.d_reclen = DIRBLKSIZ;
		auio.uio_resid = newentrysize;
		aiov.iov_len = newentrysize;
		aiov.iov_base = (caddr_t)&newdir;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_rw = UIO_WRITE;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_procp = (struct proc *)0;
		error = ufs_write(ndp->ni_dvp, &auio, IO_SYNC, ndp->ni_cred);
		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
			panic("wdir: blksize"); /* XXX - should grow w/balloc */
		} else if (!error) {
			dp->i_size = roundup(dp->i_size, DIRBLKSIZ);
			dp->i_flag |= ICHG;
		}
		return (error);
	}

	/*
	 * If ndp->ni_ufs.ufs_count is non-zero, then namei found space
	 * for the new entry in the range ndp->ni_ufs.ufs_offset to
	 * ndp->ni_ufs.ufs_offset + ndp->ni_ufs.ufs_count in the directory.
	 * To use this space, we may have to compact the entries located
	 * there, by copying them together towards the beginning of the
	 * block, leaving the free space in one usable chunk at the end.
	 */

	/*
	 * Increase size of directory if entry eats into new space.
	 * This should never push the size past a new multiple of
	 * DIRBLKSIZE.
	 *
	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
	 */
	if (ndp->ni_ufs.ufs_offset + ndp->ni_ufs.ufs_count > dp->i_size)
		dp->i_size = ndp->ni_ufs.ufs_offset + ndp->ni_ufs.ufs_count;
	/*
	 * Get the block containing the space for the new directory entry.
	 */
	if (error = blkatoff(dp, ndp->ni_ufs.ufs_offset, (char **)&dirbuf, &bp))
		return (error);
	/*
	 * Find space for the new entry. In the simple case, the entry at
	 * offset base will have the space. If it does not, then namei
	 * arranged that compacting the region ndp->ni_ufs.ufs_offset to
	 * ndp->ni_ufs.ufs_offset + ndp->ni_ufs.ufs_count would yield the
	 * space.
	 */
	/*
	 * Loop invariant: ep points at the slot the next live entry is
	 * packed into, dsize is that entry's DIRSIZ, and spacefree
	 * accumulates reclaimed slack from the entries scanned so far.
	 */
	ep = (struct direct *)dirbuf;
	dsize = DIRSIZ(ep);
	spacefree = ep->d_reclen - dsize;
	for (loc = ep->d_reclen; loc < ndp->ni_ufs.ufs_count; ) {
		nep = (struct direct *)(dirbuf + loc);
		if (ep->d_ino) {
			/* trim the existing slot */
			ep->d_reclen = dsize;
			ep = (struct direct *)((char *)ep + dsize);
		} else {
			/* overwrite; nothing there; header is ours */
			spacefree += dsize;
		}
		dsize = DIRSIZ(nep);
		spacefree += nep->d_reclen - dsize;
		loc += nep->d_reclen;
		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
	}
	/*
	 * Update the pointer fields in the previous entry (if any),
	 * copy in the new entry, and write out the block.
	 */
	if (ep->d_ino == 0) {
		if (spacefree + dsize < newentrysize)
			panic("wdir: compact1");
		newdir.d_reclen = spacefree + dsize;
	} else {
		if (spacefree < newentrysize)
			panic("wdir: compact2");
		newdir.d_reclen = spacefree;
		ep->d_reclen = dsize;
		ep = (struct direct *)((char *)ep + dsize);
	}
	bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize);
	error = bwrite(bp);
	dp->i_flag |= IUPD|ICHG;
	/* lop off any now-unused blocks past the last useful entry */
	if (!error && ndp->ni_ufs.ufs_endoff &&
	    ndp->ni_ufs.ufs_endoff < dp->i_size)
		error = itrunc(dp, (u_long)ndp->ni_ufs.ufs_endoff, IO_SYNC);
	return (error);
}

/*
 * Remove a directory entry after a call to namei, using
 * the parameters which it left in nameidata. The entry
 * ni_ufs.ufs_offset contains the offset into the directory of the
 * entry to be eliminated. The ni_ufs.ufs_count field contains the
 * size of the previous record in the directory. If this
 * is 0, the first entry is being deleted, so we need only
 * zero the inode number to mark the entry as free. If the
 * entry is not the first in the directory, we must reclaim
 * the space of the now empty record by adding the record size
 * to the size of the previous entry.
 *
 * Returns 0 on success or an errno from the block read/write.
 */
dirremove(ndp)
	register struct nameidata *ndp;
{
	register struct inode *dp = VTOI(ndp->ni_dvp);
	struct direct *ep;
	struct buf *bp;
	int error;

	if (ndp->ni_ufs.ufs_count == 0) {
		/*
		 * First entry in block: set d_ino to zero.
		 */
		error = blkatoff(dp, ndp->ni_ufs.ufs_offset, (char **)&ep, &bp);
		if (error)
			return (error);
		ep->d_ino = 0;
		error = bwrite(bp);
		dp->i_flag |= IUPD|ICHG;
		return (error);
	}
	/*
	 * Collapse new free space into previous entry.
	 */
	if (error = blkatoff(dp, ndp->ni_ufs.ufs_offset - ndp->ni_ufs.ufs_count,
	    (char **)&ep, &bp)) {
		return (error);
	}
	ep->d_reclen += ndp->ni_ufs.ufs_reclen;
	error = bwrite(bp);
	dp->i_flag |= IUPD|ICHG;
	return (error);
}

/*
 * Rewrite an existing directory entry to point at the inode
 * supplied.
The parameters describing the directory entry are + * set up by a call to namei. + */ +dirrewrite(dp, ip, ndp) + struct inode *dp, *ip; + struct nameidata *ndp; +{ + struct direct *ep; + struct buf *bp; + int error; + + if (error = blkatoff(dp, ndp->ni_ufs.ufs_offset, (char **)&ep, &bp)) + return (error); + ep->d_ino = ip->i_number; + error = bwrite(bp); + dp->i_flag |= IUPD|ICHG; + return (error); +} + +/* + * Return buffer with contents of block "offset" + * from the beginning of directory "ip". If "res" + * is non-zero, fill it in with a pointer to the + * remaining space in the directory. + */ +blkatoff(ip, offset, res, bpp) + struct inode *ip; + off_t offset; + char **res; + struct buf **bpp; +{ + register struct fs *fs = ip->i_fs; + daddr_t lbn = lblkno(fs, offset); + int bsize = blksize(fs, ip, lbn); + struct buf *bp; + daddr_t bn; + int error; + + *bpp = 0; + if (error = bread(ITOV(ip), lbn, bsize, NOCRED, &bp)) { + brelse(bp); + return (error); + } + if (res) + *res = bp->b_un.b_addr + blkoff(fs, offset); + *bpp = bp; + return (0); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct direct. + * + * NB: does not handle corrupted directories. + */ +dirempty(ip, parentino, cred) + register struct inode *ip; + ino_t parentino; + struct ucred *cred; +{ + register off_t off; + struct dirtemplate dbuf; + register struct direct *dp = (struct direct *)&dbuf; + int error, count; +#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + + for (off = 0; off < ip->i_size; off += dp->d_reclen) { + error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, + UIO_SYSSPACE, IO_NODELOCKED, cred, &count, (struct proc *)0); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. 
+ */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->d_ino == 0) + continue; + /* accept only "." and ".." */ + if (dp->d_namlen > 2) + return (0); + if (dp->d_name[0] != '.') + return (0); + /* + * At this point d_namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (dp->d_namlen == 1) + continue; + if (dp->d_name[1] == '.' && dp->d_ino == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always iput() before returning. + */ +checkpath(source, target, cred) + struct inode *source, *target; + struct ucred *cred; +{ + struct dirtemplate dirbuf; + struct inode *ip; + int error = 0; + + ip = target; + if (ip->i_number == source->i_number) { + error = EEXIST; + goto out; + } + if (ip->i_number == ROOTINO) + goto out; + + for (;;) { + if ((ip->i_mode&IFMT) != IFDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED, cred, (int *)0, (struct proc *)0); + if (error != 0) + break; + if (dirbuf.dotdot_namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + if (dirbuf.dotdot_ino == source->i_number) { + error = EINVAL; + break; + } + if (dirbuf.dotdot_ino == ROOTINO) + break; + iput(ip); + if (error = iget(ip, dirbuf.dotdot_ino, &ip)) + break; + } + +out: + if (error == ENOTDIR) + printf("checkpath: .. not a directory\n"); + if (ip != NULL) + iput(ip); + return (error); +} diff --git a/sys/ufs/ufs_quota.c b/sys/ufs/ufs_quota.c new file mode 100644 index 000000000000..d38dae185a89 --- /dev/null +++ b/sys/ufs/ufs_quota.c @@ -0,0 +1,936 @@ +/* + * Copyright (c) 1982, 1986, 1990 Regents of the University of California. 
+ * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * from: @(#)ufs_quota.c	7.11 (Berkeley) 6/21/91
 *	$Id: ufs_quota.c,v 1.2 1993/10/16 18:17:57 rgrimes Exp $
 */

#include "param.h"
#include "kernel.h"
#include "systm.h"
#include "namei.h"
#include "malloc.h"
#include "file.h"
#include "proc.h"
#include "vnode.h"
#include "mount.h"

#include "fs.h"
#include "quota.h"
#include "inode.h"
#include "ufsmount.h"

/*
 * Quota name to error message mapping.
 */
static char *quotatypes[] = INITQFNAMES;

/*
 * Set up the quotas for an inode.
 *
 * This routine completely defines the semantics of quotas.
 * If other criterion want to be used to establish quotas, the
 * MAXQUOTAS value in quotas.h should be increased, and the
 * additional dquots set up here.
 *
 * Returns 0 on success or an errno from dqget; EINVAL from dqget
 * (quotas not enabled for that type) is deliberately swallowed.
 */
getinoquota(ip)
	register struct inode *ip;
{
	struct ufsmount *ump;
	struct vnode *vp = ITOV(ip);
	int error;

	ump = VFSTOUFS(vp->v_mount);
	/*
	 * Set up the user quota based on file uid.
	 * EINVAL means that quotas are not enabled.
	 */
	if (ip->i_dquot[USRQUOTA] == NODQUOT &&
	    (error =
		dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) &&
	    error != EINVAL)
		return (error);
	/*
	 * Set up the group quota based on file gid.
	 * EINVAL means that quotas are not enabled.
	 */
	if (ip->i_dquot[GRPQUOTA] == NODQUOT &&
	    (error =
		dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) &&
	    error != EINVAL)
		return (error);
	return (0);
}

/*
 * Update disk usage, and take corrective action.
 */
/*
 * "change" is a signed block count: negative releases blocks unconditionally,
 * positive charges them against each applicable quota.  Returns 0 or EDQUOT.
 */
chkdq(ip, change, cred, flags)
	register struct inode *ip;
	long change;
	struct ucred *cred;
	int flags;
{
	register struct dquot *dq;
	register int i;
	int ncurblocks, error;

#ifdef DIAGNOSTIC
	if ((flags & CHOWN) == 0)
		chkdquot(ip);
#endif
	if (change == 0)
		return (0);
	if (change < 0) {
		/* releasing blocks: always allowed, just decrement usage */
		for (i = 0; i < MAXQUOTAS; i++) {
			if ((dq = ip->i_dquot[i]) == NODQUOT)
				continue;
			/*
			 * Spin until the dquot is unlocked; DQ_WANT asks the
			 * holder to wake us (presumably on release — confirm
			 * against the dqsync/dqrele side).
			 */
			while (dq->dq_flags & DQ_LOCK) {
				dq->dq_flags |= DQ_WANT;
				sleep((caddr_t)dq, PINOD+1);
			}
			ncurblocks = dq->dq_curblocks + change;
			if (ncurblocks >= 0)
				dq->dq_curblocks = ncurblocks;
			else
				dq->dq_curblocks = 0;	/* clamp; never negative */
			dq->dq_flags &= ~DQ_BLKS;
			dq->dq_flags |= DQ_MOD;
		}
		return (0);
	}
	/* root and forced changes bypass the limit checks */
	if ((flags & FORCE) == 0 && cred->cr_uid != 0) {
		for (i = 0; i < MAXQUOTAS; i++) {
			if ((dq = ip->i_dquot[i]) == NODQUOT)
				continue;
			if (error = chkdqchg(ip, change, cred, i))
				return (error);
		}
	}
	for (i = 0; i < MAXQUOTAS; i++) {
		if ((dq = ip->i_dquot[i]) == NODQUOT)
			continue;
		while (dq->dq_flags & DQ_LOCK) {
			dq->dq_flags |= DQ_WANT;
			sleep((caddr_t)dq, PINOD+1);
		}
		dq->dq_curblocks += change;
		dq->dq_flags |= DQ_MOD;
	}
	return (0);
}

/*
 * Check for a valid change to a users allocation.
 * Issue an error message if appropriate.
 *
 * Returns 0 if the change is allowed, EDQUOT otherwise.
 */
chkdqchg(ip, change, cred, type)
	struct inode *ip;
	long change;
	struct ucred *cred;
	int type;
{
	register struct dquot *dq = ip->i_dquot[type];
	long ncurblocks = dq->dq_curblocks + change;

	/*
	 * If user would exceed their hard limit, disallow space allocation.
	 */
	if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) {
		/* DQ_BLKS throttles the warning to once per overrun */
		if ((dq->dq_flags & DQ_BLKS) == 0 &&
		    ip->i_uid == cred->cr_uid) {
			uprintf("\n%s: write failed, %s disk limit reached\n",
			    ip->i_fs->fs_fsmnt, quotatypes[type]);
			dq->dq_flags |= DQ_BLKS;
		}
		return (EDQUOT);
	}
	/*
	 * If user is over their soft limit for too long, disallow space
	 * allocation. Reset time limit as they cross their soft limit.
	 */
	if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) {
		if (dq->dq_curblocks < dq->dq_bsoftlimit) {
			/* first crossing: start the grace period, warn, allow */
			dq->dq_btime = time.tv_sec +
			    VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type];
			if (ip->i_uid == cred->cr_uid)
				uprintf("\n%s: warning, %s %s\n",
				    ip->i_fs->fs_fsmnt, quotatypes[type],
				    "disk quota exceeded");
			return (0);
		}
		if (time.tv_sec > dq->dq_btime) {
			if ((dq->dq_flags & DQ_BLKS) == 0 &&
			    ip->i_uid == cred->cr_uid) {
				uprintf("\n%s: write failed, %s %s\n",
				    ip->i_fs->fs_fsmnt, quotatypes[type],
				    "disk quota exceeded too long");
				dq->dq_flags |= DQ_BLKS;
			}
			return (EDQUOT);
		}
	}
	return (0);
}

/*
 * Check the inode limit, applying corrective action.
 * Mirrors chkdq() but for inode counts; "change" is a signed inode count.
 */
chkiq(ip, change, cred, flags)
	register struct inode *ip;
	long change;
	struct ucred *cred;
	int flags;
{
	register struct dquot *dq;
	register int i;
	int ncurinodes, error;

#ifdef DIAGNOSTIC
	if ((flags & CHOWN) == 0)
		chkdquot(ip);
#endif
	if (change == 0)
		return (0);
	if (change < 0) {
		for (i = 0; i < MAXQUOTAS; i++) {
			if ((dq = ip->i_dquot[i]) == NODQUOT)
				continue;
			while (dq->dq_flags & DQ_LOCK) {
				dq->dq_flags |= DQ_WANT;
				sleep((caddr_t)dq, PINOD+1);
			}
			ncurinodes = dq->dq_curinodes + change;
			if (ncurinodes >= 0)
				dq->dq_curinodes = ncurinodes;
			else
				dq->dq_curinodes = 0;	/* clamp; never negative */
			dq->dq_flags &= ~DQ_INODS;
			dq->dq_flags |= DQ_MOD;
		}
		return (0);
	}
	if ((flags & FORCE) == 0 && cred->cr_uid != 0) {
		for (i = 0; i < MAXQUOTAS; i++) {
			if ((dq = ip->i_dquot[i]) == NODQUOT)
				continue;
			if (error = chkiqchg(ip, change, cred, i))
				return (error);
		}
	}
	for (i = 0; i < MAXQUOTAS; i++) {
		if ((dq = ip->i_dquot[i]) == NODQUOT)
			continue;
		while (dq->dq_flags & DQ_LOCK) {
			dq->dq_flags |= DQ_WANT;
			sleep((caddr_t)dq, PINOD+1);
		}
		dq->dq_curinodes += change;
		dq->dq_flags |= DQ_MOD;
	}
	return (0);
}

/*
 * Check for a valid change to a users allocation.
 * Issue an error message if appropriate.
+ */ +chkiqchg(ip, change, cred, type) + struct inode *ip; + long change; + struct ucred *cred; + int type; +{ + register struct dquot *dq = ip->i_dquot[type]; + long ncurinodes = dq->dq_curinodes + change; + + /* + * If user would exceed their hard limit, disallow inode allocation. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s inode limit reached\n", + ip->i_fs->fs_fsmnt, quotatypes[type]); + dq->dq_flags |= DQ_INODS; + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow inode + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = time.tv_sec + + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type]; + if (ip->i_uid == cred->cr_uid) + uprintf("\n%s: warning, %s %s\n", + ip->i_fs->fs_fsmnt, quotatypes[type], + "inode quota exceeded"); + return (0); + } + if (time.tv_sec > dq->dq_itime) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s %s\n", + ip->i_fs->fs_fsmnt, quotatypes[type], + "inode quota exceeded too long"); + dq->dq_flags |= DQ_INODS; + } + return (EDQUOT); + } + } + return (0); +} + +#ifdef DIAGNOSTIC +/* + * On filesystems with quotas enabled, + * it is an error for a file to change size and not + * to have a dquot structure associated with it. + */ +chkdquot(ip) + register struct inode *ip; +{ + struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount); + register int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] == NULLVP || + (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) + continue; + if (ip->i_dquot[i] == NODQUOT) { + vprint("chkdquot: missing dquot", ITOV(ip)); + panic("missing dquot"); + } + } +} +#endif /* DIAGNOSTIC */ + +/* + * Code to process quotactl commands. 
+ */ + +/* + * Q_QUOTAON - set up a quota file for a particular file system. + */ +quotaon(p, mp, type, fname) + struct proc *p; + struct mount *mp; + register int type; + caddr_t fname; +{ + register struct ufsmount *ump = VFSTOUFS(mp); + register struct vnode *vp, **vpp; + struct vnode *nextvp; + struct dquot *dq; + int error; + struct nameidata nd; + + vpp = &ump->um_quotas[type]; + nd.ni_segflg = UIO_USERSPACE; + nd.ni_dirp = fname; + if (error = vn_open(&nd, p, FREAD|FWRITE, 0)) + return (error); + vp = nd.ni_vp; + VOP_UNLOCK(vp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EACCES); + } + if (vfs_busy(mp)) { + (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EBUSY); + } + if (*vpp != vp) + quotaoff(p, mp, type); + ump->um_qflags[type] |= QTF_OPENING; + mp->mnt_flag |= MNT_QUOTA; + vp->v_flag |= VSYSTEM; + *vpp = vp; + /* + * Save the credential of the process that turned on quotas. + * Set up the time limits for this quota. + */ + crhold(p->p_ucred); + ump->um_cred[type] = p->p_ucred; + ump->um_btime[type] = MAX_DQ_TIME; + ump->um_itime[type] = MAX_IQ_TIME; + if (dqget(NULLVP, 0, ump, type, &dq) == 0) { + if (dq->dq_btime > 0) + ump->um_btime[type] = dq->dq_btime; + if (dq->dq_itime > 0) + ump->um_itime[type] = dq->dq_itime; + dqrele(NULLVP, dq); + } + /* + * Search vnodes associated with this mount point, + * adding references to quota file being opened. + * NB: only need to add dquot's for inodes being modified. + */ +again: + for (vp = mp->mnt_mounth; vp; vp = nextvp) { + nextvp = vp->v_mountf; + if (vp->v_writecount == 0) + continue; + if (vget(vp)) + goto again; + if (error = getinoquota(VTOI(vp))) { + vput(vp); + break; + } + vput(vp); + if (vp->v_mountf != nextvp || vp->v_mount != mp) + goto again; + } + ump->um_qflags[type] &= ~QTF_OPENING; + if (error) + quotaoff(p, mp, type); + vfs_unbusy(mp); + return (error); +} + +/* + * Q_QUOTAOFF - turn off disk quotas for a filesystem. 
+ */ +quotaoff(p, mp, type) + struct proc *p; + struct mount *mp; + register int type; +{ + register struct vnode *vp; + struct vnode *qvp, *nextvp; + struct ufsmount *ump = VFSTOUFS(mp); + register struct dquot *dq; + register struct inode *ip; + int error; + + if ((mp->mnt_flag & MNT_MPBUSY) == 0) + panic("quotaoff: not busy"); + if ((qvp = ump->um_quotas[type]) == NULLVP) + return (0); + ump->um_qflags[type] |= QTF_CLOSING; + /* + * Search vnodes associated with this mount point, + * deleting any references to quota file being closed. + */ +again: + for (vp = mp->mnt_mounth; vp; vp = nextvp) { + nextvp = vp->v_mountf; + if (vget(vp)) + goto again; + ip = VTOI(vp); + dq = ip->i_dquot[type]; + ip->i_dquot[type] = NODQUOT; + dqrele(vp, dq); + vput(vp); + if (vp->v_mountf != nextvp || vp->v_mount != mp) + goto again; + } + dqflush(qvp); + qvp->v_flag &= ~VSYSTEM; + error = vn_close(qvp, FREAD|FWRITE, p->p_ucred, p); + ump->um_quotas[type] = NULLVP; + crfree(ump->um_cred[type]); + ump->um_cred[type] = NOCRED; + ump->um_qflags[type] &= ~QTF_CLOSING; + for (type = 0; type < MAXQUOTAS; type++) + if (ump->um_quotas[type] != NULLVP) + break; + if (type == MAXQUOTAS) + mp->mnt_flag &= ~MNT_QUOTA; + return (error); +} + +/* + * Q_GETQUOTA - return current values in a dqblk structure. + */ +getquota(mp, id, type, addr) + struct mount *mp; + u_long id; + int type; + caddr_t addr; +{ + struct dquot *dq; + int error; + + if (error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq)) + return (error); + error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk)); + dqrele(NULLVP, dq); + return (error); +} + +/* + * Q_SETQUOTA - assign an entire dqblk structure. 
+ */ +setquota(mp, id, type, addr) + struct mount *mp; + u_long id; + int type; + caddr_t addr; +{ + register struct dquot *dq; + struct dquot *ndq; + struct ufsmount *ump = VFSTOUFS(mp); + struct dqblk newlim; + int error; + + if (error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk))) + return (error); + if (error = dqget(NULLVP, id, ump, type, &ndq)) + return (error); + dq = ndq; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. + */ + newlim.dqb_curblocks = dq->dq_curblocks; + newlim.dqb_curinodes = dq->dq_curinodes; + if (dq->dq_id != 0) { + newlim.dqb_btime = dq->dq_btime; + newlim.dqb_itime = dq->dq_itime; + } + if (newlim.dqb_bsoftlimit && + dq->dq_curblocks >= newlim.dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + newlim.dqb_btime = time.tv_sec + ump->um_btime[type]; + if (newlim.dqb_isoftlimit && + dq->dq_curinodes >= newlim.dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + newlim.dqb_itime = time.tv_sec + ump->um_itime[type]; + dq->dq_dqb = newlim; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SETUSE - set current inode and block usage. 
+ */ +setuse(mp, id, type, addr) + struct mount *mp; + u_long id; + int type; + caddr_t addr; +{ + register struct dquot *dq; + struct ufsmount *ump = VFSTOUFS(mp); + struct dquot *ndq; + struct dqblk usage; + int error; + + if (error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk))) + return (error); + if (error = dqget(NULLVP, id, ump, type, &ndq)) + return (error); + dq = ndq; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + /* + * Reset time limit if have a soft limit and were + * previously under it, but are now over it. + */ + if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && + usage.dqb_curblocks >= dq->dq_bsoftlimit) + dq->dq_btime = time.tv_sec + ump->um_btime[type]; + if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && + usage.dqb_curinodes >= dq->dq_isoftlimit) + dq->dq_itime = time.tv_sec + ump->um_itime[type]; + dq->dq_curblocks = usage.dqb_curblocks; + dq->dq_curinodes = usage.dqb_curinodes; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SYNC - sync quota files to disk. + */ +qsync(mp) + struct mount *mp; +{ + struct ufsmount *ump = VFSTOUFS(mp); + register struct vnode *vp, *nextvp; + register struct dquot *dq; + register int i; + + /* + * Check if the mount point has any quotas. + * If not, simply return. + */ + if ((mp->mnt_flag & MNT_MPBUSY) == 0) + panic("qsync: not busy"); + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + if (i == MAXQUOTAS) + return (0); + /* + * Search vnodes associated with this mount point, + * synchronizing any modified dquot structures. 
+ */ +again: + for (vp = mp->mnt_mounth; vp; vp = nextvp) { + nextvp = vp->v_mountf; + if (VOP_ISLOCKED(vp)) + continue; + if (vget(vp)) + goto again; + for (i = 0; i < MAXQUOTAS; i++) { + dq = VTOI(vp)->i_dquot[i]; + if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) + dqsync(vp, dq); + } + vput(vp); + if (vp->v_mountf != nextvp || vp->v_mount != mp) + goto again; + } + return (0); +} + +/* + * Code pertaining to management of the in-core dquot data structures. + */ + +/* + * Dquot cache - hash chain headers. + */ +union dqhead { + union dqhead *dqh_head[2]; + struct dquot *dqh_chain[2]; +}; +#define dqh_forw dqh_chain[0] +#define dqh_back dqh_chain[1] + +union dqhead *dqhashtbl; +long dqhash; + +/* + * Dquot free list. + */ +#define DQUOTINC 5 /* minimum free dquots desired */ +struct dquot *dqfreel, **dqback = &dqfreel; +long numdquot, desireddquot = DQUOTINC; + +/* + * Initialize the quota system. + */ +dqinit() +{ + register union dqhead *dhp; + register long dqhashsize; + + dqhashsize = roundup((desiredvnodes + 1) * sizeof *dhp / 2, + NBPG * CLSIZE); + dqhashtbl = (union dqhead *)malloc(dqhashsize, M_DQUOT, M_WAITOK); + for (dqhash = 1; dqhash <= dqhashsize / sizeof *dhp; dqhash <<= 1) + /* void */; + dqhash = (dqhash >> 1) - 1; + for (dhp = &dqhashtbl[dqhash]; dhp >= dqhashtbl; dhp--) { + dhp->dqh_head[0] = dhp; + dhp->dqh_head[1] = dhp; + } +} + +/* + * Obtain a dquot structure for the specified identifier and quota file + * reading the information from the file if necessary. + */ +dqget(vp, id, ump, type, dqp) + struct vnode *vp; + u_long id; + register struct ufsmount *ump; + register int type; + struct dquot **dqp; +{ + register struct dquot *dq; + register union dqhead *dh; + register struct dquot *dp; + register struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + dqvp = ump->um_quotas[type]; + if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { + *dqp = NODQUOT; + return (EINVAL); + } + /* + * Check the cache first. 
+ */ + dh = &dqhashtbl[((((int)(dqvp)) >> 8) + id) & dqhash]; + for (dq = dh->dqh_forw; dq != (struct dquot *)dh; dq = dq->dq_forw) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + /* + * Cache hit with no references. Take + * the structure off the free list. + */ + if (dq->dq_cnt == 0) { + dp = dq->dq_freef; + if (dp != NODQUOT) + dp->dq_freeb = dq->dq_freeb; + else + dqback = dq->dq_freeb; + *dq->dq_freeb = dp; + } + DQREF(dq); + *dqp = dq; + return (0); + } + /* + * Not in cache, allocate a new one. + */ + if (dqfreel == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes) + desireddquot += DQUOTINC; + if (numdquot < desireddquot) { + dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT, M_WAITOK); + bzero((char *)dq, sizeof *dq); + numdquot++; + } else { + if ((dq = dqfreel) == NULL) { + tablefull("dquot"); + *dqp = NODQUOT; + return (EUSERS); + } + if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) + panic("free dquot isn't"); + if ((dp = dq->dq_freef) != NODQUOT) + dp->dq_freeb = &dqfreel; + else + dqback = &dqfreel; + dqfreel = dp; + dq->dq_freef = NULL; + dq->dq_freeb = NULL; + remque(dq); + } + /* + * Initialize the contents of the dquot structure. 
+ */ + if (vp != dqvp) + VOP_LOCK(dqvp); + insque(dq, dh); + DQREF(dq); + dq->dq_flags = DQ_LOCK; + dq->dq_id = id; + dq->dq_ump = ump; + dq->dq_type = type; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (caddr_t)&dq->dq_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(id * sizeof (struct dqblk)); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = (struct proc *)0; + error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); + if (auio.uio_resid == sizeof(struct dqblk) && error == 0) + bzero((caddr_t)&dq->dq_dqb, sizeof(struct dqblk)); + if (vp != dqvp) + VOP_UNLOCK(dqvp); + if (dq->dq_flags & DQ_WANT) + wakeup((caddr_t)dq); + dq->dq_flags = 0; + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) { + remque(dq); + dq->dq_forw = dq; /* on a private, unfindable hash list */ + dq->dq_back = dq; + dqrele(vp, dq); + *dqp = NODQUOT; + return (error); + } + /* + * Check for no limit to enforce. + * Initialize time values if necessary. + */ + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + if (dq->dq_id != 0) { + if (dq->dq_btime == 0) + dq->dq_btime = time.tv_sec + ump->um_btime[type]; + if (dq->dq_itime == 0) + dq->dq_itime = time.tv_sec + ump->um_itime[type]; + } + *dqp = dq; + return (0); +} + +/* + * Obtain a reference to a dquot. + */ +dqref(dq) + struct dquot *dq; +{ + + dq->dq_cnt++; +} + +/* + * Release a reference to a dquot. 
+ */ +dqrele(vp, dq) + struct vnode *vp; + register struct dquot *dq; +{ + + if (dq == NODQUOT) + return; + if (dq->dq_cnt > 1) { + dq->dq_cnt--; + return; + } + if (dq->dq_flags & DQ_MOD) + (void) dqsync(vp, dq); + if (--dq->dq_cnt > 0) + return; + if (dqfreel != NODQUOT) { + *dqback = dq; + dq->dq_freeb = dqback; + } else { + dqfreel = dq; + dq->dq_freeb = &dqfreel; + } + dq->dq_freef = NODQUOT; + dqback = &dq->dq_freef; +} + +/* + * Update the disk quota in the quota file. + */ +dqsync(vp, dq) + struct vnode *vp; + register struct dquot *dq; +{ + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + if (dq == NODQUOT) + panic("dqsync: dquot"); + if ((dq->dq_flags & DQ_MOD) == 0) + return (0); + if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) + panic("dqsync: file"); + if (vp != dqvp) + VOP_LOCK(dqvp); + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+2); + if ((dq->dq_flags & DQ_MOD) == 0) { + if (vp != dqvp) + VOP_UNLOCK(dqvp); + return (0); + } + } + dq->dq_flags |= DQ_LOCK; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (caddr_t)&dq->dq_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk)); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_procp = (struct proc *)0; + error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); + if (auio.uio_resid && error == 0) + error = EIO; + if (dq->dq_flags & DQ_WANT) + wakeup((caddr_t)dq); + dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT); + if (vp != dqvp) + VOP_UNLOCK(dqvp); + return (error); +} + +/* + * Flush all entries from the cache for a particular vnode. 
+ */ +dqflush(vp) + register struct vnode *vp; +{ + register union dqhead *dh; + register struct dquot *dq, *nextdq; + + /* + * Move all dquot's that used to refer to this quota + * file off their hash chains (they will eventually + * fall off the head of the free list and be re-used). + */ + for (dh = &dqhashtbl[dqhash]; dh >= dqhashtbl; dh--) { + for (dq = dh->dqh_forw; dq != (struct dquot *)dh; dq = nextdq) { + nextdq = dq->dq_forw; + if (dq->dq_ump->um_quotas[dq->dq_type] != vp) + continue; + if (dq->dq_cnt) + panic("dqflush: stray dquot"); + remque(dq); + dq->dq_forw = dq; + dq->dq_back = dq; + dq->dq_ump = (struct ufsmount *)0; + } + } +} diff --git a/sys/ufs/ufs_subr.c b/sys/ufs/ufs_subr.c new file mode 100644 index 000000000000..04c004e18b05 --- /dev/null +++ b/sys/ufs/ufs_subr.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 1982, 1986, 1989 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_subr.c 7.13 (Berkeley) 6/28/90 + * $Id: ufs_subr.c,v 1.2 1993/10/16 18:17:59 rgrimes Exp $ + */ + +#ifdef KERNEL +#include "param.h" +#include "../ufs/fs.h" +#else +#include <sys/param.h> +#include <ufs/fs.h> +#endif + +extern int around[9]; +extern int inside[9]; +extern u_char *fragtbl[]; + +/* + * Update the frsum fields to reflect addition or deletion + * of some frags. 
+ */ +fragacct(fs, fragmap, fraglist, cnt) + struct fs *fs; + int fragmap; + long fraglist[]; + int cnt; +{ + int inblk; + register int field, subfield; + register int siz, pos; + + inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; + fragmap <<= 1; + for (siz = 1; siz < fs->fs_frag; siz++) { + if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) + continue; + field = around[siz]; + subfield = inside[siz]; + for (pos = siz; pos <= fs->fs_frag; pos++) { + if ((fragmap & field) == subfield) { + fraglist[siz] += cnt; + pos += siz; + field <<= siz; + subfield <<= siz; + } + field <<= 1; + subfield <<= 1; + } + } +} + +/* + * block operations + * + * check if a block is available + */ +isblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + daddr_t h; +{ + unsigned char mask; + + switch ((int)fs->fs_frag) { + case 8: + return (cp[h] == 0xff); + case 4: + mask = 0x0f << ((h & 0x1) << 2); + return ((cp[h >> 1] & mask) == mask); + case 2: + mask = 0x03 << ((h & 0x3) << 1); + return ((cp[h >> 2] & mask) == mask); + case 1: + mask = 0x01 << (h & 0x7); + return ((cp[h >> 3] & mask) == mask); + default: + panic("isblock"); + return (NULL); + } +} + +/* + * take a block out of the map + */ +clrblock(fs, cp, h) + struct fs *fs; + u_char *cp; + daddr_t h; +{ + + switch ((int)fs->fs_frag) { + case 8: + cp[h] = 0; + return; + case 4: + cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] &= ~(0x01 << (h & 0x7)); + return; + default: + panic("clrblock"); + } +} + +/* + * put a block into the map + */ +setblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + daddr_t h; +{ + + switch ((int)fs->fs_frag) { + + case 8: + cp[h] = 0xff; + return; + case 4: + cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] |= (0x01 << (h & 0x7)); + return; + default: + panic("setblock"); + } +} + +#if (!defined(vax) && 
!defined(tahoe) && !defined(hp300)) \ + || defined(VAX630) || defined(VAX650) +/* + * C definitions of special instructions. + * Normally expanded with inline. + */ +scanc(size, cp, table, mask) + u_int size; + register u_char *cp, table[]; + register u_char mask; +{ + register u_char *end = &cp[size]; + + while (cp < end && (table[*cp] & mask) == 0) + cp++; + return (end - cp); +} +#endif + +#if !defined(vax) && !defined(tahoe) && !defined(hp300) +skpc(mask, size, cp) + register u_char mask; + u_int size; + register u_char *cp; +{ + register u_char *end = &cp[size]; + + while (cp < end && *cp == mask) + cp++; + return (end - cp); +} + +locc(mask, size, cp) + register u_char mask; + u_int size; + register u_char *cp; +{ + register u_char *end = &cp[size]; + + while (cp < end && *cp != mask) + cp++; + return (end - cp); +} +#endif diff --git a/sys/ufs/ufs_tables.c b/sys/ufs/ufs_tables.c new file mode 100644 index 000000000000..04b3a33b1ca9 --- /dev/null +++ b/sys/ufs/ufs_tables.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 1982, 1986 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_tables.c 7.4 (Berkeley) 6/28/90 + * $Id: ufs_tables.c,v 1.2 1993/10/16 18:18:00 rgrimes Exp $ + */ + +#ifdef KERNEL +#include "param.h" +#else +#include <sys/param.h> +#endif + +/* + * Bit patterns for identifying fragments in the block map + * used as ((map & around) == inside) + */ +int around[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff +}; +int inside[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe +}; + +/* + * Given a block map bit pattern, the frag tables tell whether a + * particular size fragment is available. + * + * used as: + * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map] { + * at least one fragment of the indicated size is available + * } + * + * These tables are used by the scanc instruction on the VAX to + * quickly find an appropriate fragment. 
+ */ +u_char fragtbl124[256] = { + 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e, + 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e, + 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce, + 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a, +}; + +u_char fragtbl8[256] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, + 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 
0x03, 0x02, 0x06, + 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c, + 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +/* + * The actual fragtbl array. + */ +u_char *fragtbl[MAXFRAG + 1] = { + 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8, +}; diff --git a/sys/ufs/ufs_vfsops.c b/sys/ufs/ufs_vfsops.c new file mode 100644 index 000000000000..8265be2d0874 --- /dev/null +++ b/sys/ufs/ufs_vfsops.c @@ -0,0 +1,758 @@ +/* + * Copyright (c) 1989, 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)ufs_vfsops.c 7.56 (Berkeley) 6/28/91 + * $Id: ufs_vfsops.c,v 1.4 1993/10/16 18:18:01 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "namei.h" +#include "proc.h" +#include "kernel.h" +#include "vnode.h" +#include "specdev.h" +#include "mount.h" +#include "buf.h" +#include "file.h" +#include "dkbad.h" /* XXX */ +#include "disklabel.h" +#include "ioctl.h" +#include "errno.h" +#include "malloc.h" + +#include "quota.h" +#include "fs.h" +#include "ufsmount.h" +#include "inode.h" + +struct vfsops ufs_vfsops = { + ufs_mount, + ufs_start, + ufs_unmount, + ufs_root, + ufs_quotactl, + ufs_statfs, + ufs_sync, + ufs_fhtovp, + ufs_vptofh, + ufs_init +}; + +/* + * Flag to allow forcible unmounting. + */ +int doforce = 1; + +/* + * Called by vfs_mountroot when ufs is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "root_device" + +ufs_mountroot() +{ + register struct mount *mp; + extern struct vnode *rootvp; + struct proc *p = curproc; /* XXX */ + struct ufsmount *ump; + register struct fs *fs; + u_int size; + int error; + + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + mp->mnt_op = &ufs_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_exroot = 0; + mp->mnt_mounth = NULLVP; + error = mountfs(rootvp, mp, p); + if (error) { + free((caddr_t)mp, M_MOUNT); + return (error); + } + if (error = vfs_lock(mp)) { + (void)ufs_unmount(mp, 0, p); + free((caddr_t)mp, M_MOUNT); + return (error); + } + rootfs = mp; + mp->mnt_next = mp; + mp->mnt_prev = mp; + mp->mnt_vnodecovered = NULLVP; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + bzero(fs->fs_fsmnt, sizeof(fs->fs_fsmnt)); + fs->fs_fsmnt[0] = '/'; + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) ufs_statfs(mp, &mp->mnt_stat, p); + 
vfs_unlock(mp); + inittodr(fs->fs_time); + return (0); +} + +/* + * VFS Operations. + * + * mount system call + */ +ufs_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct ufs_args args; + struct ufsmount *ump; + register struct fs *fs; + u_int size; + int error; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args))) + return (error); + /* + * Process export requests. + */ + if ((args.exflags & MNT_EXPORTED) || (mp->mnt_flag & MNT_EXPORTED)) { + if (args.exflags & MNT_EXPORTED) + mp->mnt_flag |= MNT_EXPORTED; + else + mp->mnt_flag &= ~MNT_EXPORTED; + if (args.exflags & MNT_EXRDONLY) + mp->mnt_flag |= MNT_EXRDONLY; + else + mp->mnt_flag &= ~MNT_EXRDONLY; + mp->mnt_exroot = args.exroot; + } + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. + */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly && (mp->mnt_flag & MNT_RDONLY) == 0) + fs->fs_ronly = 0; + if (args.fspec == 0) + return (0); + } + /* + * Not an update, or updating the name: look up the name + * and verify that it refers to a sensible block device. 
+ */ + ndp->ni_nameiop = LOOKUP | FOLLOW; + ndp->ni_segflg = UIO_USERSPACE; + ndp->ni_dirp = args.fspec; + if (error = namei(ndp, p)) + return (error); + devvp = ndp->ni_vp; + if (devvp->v_type != VBLK) { + vrele(devvp); + return (ENOTBLK); + } + if (major(devvp->v_rdev) >= nblkdev) { + vrele(devvp); + return (ENXIO); + } + if ((mp->mnt_flag & MNT_UPDATE) == 0) + error = mountfs(devvp, mp, p); + else { + if (devvp != ump->um_devvp) + error = EINVAL; /* needs translation */ + else + vrele(devvp); + } + if (error) { + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); + bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + mp->mnt_stat.f_mntonname[MNAMELEN-1] = '\0'; + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) ufs_statfs(mp, &mp->mnt_stat, p); + return (0); +} + +/* + * Common code for mount and mountroot + */ +mountfs(devvp, mp, p) + register struct vnode *devvp; + struct mount *mp; + struct proc *p; +{ + register struct ufsmount *ump = (struct ufsmount *)0; + struct buf *bp = NULL; + register struct fs *fs; + dev_t dev = devvp->v_rdev; + struct partinfo dpart; + caddr_t base, space; + int havepart = 0, blks; + int error, i, size; + int needclose = 0; + int ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + extern struct vnode *rootvp; + + /* + * Disallow multiple mounts of the same device. + * Disallow mounting of a device that is currently in use + * (except for root, which might share swap device for miniroot). + * Flush out any old buffers remaining from a previous use. + */ + if (error = mountedon(devvp)) + return (error); + if (vcount(devvp) > 1 && devvp != rootvp) + return (EBUSY); + vinvalbuf(devvp, 1); + if (error = VOP_OPEN(devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p)) + return (error); + needclose = 1; + if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) + size = DEV_BSIZE; + else { + havepart = 1; + size = dpart.disklab->d_secsize; + } + if (error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) + goto out; + fs = bp->b_un.b_fs; + if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) { + error = EINVAL; /* XXX needs translation */ + goto out; + } + ump = (struct ufsmount *)malloc(sizeof *ump, M_UFSMNT, M_WAITOK); + ump->um_fs = (struct fs *)malloc((u_long)fs->fs_sbsize, M_SUPERBLK, + M_WAITOK); + bcopy((caddr_t)bp->b_un.b_addr, (caddr_t)ump->um_fs, + (u_int)fs->fs_sbsize); + if (fs->fs_sbsize < SBSIZE) + bp->b_flags |= B_INVAL; + brelse(bp); + bp = NULL; + fs = ump->um_fs; + fs->fs_ronly = ronly; + if (ronly == 0) + fs->fs_fmod = 1; + if (havepart) { + dpart.part->p_fstype = FS_BSDFFS; + dpart.part->p_fsize = fs->fs_fsize; + dpart.part->p_frag = fs->fs_frag; + dpart.part->p_cpg = fs->fs_cpg; + } + blks = howmany(fs->fs_cssize, fs->fs_fsize); + base = space = (caddr_t)malloc((u_long)fs->fs_cssize, M_SUPERBLK, + M_WAITOK); + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, + NOCRED, &bp); + if (error) { + free((caddr_t)base, M_SUPERBLK); + goto out; + } + bcopy((caddr_t)bp->b_un.b_addr, space, (u_int)size); + fs->fs_csp[fragstoblks(fs, i)] = (struct csum *)space; + space += size; + brelse(bp); + bp = NULL; + } + mp->mnt_data = (qaddr_t)ump; + mp->mnt_stat.f_fsid.val[0] = (long)dev; + mp->mnt_stat.f_fsid.val[1] = MOUNT_UFS; + mp->mnt_flag |= MNT_LOCAL; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + devvp->v_specflags |= SI_MOUNTEDON; + + /* Sanity checks for old file systems. 
XXX */ + fs->fs_npsect = MAX(fs->fs_npsect, fs->fs_nsect); /* XXX */ + fs->fs_interleave = MAX(fs->fs_interleave, 1); /* XXX */ + if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ + fs->fs_nrpos = 8; /* XXX */ + return (0); +out: + if (bp) + brelse(bp); + if (needclose) + (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); + if (ump) { + free((caddr_t)ump->um_fs, M_SUPERBLK); + free((caddr_t)ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + } + return (error); +} + +/* + * Make a filesystem operational. + * Nothing to do at the moment. + */ +/* ARGSUSED */ +ufs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * unmount system call + */ +ufs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + register struct ufsmount *ump; + register struct fs *fs; + int i, error, ronly, flags = 0; + + if (mntflags & MNT_FORCE) { + if (!doforce || mp == rootfs) + return (EINVAL); + flags |= FORCECLOSE; + } + mntflushbuf(mp, 0); + if (mntinvalbuf(mp)) + return (EBUSY); + ump = VFSTOUFS(mp); +#ifdef QUOTA + if (mp->mnt_flag & MNT_QUOTA) { + if (error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) + return (error); + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] == NULLVP) + continue; + quotaoff(p, mp, i); + } + /* + * Here we fall through to vflush again to ensure + * that we have gotten rid of all the system vnodes. + */ + } +#endif + if (error = vflush(mp, NULLVP, flags)) + return (error); + fs = ump->um_fs; + ronly = !fs->fs_ronly; + ump->um_devvp->v_specflags &= ~SI_MOUNTEDON; + error = VOP_CLOSE(ump->um_devvp, ronly ? FREAD : FREAD|FWRITE, + NOCRED, p); + vrele(ump->um_devvp); + free((caddr_t)fs->fs_csp[0], M_SUPERBLK); + free((caddr_t)fs, M_SUPERBLK); + free((caddr_t)ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Check to see if a filesystem is mounted on a block device. 
+ */ +mountedon(vp) + register struct vnode *vp; +{ + register struct vnode *vq; + + if (vp->v_specflags & SI_MOUNTEDON) + return (EBUSY); + if (vp->v_flag & VALIASED) { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vq->v_specflags & SI_MOUNTEDON) + return (EBUSY); + } + } + return (0); +} + +/* + * Return root of a filesystem + */ +ufs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + register struct inode *ip; + struct inode *nip; + struct vnode tvp; + int error; + + tvp.v_mount = mp; + ip = VTOI(&tvp); + ip->i_vnode = &tvp; + ip->i_dev = VFSTOUFS(mp)->um_dev; + error = iget(ip, (ino_t)ROOTINO, &nip); + if (error) + return (error); + *vpp = ITOV(nip); + return (0); +} + +/* + * Do operations associated with quotas + */ +ufs_quotactl(mp, cmds, uid, arg, p) + struct mount *mp; + int cmds; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + struct ufsmount *ump = VFSTOUFS(mp); + int cmd, type, error; + +#ifndef QUOTA + return (EOPNOTSUPP); +#else + if (uid == (uid_t)(-1)) + uid = p->p_cred->p_ruid; + cmd = cmds >> SUBCMDSHIFT; + + switch (cmd) { + case Q_GETQUOTA: + case Q_SYNC: + if (uid == p->p_cred->p_ruid) + break; + /* fall through */ + default: + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + } + + type = cmd & SUBCMDMASK; + if ((u_int)type >= MAXQUOTAS) + return (EINVAL); + + switch (cmd) { + + case Q_QUOTAON: + return (quotaon(p, mp, type, arg)); + + case Q_QUOTAOFF: + if (vfs_busy(mp)) + return (0); + error = quotaoff(p, mp, type); + vfs_unbusy(mp); + return (error); + + case Q_SETQUOTA: + return (setquota(mp, uid, type, arg)); + + case Q_SETUSE: + return (setuse(mp, uid, type, arg)); + + case Q_GETQUOTA: + return (getquota(mp, uid, type, arg)); + + case Q_SYNC: + if (vfs_busy(mp)) + return (0); + error = qsync(mp); + vfs_unbusy(mp); + return (error); + + default: + return (EINVAL); + } + /* NOTREACHED */ +#endif +} + +/* + * Get file 
system statistics.
+ */
+ufs_statfs(mp, sbp, p)
+	struct mount *mp;
+	register struct statfs *sbp;
+	struct proc *p;
+{
+	register struct ufsmount *ump;
+	register struct fs *fs;
+
+	ump = VFSTOUFS(mp);
+	fs = ump->um_fs;
+	if (fs->fs_magic != FS_MAGIC)
+		panic("ufs_statfs");
+	sbp->f_type = MOUNT_UFS;
+	sbp->f_fsize = fs->fs_fsize;
+	sbp->f_bsize = fs->fs_bsize;
+	sbp->f_blocks = fs->fs_dsize;
+	/* free space in fragments: whole free blocks plus loose frags */
+	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
+		fs->fs_cstotal.cs_nffree;
+	/*
+	 * Space available to ordinary users: total minus the minfree
+	 * reserve minus what is in use.  NOTE(review): this goes
+	 * negative once the filesystem eats into its reserve.
+	 */
+	sbp->f_bavail = (fs->fs_dsize * (100 - fs->fs_minfree) / 100) -
+		(fs->fs_dsize - sbp->f_bfree);
+	sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO;
+	sbp->f_ffree = fs->fs_cstotal.cs_nifree;
+	/* copy the mount names only when filling a caller's buffer,
+	   not when refreshing mp->mnt_stat itself */
+	if (sbp != &mp->mnt_stat) {
+		bcopy((caddr_t)mp->mnt_stat.f_mntonname,
+			(caddr_t)&sbp->f_mntonname[0], MNAMELEN);
+		bcopy((caddr_t)mp->mnt_stat.f_mntfromname,
+			(caddr_t)&sbp->f_mntfromname[0], MNAMELEN);
+	}
+	return (0);
+}
+
+int syncprt = 0;
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+ufs_sync(mp, waitfor)
+	struct mount *mp;
+	int waitfor;
+{
+	register struct vnode *vp;
+	register struct inode *ip;
+	register struct ufsmount *ump = VFSTOUFS(mp);
+	register struct fs *fs;
+	int error, allerror = 0;
+
+	if (syncprt)
+		bufstats();
+	fs = ump->um_fs;
+	/*
+	 * Write back modified superblock.
+	 * Consistency check that the superblock
+	 * is still in the buffer cache.
+	 */
+	if (fs->fs_fmod != 0) {
+		if (fs->fs_ronly != 0) {		/* XXX */
+			printf("fs = %s\n", fs->fs_fsmnt);
+			panic("update: rofs mod");
+		}
+		fs->fs_fmod = 0;
+		fs->fs_time = time.tv_sec;
+		allerror = sbupdate(ump, waitfor);
+	}
+	/*
+	 * Write back each (modified) inode.
+ */ +loop: + for (vp = mp->mnt_mounth; vp; vp = vp->v_mountf) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + if (VOP_ISLOCKED(vp)) + continue; + ip = VTOI(vp); + if ((ip->i_flag & (IMOD|IACC|IUPD|ICHG)) == 0 && + vp->v_dirtyblkhd == NULL) + continue; + if (vget(vp)) + goto loop; + if (vp->v_dirtyblkhd) + vflushbuf(vp, 0); + if ((ip->i_flag & (IMOD|IACC|IUPD|ICHG)) && + (error = iupdat(ip, &time, &time, 0))) + allerror = error; + vput(vp); + } + /* + * Force stale file system control information to be flushed. + */ + vflushbuf(ump->um_devvp, waitfor == MNT_WAIT ? B_SYNC : 0); +#ifdef QUOTA + qsync(mp); +#endif + return (allerror); +} + +/* + * Write a superblock and associated information back to disk. + */ +sbupdate(mp, waitfor) + struct ufsmount *mp; + int waitfor; +{ + register struct fs *fs = mp->um_fs; + register struct buf *bp; + int blks; + caddr_t space; + int i, size, error = 0; + + bp = getblk(mp->um_devvp, SBLOCK, (int)fs->fs_sbsize); + bcopy((caddr_t)fs, bp->b_un.b_addr, (u_int)fs->fs_sbsize); + /* Restore compatibility to old file systems. XXX */ + if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ + bp->b_un.b_fs->fs_nrpos = -1; /* XXX */ + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = (caddr_t)fs->fs_csp[0]; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size); + bcopy(space, bp->b_un.b_addr, (u_int)size); + space += size; + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + } + return (error); +} + +/* + * Print out statistics on the current allocation of the buffer pool. + * Can be enabled to print out on every ``sync'' by setting "syncprt" + * above. 
+ */ +bufstats() +{ + int s, i, j, count; + register struct buf *bp, *dp; + int counts[MAXBSIZE/CLBYTES+1]; + static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" }; + + for (bp = bfreelist, i = 0; bp < &bfreelist[BQUEUES]; bp++, i++) { + count = 0; + for (j = 0; j <= MAXBSIZE/CLBYTES; j++) + counts[j] = 0; + s = splbio(); + for (dp = bp->av_forw; dp != bp; dp = dp->av_forw) { + counts[dp->b_bufsize/CLBYTES]++; + count++; + } + splx(s); + printf("%s: total-%d", bname[i], count); + for (j = 0; j <= MAXBSIZE/CLBYTES; j++) + if (counts[j] != 0) + printf(", %d-%d", j * CLBYTES, counts[j]); + printf("\n"); + } +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is in range + * - call iget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + * - check that the generation number matches + */ +ufs_fhtovp(mp, fhp, vpp) + register struct mount *mp; + struct fid *fhp; + struct vnode **vpp; +{ + register struct ufid *ufhp; + register struct fs *fs; + register struct inode *ip; + struct inode *nip; + struct vnode tvp; + int error; + + ufhp = (struct ufid *)fhp; + fs = VFSTOUFS(mp)->um_fs; + if (ufhp->ufid_ino < ROOTINO || + ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) { + *vpp = NULLVP; + return (EINVAL); + } + tvp.v_mount = mp; + ip = VTOI(&tvp); + ip->i_vnode = &tvp; + ip->i_dev = VFSTOUFS(mp)->um_dev; + if (error = iget(ip, ufhp->ufid_ino, &nip)) { + *vpp = NULLVP; + return (error); + } + ip = nip; + if (ip->i_mode == 0) { + iput(ip); + *vpp = NULLVP; + return (EINVAL); + } + if (ip->i_gen != ufhp->ufid_gen) { + iput(ip); + *vpp = NULLVP; + return (EINVAL); + } + *vpp = ITOV(ip); + return (0); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +ufs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + register struct inode *ip = VTOI(vp); + register struct ufid *ufhp; + + ufhp = (struct ufid *)fhp; + ufhp->ufid_len = sizeof(struct ufid); + 
ufhp->ufid_ino = ip->i_number; + ufhp->ufid_gen = ip->i_gen; + return (0); +} diff --git a/sys/ufs/ufs_vnops.c b/sys/ufs/ufs_vnops.c new file mode 100644 index 000000000000..4c254ad0d546 --- /dev/null +++ b/sys/ufs/ufs_vnops.c @@ -0,0 +1,1808 @@ +/* + * Copyright (c) 1982, 1986, 1989 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_vnops.c 7.64 (Berkeley) 5/16/91 + * $Id: ufs_vnops.c,v 1.5.2.1 1993/11/13 22:52:24 rgrimes Exp $ + */ + +#include "param.h" +#include "systm.h" +#include "namei.h" +#include "resourcevar.h" +#include "kernel.h" +#include "file.h" +#include "stat.h" +#include "buf.h" +#include "proc.h" +#include "conf.h" +#include "mount.h" +#include "vnode.h" +#include "specdev.h" +#include "fifo.h" +#include "malloc.h" + +#include "lockf.h" +#include "quota.h" +#include "inode.h" +#include "dir.h" +#include "fs.h" + +/* + * Create a regular file + */ +ufs_create(ndp, vap, p) + struct nameidata *ndp; + struct vattr *vap; + struct proc *p; +{ + struct inode *ip; + int error; + + if (error = maknode(MAKEIMODE(vap->va_type, vap->va_mode), ndp, &ip)) + return (error); + ndp->ni_vp = ITOV(ip); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +ufs_mknod(ndp, vap, cred, p) + struct nameidata *ndp; + struct ucred *cred; + struct vattr *vap; + struct proc *p; +{ + register struct vnode *vp; + struct inode *ip; + int error; + + if (error = maknode(MAKEIMODE(vap->va_type, vap->va_mode), ndp, &ip)) + return (error); + ip->i_flag |= IACC|IUPD|ICHG; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. 
+ */ + ip->i_rdev = vap->va_rdev; + } + /* + * Remove inode so that it will be reloaded by iget and + * checked to see if it is an alias of an existing entry + * in the inode cache. + */ + vp = ITOV(ip); + vput(vp); + vp->v_type = VNON; + vgone(vp); + return (0); +} + +/* + * Open called. + * + * Nothing to do. + */ +/* ARGSUSED */ +ufs_open(vp, mode, cred, p) + struct vnode *vp; + int mode; + struct ucred *cred; + struct proc *p; +{ + + return (0); +} + +/* + * Close called + * + * Update the times on the inode. + */ +/* ARGSUSED */ +ufs_close(vp, fflag, cred, p) + struct vnode *vp; + int fflag; + struct ucred *cred; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + + if (vp->v_usecount > 1 && !(ip->i_flag & ILOCKED)) + ITIMES(ip, &time, &time); + return (0); +} + +/* + * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. + * The mode is shifted to select the owner/group/other fields. The + * super user is granted all permissions. + */ +ufs_access(vp, mode, cred, p) + struct vnode *vp; + register int mode; + struct ucred *cred; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + register gid_t *gp; + int i, error; + +#ifdef DIAGNOSTIC + if (!VOP_ISLOCKED(vp)) { + vprint("ufs_access: not locked", vp); + panic("ufs_access: not locked"); + } +#endif +#ifdef QUOTA + if (mode & VWRITE) { + switch (vp->v_type) { + case VREG: case VDIR: case VLNK: + if (error = getinoquota(ip)) + return (error); + } + } +#endif /* QUOTA */ + /* + * If you're the super-user, you always get access. + */ + if (cred->cr_uid == 0) + return (0); + /* + * Access check is based on only one of owner, group, public. + * If not owner, then check group. If not a member of the + * group, then check public access. 
+ */ + if (cred->cr_uid != ip->i_uid) { + mode >>= 3; + gp = cred->cr_groups; + for (i = 0; i < cred->cr_ngroups; i++, gp++) + if (ip->i_gid == *gp) + goto found; + mode >>= 3; +found: + ; + } + if ((ip->i_mode & mode) == mode) + return (0); + return (EACCES); +} + +/* ARGSUSED */ +ufs_getattr(vp, vap, cred, p) + struct vnode *vp; + register struct vattr *vap; + struct ucred *cred; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + + ITIMES(ip, &time, &time); + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)ip->i_rdev; +#ifdef tahoe + vap->va_size = ip->i_size; + vap->va_size_rsv = 0; +#else + vap->va_qsize = ip->i_din.di_qsize; +#endif + vap->va_atime.tv_sec = ip->i_atime; + vap->va_atime.tv_usec = 0; + vap->va_mtime.tv_sec = ip->i_mtime; + vap->va_mtime.tv_usec = 0; + vap->va_ctime.tv_sec = ip->i_ctime; + vap->va_ctime.tv_usec = 0; + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = ip->i_fs->fs_bsize; + vap->va_bytes = dbtob(ip->i_blocks); + vap->va_bytes_rsv = 0; + vap->va_type = vp->v_type; + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +ufs_setattr(vp, vap, cred, p) + register struct vnode *vp; + register struct vattr *vap; + register struct ucred *cred; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + int error = 0; + + /* + * Check for unsetable attributes. 
+ */ + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + /* + * Go through the fields and update iff not VNOVAL. + */ + if (vap->va_uid != (u_short)VNOVAL || vap->va_gid != (u_short)VNOVAL) + if (error = chown1(vp, vap->va_uid, vap->va_gid, p)) + return (error); + if (vap->va_size != VNOVAL) { + if (vp->v_type == VDIR) + return (EISDIR); + if (error = itrunc(ip, vap->va_size, 0)) /* XXX IO_SYNC? */ + return (error); + } + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { + if (cred->cr_uid != ip->i_uid && + (error = suser(cred, &p->p_acflag))) + return (error); + if (vap->va_atime.tv_sec != VNOVAL) + ip->i_flag |= IACC; + if (vap->va_mtime.tv_sec != VNOVAL) + ip->i_flag |= IUPD; + ip->i_flag |= ICHG; + if (error = iupdat(ip, &vap->va_atime, &vap->va_mtime, 1)) + return (error); + } + if (vap->va_mode != (u_short)VNOVAL) + error = chmod1(vp, (int)vap->va_mode, p); + if (vap->va_flags != VNOVAL) { + if (cred->cr_uid != ip->i_uid && + (error = suser(cred, &p->p_acflag))) + return (error); + if (cred->cr_uid == 0) { + ip->i_flags = vap->va_flags; + } else { + ip->i_flags &= 0xffff0000; + ip->i_flags |= (vap->va_flags & 0xffff); + } + ip->i_flag |= ICHG; + } + return (error); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. 
+ */ +chmod1(vp, mode, p) + register struct vnode *vp; + register int mode; + struct proc *p; +{ + register struct ucred *cred = p->p_ucred; + register struct inode *ip = VTOI(vp); + int error; + + if (cred->cr_uid != ip->i_uid && + (error = suser(cred, &p->p_acflag))) + return (error); + if (cred->cr_uid) { + if (vp->v_type != VDIR && (mode & ISVTX)) + return (EFTYPE); + if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) + return (EPERM); + } + ip->i_mode &= ~07777; + ip->i_mode |= mode & 07777; + ip->i_flag |= ICHG; + if ((vp->v_flag & VTEXT) && (ip->i_mode & ISVTX) == 0) + (void) vnode_pager_uncache(vp); + return (0); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. + */ +chown1(vp, uid, gid, p) + register struct vnode *vp; + uid_t uid; + gid_t gid; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + register struct ucred *cred = p->p_ucred; + uid_t ouid; + gid_t ogid; + int error = 0; +#ifdef QUOTA + register int i; + long change; +#endif + + if (uid == (u_short)VNOVAL) + uid = ip->i_uid; + if (gid == (u_short)VNOVAL) + gid = ip->i_gid; + /* + * If we don't own the file, are trying to change the owner + * of the file, or are not a member of the target group, + * the caller must be superuser or the call fails. 
+ */ + if ((cred->cr_uid != ip->i_uid || uid != ip->i_uid || + !groupmember((gid_t)gid, cred)) && + (error = suser(cred, &p->p_acflag))) + return (error); + ouid = ip->i_uid; + ogid = ip->i_gid; +#ifdef QUOTA + if (error = getinoquota(ip)) + return (error); + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + change = ip->i_blocks; + (void) chkdq(ip, -change, cred, CHOWN); + (void) chkiq(ip, -1, cred, CHOWN); + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } +#endif + ip->i_uid = uid; + ip->i_gid = gid; +#ifdef QUOTA + if ((error = getinoquota(ip)) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { + if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) + goto good; + else + (void) chkdq(ip, -change, cred, CHOWN|FORCE); + } + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } + ip->i_uid = ouid; + ip->i_gid = ogid; + if (getinoquota(ip) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + (void) chkdq(ip, change, cred, FORCE|CHOWN); + (void) chkiq(ip, 1, cred, FORCE|CHOWN); + (void) getinoquota(ip); + } + return (error); +good: + if (getinoquota(ip)) + panic("chown: lost quota"); +#endif /* QUOTA */ + if (ouid != uid || ogid != gid) + ip->i_flag |= ICHG; + if (ouid != uid && cred->cr_uid != 0) + ip->i_mode &= ~ISUID; + if (ogid != gid && cred->cr_uid != 0) + ip->i_mode &= ~ISGID; + return (0); +} + +/* + * Vnode op for reading. 
+ */ +/* ARGSUSED */ +ufs_read(vp, uio, ioflag, cred) + struct vnode *vp; + register struct uio *uio; + int ioflag; + struct ucred *cred; +{ + register struct inode *ip = VTOI(vp); + register struct fs *fs; + struct buf *bp; + daddr_t lbn, bn, rablock; + int size, diff, error = 0; + long n, on, type; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("ufs_read mode"); + type = ip->i_mode & IFMT; + if (type != IFDIR && type != IFREG && type != IFLNK) + panic("ufs_read type"); +#endif + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset < 0) + return (EINVAL); + ip->i_flag |= IACC; + fs = ip->i_fs; + do { + lbn = lblkno(fs, uio->uio_offset); + on = blkoff(fs, uio->uio_offset); + n = MIN((unsigned)(fs->fs_bsize - on), uio->uio_resid); + diff = ip->i_size - uio->uio_offset; + if (diff <= 0) + return (0); + if (diff < n) + n = diff; + size = blksize(fs, ip, lbn); + rablock = lbn + 1; + if (vp->v_lastr + 1 == lbn && + lblktosize(fs, rablock) < ip->i_size) + error = breada(ITOV(ip), lbn, size, rablock, + blksize(fs, ip, rablock), NOCRED, &bp); + else + error = bread(ITOV(ip), lbn, size, NOCRED, &bp); + vp->v_lastr = lbn; + n = MIN(n, size - bp->b_resid); + if (error) { + brelse(bp); + return (error); + } + error = uiomove(bp->b_un.b_addr + on, (int)n, uio); +#if OMIT /* 20 Aug 92*/ + if (n + on == fs->fs_bsize || uio->uio_offset == ip->i_size) + bp->b_flags |= B_AGE; +#endif /* OMIT*/ + brelse(bp); + } while (error == 0 && uio->uio_resid > 0 && n != 0); + return (error); +} + +/* + * Vnode op for writing. 
+ */ +ufs_write(vp, uio, ioflag, cred) + register struct vnode *vp; + struct uio *uio; + int ioflag; + struct ucred *cred; +{ + struct proc *p = uio->uio_procp; + register struct inode *ip = VTOI(vp); + register struct fs *fs; + struct buf *bp; + daddr_t lbn, bn; + u_long osize; + int n, on, flags; + int size, resid, error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("ufs_write mode"); +#endif + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->i_size; + /* fall through */ + case VLNK: + break; + + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("ufs_write nonsync dir write"); + break; + + default: + panic("ufs_write type"); + } + if (uio->uio_offset < 0) + return (EINVAL); + if (uio->uio_resid == 0) + return (0); + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, i don't think it matters + */ + if (vp->v_type == VREG && p && + uio->uio_offset + uio->uio_resid > + p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + psignal(p, SIGXFSZ); + return (EFBIG); + } + resid = uio->uio_resid; + osize = ip->i_size; + fs = ip->i_fs; + flags = 0; + if (ioflag & IO_SYNC) + flags = B_SYNC; + + (void) vnode_pager_uncache(vp); + do { + lbn = lblkno(fs, uio->uio_offset); + on = blkoff(fs, uio->uio_offset); + n = MIN((unsigned)(fs->fs_bsize - on), uio->uio_resid); + if (n < fs->fs_bsize) + flags |= B_CLRBUF; + else + flags &= ~B_CLRBUF; + if (error = balloc(ip, lbn, (int)(on + n), &bp, flags)) + break; + bn = bp->b_blkno; + if (uio->uio_offset + n > ip->i_size) { + ip->i_size = uio->uio_offset + n; + vnode_pager_setsize(vp, ip->i_size); + } + size = blksize(fs, ip, lbn); + n = MIN(n, size - bp->b_resid); + error = uiomove(bp->b_un.b_addr + on, n, uio); + if (ioflag & IO_SYNC) + (void) bwrite(bp); + else if (n + on == fs->fs_bsize) { + bp->b_flags |= B_AGE; + bawrite(bp); + } else + bdwrite(bp); + ip->i_flag |= IUPD|ICHG; + if (cred->cr_uid != 0) + ip->i_mode &= ~(ISUID|ISGID); + } while 
(error == 0 && uio->uio_resid > 0 && n != 0);
+	/*
+	 * On a fault, or on any error for an IO_UNIT write, roll the
+	 * file back to its original size and restore the uio offsets
+	 * so the caller sees an all-or-nothing result.
+	 */
+	if (error == EFAULT || error && (ioflag & IO_UNIT)) {
+		(void) itrunc(ip, osize, ioflag & IO_SYNC);
+		uio->uio_offset -= resid - uio->uio_resid;
+		uio->uio_resid = resid;
+	}
+	if (!error && (ioflag & IO_SYNC))
+		error = iupdat(ip, &time, &time, 1);
+	return (error);
+}
+
+/* ARGSUSED */
+ufs_ioctl(vp, com, data, fflag, cred, p)
+	struct vnode *vp;
+	int com;
+	caddr_t data;
+	int fflag;
+	struct ucred *cred;
+	struct proc *p;
+{
+
+	return (ENOTTY);
+}
+
+/* ARGSUSED */
+ufs_select(vp, which, fflags, cred, p)
+	struct vnode *vp;
+	int which, fflags;
+	struct ucred *cred;
+	struct proc *p;
+{
+
+	/*
+	 * We should really check to see if I/O is possible.
+	 */
+	return (1);
+}
+
+/*
+ * Mmap a file
+ *
+ * NB Currently unsupported.
+ */
+/* ARGSUSED */
+ufs_mmap(vp, fflags, cred, p)
+	struct vnode *vp;
+	int fflags;
+	struct ucred *cred;
+	struct proc *p;
+{
+
+	return (EINVAL);
+}
+
+/*
+ * Synch an open file.
+ */
+/* ARGSUSED */
+ufs_fsync(vp, fflags, cred, waitfor, p)
+	struct vnode *vp;
+	int fflags;
+	struct ucred *cred;
+	int waitfor;
+	struct proc *p;
+{
+	struct inode *ip = VTOI(vp);
+
+	if (fflags & FWRITE)
+		ip->i_flag |= ICHG;
+	/* flush dirty buffers; synchronously iff MNT_WAIT was requested */
+	vflushbuf(vp, waitfor == MNT_WAIT ? B_SYNC : 0);
+	return (iupdat(ip, &time, &time, waitfor == MNT_WAIT));
+}
+
+/*
+ * Seek on a file
+ *
+ * Nothing to do, so just return.
+ */
+/* ARGSUSED */
+ufs_seek(vp, oldoff, newoff, cred)
+	struct vnode *vp;
+	off_t oldoff, newoff;
+	struct ucred *cred;
+{
+
+	return (0);
+}
+
+/*
+ * ufs remove
+ * Hard to avoid races here, especially
+ * in unlinking directories.
+ */
+ufs_remove(ndp, p)
+	struct nameidata *ndp;
+	struct proc *p;
+{
+	register struct inode *ip, *dp;
+	int error;
+
+	ip = VTOI(ndp->ni_vp);
+	dp = VTOI(ndp->ni_dvp);
+	error = dirremove(ndp);
+	if (!error) {
+		ip->i_nlink--;
+		ip->i_flag |= ICHG;
+	}
+	/*
+	 * When the file and its directory are the same inode (e.g.
+	 * removing "."), avoid unlocking/releasing it twice: drop the
+	 * extra reference only, then iput the single inode once below.
+	 */
+	if (dp == ip)
+		vrele(ITOV(ip));
+	else
+		iput(ip);
+	iput(dp);
+	return (error);
+}
+
+/*
+ * link vnode call
+ */
+ufs_link(vp, ndp, p)
+	register struct vnode *vp;
+	register struct nameidata *ndp;
+	struct proc *p;
+{
+	register struct inode *ip = VTOI(vp);
+	int error;
+
+#ifdef DIAGNOSTIC
+	if ((ndp->ni_nameiop & HASBUF) == 0)
+		panic("ufs_link: no name");
+#endif
+	if ((unsigned short)ip->i_nlink >= LINK_MAX) {
+		free(ndp->ni_pnbuf, M_NAMEI);
+		return (EMLINK);
+	}
+	/* lock the source inode unless it is the (already locked)
+	   target directory itself */
+	if (ndp->ni_dvp != vp)
+		ILOCK(ip);
+	ip->i_nlink++;
+	ip->i_flag |= ICHG;
+	error = iupdat(ip, &time, &time, 1);
+	if (!error)
+		error = direnter(ip, ndp);
+	if (ndp->ni_dvp != vp)
+		IUNLOCK(ip);
+	FREE(ndp->ni_pnbuf, M_NAMEI);
+	vput(ndp->ni_dvp);
+	if (error) {
+		/* back out the in-core link count bump; the on-disk
+		   copy is corrected by a later iupdat */
+		ip->i_nlink--;
+		ip->i_flag |= ICHG;
+	}
+	return (error);
+}
+
+/*
+ * Rename system call.
+ * 	rename("foo", "bar");
+ * is essentially
+ *	unlink("bar");
+ *	link("foo", "bar");
+ *	unlink("foo");
+ * but ``atomically''.  Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time.  Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *    target.  This also ensure the inode won't be deleted out
+ *    from underneath us while we work (it may be truncated by
+ *    a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination.  If destination already exists,
+ *    delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ *    directory was moved and the parent of the destination
+ *    is different from the source, patch the ".." entry in the
+ *    directory.
+ */ +ufs_rename(fndp, tndp, p) + register struct nameidata *fndp, *tndp; + struct proc *p; +{ + register struct inode *ip, *xp, *dp; + struct dirtemplate dirbuf; + int doingdirectory = 0, oldparent = 0, newparent = 0; + int error = 0; + +#ifdef DIAGNOSTIC + if ((tndp->ni_nameiop & HASBUF) == 0 || + (fndp->ni_nameiop & HASBUF) == 0) + panic("ufs_rename: no name"); +#endif + dp = VTOI(fndp->ni_dvp); + ip = VTOI(fndp->ni_vp); + /* + * Check if just deleting a link name. + */ + if (fndp->ni_vp == tndp->ni_vp) { + VOP_ABORTOP(tndp); + vput(tndp->ni_dvp); + vput(tndp->ni_vp); + vrele(fndp->ni_dvp); + if ((ip->i_mode&IFMT) == IFDIR) { + VOP_ABORTOP(fndp); + vrele(fndp->ni_vp); + return (EINVAL); + } + doingdirectory = 0; + goto unlinkit; + } + ILOCK(ip); + if ((ip->i_mode&IFMT) == IFDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fndp->ni_namelen == 1 && fndp->ni_ptr[0] == '.') || + dp == ip || fndp->ni_isdotdot || (ip->i_flag & IRENAME)) { + VOP_ABORTOP(tndp); + vput(tndp->ni_dvp); + if (tndp->ni_vp) + vput(tndp->ni_vp); + VOP_ABORTOP(fndp); + vrele(fndp->ni_dvp); + vput(fndp->ni_vp); + return (EINVAL); + } + ip->i_flag |= IRENAME; + oldparent = dp->i_number; + doingdirectory++; + } + vrele(fndp->ni_dvp); + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + ip->i_nlink++; + ip->i_flag |= ICHG; + error = iupdat(ip, &time, &time, 1); + IUNLOCK(ip); + + /* + * When the target exists, both the directory + * and target vnodes are returned locked. + */ + dp = VTOI(tndp->ni_dvp); + xp = NULL; + if (tndp->ni_vp) + xp = VTOI(tndp->ni_vp); + /* + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory heirarchy above the target, as this would + * orphan everything below the source directory. 
Also + * the user must have write permission in the source so + * as to be able to change "..". We must repeat the call + * to namei, as the parent directory is unlocked by the + * call to checkpath(). + */ + if (oldparent != dp->i_number) + newparent = dp->i_number; + if (doingdirectory && newparent) { + VOP_LOCK(fndp->ni_vp); + error = ufs_access(fndp->ni_vp, VWRITE, tndp->ni_cred, p); + VOP_UNLOCK(fndp->ni_vp); + if (error) + goto bad; + if (xp != NULL) + iput(xp); + if (error = checkpath(ip, dp, tndp->ni_cred)) + goto out; + if ((tndp->ni_nameiop & SAVESTART) == 0) + panic("ufs_rename: lost to startdir"); + if (error = lookup(tndp, p)) + goto out; + dp = VTOI(tndp->ni_dvp); + xp = NULL; + if (tndp->ni_vp) + xp = VTOI(tndp->ni_vp); + } + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (xp == NULL) { + if (dp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. + */ + if (doingdirectory && newparent) { + if ((unsigned short)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + dp->i_nlink++; + dp->i_flag |= ICHG; + if (error = iupdat(dp, &time, &time, 1)) + goto bad; + } + if (error = direnter(ip, tndp)) { + if (doingdirectory && newparent) { + dp->i_nlink--; + dp->i_flag |= ICHG; + (void) iupdat(dp, &time, &time, 1); + } + goto bad; + } + iput(dp); + } else { + if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Short circuit rename(foo, foo). 
+ */ + if (xp->i_number == ip->i_number) + panic("rename: same file"); + /* + * If the parent directory is "sticky", then the user must + * own the parent directory, or the destination of the rename, + * otherwise the destination may not be changed (except by + * root). This implements append-only directories. + */ + if ((dp->i_mode & ISVTX) && tndp->ni_cred->cr_uid != 0 && + tndp->ni_cred->cr_uid != dp->i_uid && + xp->i_uid != tndp->ni_cred->cr_uid) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). + */ + if ((xp->i_mode&IFMT) == IFDIR) { + if (!dirempty(xp, dp->i_number, tndp->ni_cred) || + xp->i_nlink > 2) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(ITOV(dp)); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + if (error = dirrewrite(dp, ip, tndp)) + goto bad; + /* + * If the target directory is in the same + * directory as the source directory, + * decrement the link count on the parent + * of the target directory. + */ + if (doingdirectory && !newparent) { + dp->i_nlink--; + dp->i_flag |= ICHG; + } + vput(ITOV(dp)); + /* + * Adjust the link count of the target to + * reflect the dirrewrite above. If this is + * a directory it is empty and there are + * no links to it, so we can squash the inode and + * any space associated with it. We disallowed + * renaming over top of a directory with links to + * it above, as the remaining link would point to + * a directory without "." or ".." entries. + */ + xp->i_nlink--; + if (doingdirectory) { + if (--xp->i_nlink != 0) + panic("rename: linked directory"); + error = itrunc(xp, (u_long)0, IO_SYNC); + } + xp->i_flag |= ICHG; + iput(xp); + xp = NULL; + } + + /* + * 3) Unlink the source. 
+ */ +unlinkit: + fndp->ni_nameiop &= ~MODMASK; + fndp->ni_nameiop |= LOCKPARENT | LOCKLEAF; + if ((fndp->ni_nameiop & SAVESTART) == 0) + panic("ufs_rename: lost from startdir"); + (void) lookup(fndp, p); + if (fndp->ni_vp != NULL) { + xp = VTOI(fndp->ni_vp); + dp = VTOI(fndp->ni_dvp); + } else { + /* + * From name has disappeared. + */ + if (doingdirectory) + panic("rename: lost dir entry"); + vrele(ITOV(ip)); + return (0); + } + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; its link + * count of three would cause a rmdir to fail with ENOTEMPTY. + * The IRENAME flag ensures that it cannot be moved by another + * rename. + */ + if (xp != ip) { + if (doingdirectory) + panic("rename: lost dir entry"); + } else { + /* + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. + */ + if (doingdirectory && newparent) { + dp->i_nlink--; + dp->i_flag |= ICHG; + error = vn_rdwr(UIO_READ, ITOV(xp), (caddr_t)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, + tndp->ni_cred, (int *)0, (struct proc *)0); + if (error == 0) { + if (dirbuf.dotdot_namlen != 2 || + dirbuf.dotdot_name[0] != '.' 
|| + dirbuf.dotdot_name[1] != '.') { + dirbad(xp, 12, "rename: mangled dir"); + } else { + dirbuf.dotdot_ino = newparent; + (void) vn_rdwr(UIO_WRITE, ITOV(xp), + (caddr_t)&dirbuf, + sizeof (struct dirtemplate), + (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, + tndp->ni_cred, (int *)0, + (struct proc *)0); + cache_purge(ITOV(dp)); + } + } + } + error = dirremove(fndp); + if (!error) { + xp->i_nlink--; + xp->i_flag |= ICHG; + } + xp->i_flag &= ~IRENAME; + } + if (dp) + vput(ITOV(dp)); + if (xp) + vput(ITOV(xp)); + vrele(ITOV(ip)); + return (error); + +bad: + if (xp) + vput(ITOV(xp)); + vput(ITOV(dp)); +out: + ip->i_nlink--; + ip->i_flag |= ICHG; + vrele(ITOV(ip)); + return (error); +} + +/* + * A virgin directory (no blushing please). + */ +struct dirtemplate mastertemplate = { + 0, 12, 1, ".", + 0, DIRBLKSIZ - 12, 2, ".." +}; + +/* + * Mkdir system call + */ +ufs_mkdir(ndp, vap, p) + struct nameidata *ndp; + struct vattr *vap; + struct proc *p; +{ + register struct inode *ip, *dp; + struct inode *tip; + struct vnode *dvp; + struct dirtemplate dirtemplate; + int error; + int dmode; + +#ifdef DIAGNOSTIC + if ((ndp->ni_nameiop & HASBUF) == 0) + panic("ufs_mkdir: no name"); +#endif + dvp = ndp->ni_dvp; + dp = VTOI(dvp); + if ((unsigned short)dp->i_nlink >= LINK_MAX) { + free(ndp->ni_pnbuf, M_NAMEI); + iput(dp); + return (EMLINK); + } + dmode = vap->va_mode&0777; + dmode |= IFDIR; + /* + * Must simulate part of maknode here to acquire the inode, but + * not have it entered in the parent directory. The entry is made + * later after writing "." and ".." entries. 
+ */ + if (error = ialloc(dp, dirpref(dp->i_fs), dmode, ndp->ni_cred, &tip)) { + free(ndp->ni_pnbuf, M_NAMEI); + iput(dp); + return (error); + } + ip = tip; + ip->i_uid = ndp->ni_cred->cr_uid; + ip->i_gid = dp->i_gid; +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, ndp->ni_cred, 0))) { + free(ndp->ni_pnbuf, M_NAMEI); + ifree(ip, ip->i_number, dmode); + iput(ip); + iput(dp); + return (error); + } +#endif + ip->i_flag |= IACC|IUPD|ICHG; + ip->i_mode = dmode; + ITOV(ip)->v_type = VDIR; /* Rest init'd in iget() */ + ip->i_nlink = 2; + error = iupdat(ip, &time, &time, 1); + + /* + * Bump link count in parent directory + * to reflect work done below. Should + * be done before reference is created + * so reparation is possible if we crash. + */ + dp->i_nlink++; + dp->i_flag |= ICHG; + if (error = iupdat(dp, &time, &time, 1)) + goto bad; + + /* + * Initialize directory with "." + * and ".." from static template. + */ + dirtemplate = mastertemplate; + dirtemplate.dot_ino = ip->i_number; + dirtemplate.dotdot_ino = dp->i_number; + error = vn_rdwr(UIO_WRITE, ITOV(ip), (caddr_t)&dirtemplate, + sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, ndp->ni_cred, (int *)0, (struct proc *)0); + if (error) { + dp->i_nlink--; + dp->i_flag |= ICHG; + goto bad; + } + if (DIRBLKSIZ > dp->i_fs->fs_fsize) { + panic("mkdir: blksize"); /* XXX - should grow w/balloc() */ + } else { + ip->i_size = DIRBLKSIZ; + ip->i_flag |= ICHG; + } + /* + * Directory all set up, now + * install the entry for it in + * the parent directory. + */ + if (error = direnter(ip, ndp)) { + dp->i_nlink--; + dp->i_flag |= ICHG; + } +bad: + /* + * No need to do an explicit itrunc here, + * vrele will do this for us because we set + * the link count to 0. + */ + if (error) { + ip->i_nlink = 0; + ip->i_flag |= ICHG; + iput(ip); + } else + ndp->ni_vp = ITOV(ip); + FREE(ndp->ni_pnbuf, M_NAMEI); + iput(dp); + return (error); +} + +/* + * Rmdir system call. 
 */
ufs_rmdir(ndp, p)
	register struct nameidata *ndp;
	struct proc *p;
{
	register struct inode *ip, *dp;
	int error = 0;

	ip = VTOI(ndp->ni_vp);
	dp = VTOI(ndp->ni_dvp);
	/*
	 * No rmdir "." please.
	 * (Target and parent resolving to the same inode.)
	 */
	if (dp == ip) {
		vrele(ITOV(dp));
		iput(ip);
		return (EINVAL);
	}
	/*
	 * Verify the directory is empty (and valid).
	 * (Rmdir ".." won't be valid since
	 * ".." will contain a reference to
	 * the current directory and thus be
	 * non-empty.)  A link count other than 2
	 * means there is at least one subdirectory.
	 */
	if (ip->i_nlink != 2 || !dirempty(ip, dp->i_number, ndp->ni_cred)) {
		error = ENOTEMPTY;
		goto out;
	}
	/*
	 * Delete reference to directory before purging
	 * inode.  If we crash in between, the directory
	 * will be reattached to lost+found.
	 */
	if (error = dirremove(ndp))
		goto out;
	dp->i_nlink--;
	dp->i_flag |= ICHG;
	cache_purge(ITOV(dp));
	iput(dp);
	ndp->ni_dvp = NULL;	/* tells the "out:" path dp is already released */
	/*
	 * Truncate inode.  The only stuff left
	 * in the directory is "." and "..".  The
	 * "." reference is inconsequential since
	 * we're quashing it.  The ".." reference
	 * has already been adjusted above.  We've
	 * removed the "." reference and the reference
	 * in the parent directory, but there may be
	 * other hard links so decrement by 2 and
	 * worry about them later.
 */
	ip->i_nlink -= 2;
	error = itrunc(ip, (u_long)0, IO_SYNC);
	cache_purge(ITOV(ip));
out:
	/* ni_dvp is NULL iff the parent was already released above. */
	if (ndp->ni_dvp)
		iput(dp);
	iput(ip);
	return (error);
}

/*
 * symlink -- make a symbolic link
 */
ufs_symlink(ndp, vap, target, p)
	struct nameidata *ndp;
	struct vattr *vap;
	char *target;
	struct proc *p;
{
	struct inode *ip;
	int len = strlen(target);
	int error;

	error = maknode(IFLNK | vap->va_mode, ndp, &ip);
	if (error)
		return (error);
#ifdef FASTLINKS
	/*
	 * Short targets are stored directly in the inode
	 * (a "fast symlink") rather than in a data block;
	 * di_spare[0] records the stored length.
	 */
	if (len <= MAXFASTLINK) {
		ip->i_din.di_spare[0] = len;
		ip->i_size = len;
		bcopy(target, ip->i_symlink, len);
		ip->i_flag |= ICHG;
		error = iupdat(ip, &time, &time, 1);
	} else
#endif
	error = vn_rdwr(UIO_WRITE, ITOV(ip), target, len, (off_t)0,
		UIO_SYSSPACE, IO_NODELOCKED, ndp->ni_cred, (int *)0,
		(struct proc *)0);
	iput(ip);
	return (error);
}

/*
 * Vnode op for reading directories.
 *
 * Transfers are trimmed to whole DIRBLKSIZ units and must start on a
 * DIRBLKSIZ boundary; any residue is restored to the uio afterwards.
 */
ufs_readdir(vp, uio, cred, eofflagp)
	struct vnode *vp;
	register struct uio *uio;
	struct ucred *cred;
	int *eofflagp;
{
	int count, lost, error;

	count = uio->uio_resid;
	count &= ~(DIRBLKSIZ - 1);
	lost = uio->uio_resid - count;
	if (count < DIRBLKSIZ || (uio->uio_offset & (DIRBLKSIZ - 1)))
		return (EINVAL);
	uio->uio_resid = count;
	uio->uio_iov->iov_len = count;
	error = ufs_read(vp, uio, 0, cred);
	uio->uio_resid += lost;
	/* At or past the directory's size means no further entries. */
	if ((VTOI(vp)->i_size - uio->uio_offset) <= 0)
		*eofflagp = 1;
	else
		*eofflagp = 0;
	return (error);
}

/*
 * Return target name of a symbolic link
 */
ufs_readlink(vp, uiop, cred)
	struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
{
	struct inode *ip = VTOI(vp);

	/* Fast symlinks live in the inode itself; others in data blocks. */
	if (FASTLINK(ip))
		return (uiomove(ip->i_symlink, ip->i_size, uiop));
	else
		return (ufs_read(vp, uiop, 0, cred));
}

/*
 * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually
 * done. If a buffer has been saved in anticipation of a CREATE, delete it.
 */
/* ARGSUSED */
ufs_abortop(ndp)
	struct nameidata *ndp;
{

	/*
	 * Free the pathname buffer unless SAVESTART indicates it is
	 * to be reused (presumably by a later lookup — confirm against
	 * the namei contract).
	 */
	if ((ndp->ni_nameiop & (HASBUF | SAVESTART)) == HASBUF)
		FREE(ndp->ni_pnbuf, M_NAMEI);
	return (0);
}

/*
 * Lock an inode.
 */
ufs_lock(vp)
	struct vnode *vp;
{
	register struct inode *ip = VTOI(vp);

	ILOCK(ip);
	return (0);
}

/*
 * Unlock an inode.
 */
ufs_unlock(vp)
	struct vnode *vp;
{
	register struct inode *ip = VTOI(vp);

	if (!(ip->i_flag & ILOCKED))
		panic("ufs_unlock NOT LOCKED");
	IUNLOCK(ip);
	return (0);
}

/*
 * Check for a locked inode.
 */
ufs_islocked(vp)
	struct vnode *vp;
{

	if (VTOI(vp)->i_flag & ILOCKED)
		return (1);
	return (0);
}

/*
 * Get access to bmap
 *
 * Returns the device vnode through vpp (if wanted) and the physical
 * block for logical block bn through bnp; a NULL bnp means the caller
 * only wants the device vnode.
 */
ufs_bmap(vp, bn, vpp, bnp)
	struct vnode *vp;
	daddr_t bn;
	struct vnode **vpp;
	daddr_t *bnp;
{
	struct inode *ip = VTOI(vp);

	if (vpp != NULL)
		*vpp = ip->i_devvp;
	if (bnp == NULL)
		return (0);
	return (bmap(ip, bn, bnp));
}

/*
 * Calculate the logical to physical mapping if not done already,
 * then call the device strategy routine.
+ */ +int checkoverlap = 0; + +ufs_strategy(bp) + register struct buf *bp; +{ + register struct inode *ip = VTOI(bp->b_vp); + struct vnode *vp; + int error; + + if (bp->b_vp->v_type == VBLK || bp->b_vp->v_type == VCHR) + panic("ufs_strategy: spec"); + if (bp->b_blkno == bp->b_lblkno) { + if (error = bmap(ip, bp->b_lblkno, &bp->b_blkno)) + return (error); + if ((long)bp->b_blkno == -1) + clrbuf(bp); + } + if ((long)bp->b_blkno == -1) { + biodone(bp); + return (0); + } +#ifdef DIAGNOSTIC + if (checkoverlap) { + register struct buf *ep; + struct buf *ebp; + daddr_t start, last; + + ebp = &buf[nbuf]; + start = bp->b_blkno; + last = start + btodb(bp->b_bcount) - 1; + for (ep = buf; ep < ebp; ep++) { + if (ep == bp || (ep->b_flags & B_INVAL) || + ep->b_vp == NULLVP) + continue; + if (VOP_BMAP(ep->b_vp, (daddr_t)0, &vp, (daddr_t)0)) + continue; + if (vp != ip->i_devvp) + continue; + /* look for overlap */ + if (ep->b_bcount == 0 || ep->b_blkno > last || + ep->b_blkno + btodb(ep->b_bcount) <= start) + continue; + vprint("Disk overlap", vp); + printf("\tstart %d, end %d overlap start %d, end %d\n", + start, last, ep->b_blkno, + ep->b_blkno + btodb(ep->b_bcount) - 1); + panic("Disk buffer overlap"); + } + } +#endif /* DIAGNOSTIC */ + vp = ip->i_devvp; + bp->b_dev = vp->v_rdev; + (*(vp->v_op->vop_strategy))(bp); + return (0); +} + +/* + * Print out the contents of an inode. + */ +ufs_print(vp) + struct vnode *vp; +{ + register struct inode *ip = VTOI(vp); + + printf("tag VT_UFS, ino %d, on dev %d, %d", ip->i_number, + major(ip->i_dev), minor(ip->i_dev)); +#ifdef FIFO + if (vp->v_type == VFIFO) + fifo_printinfo(vp); +#endif /* FIFO */ + printf("%s\n", (ip->i_flag & ILOCKED) ? " (LOCKED)" : ""); + if (ip->i_spare0 == 0) + return; + printf("\towner pid %d", ip->i_spare0); + if (ip->i_spare1) + printf(" waiting pid %d", ip->i_spare1); + printf("\n"); +} + +/* + * Read wrapper for special devices. 
 */
ufsspec_read(vp, uio, ioflag, cred)
	struct vnode *vp;
	struct uio *uio;
	int ioflag;
	struct ucred *cred;
{

	/*
	 * Set access flag.
	 */
	VTOI(vp)->i_flag |= IACC;
	return (spec_read(vp, uio, ioflag, cred));
}

/*
 * Write wrapper for special devices.
 */
ufsspec_write(vp, uio, ioflag, cred)
	struct vnode *vp;
	struct uio *uio;
	int ioflag;
	struct ucred *cred;
{

	/*
	 * Set update and change flags.
	 */
	VTOI(vp)->i_flag |= IUPD|ICHG;
	return (spec_write(vp, uio, ioflag, cred));
}

/*
 * Close wrapper for special devices.
 *
 * Update the times on the inode then do device close.
 * Times are only pushed while other references remain and the inode
 * is not locked by another user.
 */
ufsspec_close(vp, fflag, cred, p)
	struct vnode *vp;
	int fflag;
	struct ucred *cred;
	struct proc *p;
{
	register struct inode *ip = VTOI(vp);

	if (vp->v_usecount > 1 && !(ip->i_flag & ILOCKED))
		ITIMES(ip, &time, &time);
	return (spec_close(vp, fflag, cred, p));
}

#ifdef FIFO
/*
 * Read wrapper for fifo's
 */
ufsfifo_read(vp, uio, ioflag, cred)
	struct vnode *vp;
	struct uio *uio;
	int ioflag;
	struct ucred *cred;
{

	/*
	 * Set access flag.
	 */
	VTOI(vp)->i_flag |= IACC;
	return (fifo_read(vp, uio, ioflag, cred));
}

/*
 * Write wrapper for fifo's.
 */
ufsfifo_write(vp, uio, ioflag, cred)
	struct vnode *vp;
	struct uio *uio;
	int ioflag;
	struct ucred *cred;
{

	/*
	 * Set update and change flags.
	 */
	VTOI(vp)->i_flag |= IUPD|ICHG;
	return (fifo_write(vp, uio, ioflag, cred));
}

/*
 * Close wrapper for fifo's.
 *
 * Update the times on the inode then do device close.
 */
ufsfifo_close(vp, fflag, cred, p)
	struct vnode *vp;
	int fflag;
	struct ucred *cred;
	struct proc *p;
{
	register struct inode *ip = VTOI(vp);

	if (vp->v_usecount > 1 && !(ip->i_flag & ILOCKED))
		ITIMES(ip, &time, &time);
	return (fifo_close(vp, fflag, cred, p));
}
#endif /* FIFO */

/*
 * Allocate a new inode.
+ */ +maknode(mode, ndp, ipp) + int mode; + register struct nameidata *ndp; + struct inode **ipp; +{ + register struct inode *ip; + struct inode *tip; + register struct inode *pdir = VTOI(ndp->ni_dvp); + ino_t ipref; + int error; + +#ifdef DIAGNOSTIC + if ((ndp->ni_nameiop & HASBUF) == 0) + panic("maknode: no name"); +#endif + *ipp = 0; + if ((mode & IFMT) == 0) + mode |= IFREG; + if ((mode & IFMT) == IFDIR) + ipref = dirpref(pdir->i_fs); + else + ipref = pdir->i_number; + if (error = ialloc(pdir, ipref, mode, ndp->ni_cred, &tip)) { + free(ndp->ni_pnbuf, M_NAMEI); + iput(pdir); + return (error); + } + ip = tip; + ip->i_uid = ndp->ni_cred->cr_uid; + ip->i_gid = pdir->i_gid; +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, ndp->ni_cred, 0))) { + free(ndp->ni_pnbuf, M_NAMEI); + ifree(ip, ip->i_number, mode); + iput(ip); + iput(pdir); + return (error); + } +#endif + ip->i_flag |= IACC|IUPD|ICHG; + ip->i_mode = mode; + ITOV(ip)->v_type = IFTOVT(mode); /* Rest init'd in iget() */ + ip->i_nlink = 1; + if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, ndp->ni_cred) && + suser(ndp->ni_cred, NULL)) + ip->i_mode &= ~ISGID; + + /* + * Make sure inode goes to disk before directory entry. + */ + if (error = iupdat(ip, &time, &time, 1)) + goto bad; + if (error = direnter(ip, ndp)) + goto bad; + if ((ndp->ni_nameiop & SAVESTART) == 0) + FREE(ndp->ni_pnbuf, M_NAMEI); + iput(pdir); + *ipp = ip; + return (0); + +bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. 
+ */ + free(ndp->ni_pnbuf, M_NAMEI); + iput(pdir); + ip->i_nlink = 0; + ip->i_flag |= ICHG; + iput(ip); + return (error); +} + +/* + * Advisory record locking support + */ +ufs_advlock(vp, id, op, fl, flags) + struct vnode *vp; + caddr_t id; + int op; + register struct flock *fl; + int flags; +{ + register struct inode *ip = VTOI(vp); + + return (lf_advlock(&(ip->i_lockf), ip->i_size, id, op, fl, flags)); +} + +/* + * Global vfs data structures for ufs + */ +struct vnodeops ufs_vnodeops = { + ufs_lookup, /* lookup */ + ufs_create, /* create */ + ufs_mknod, /* mknod */ + ufs_open, /* open */ + ufs_close, /* close */ + ufs_access, /* access */ + ufs_getattr, /* getattr */ + ufs_setattr, /* setattr */ + ufs_read, /* read */ + ufs_write, /* write */ + ufs_ioctl, /* ioctl */ + ufs_select, /* select */ + ufs_mmap, /* mmap */ + ufs_fsync, /* fsync */ + ufs_seek, /* seek */ + ufs_remove, /* remove */ + ufs_link, /* link */ + ufs_rename, /* rename */ + ufs_mkdir, /* mkdir */ + ufs_rmdir, /* rmdir */ + ufs_symlink, /* symlink */ + ufs_readdir, /* readdir */ + ufs_readlink, /* readlink */ + ufs_abortop, /* abortop */ + ufs_inactive, /* inactive */ + ufs_reclaim, /* reclaim */ + ufs_lock, /* lock */ + ufs_unlock, /* unlock */ + ufs_bmap, /* bmap */ + ufs_strategy, /* strategy */ + ufs_print, /* print */ + ufs_islocked, /* islocked */ + ufs_advlock, /* advlock */ +}; + +struct vnodeops spec_inodeops = { + spec_lookup, /* lookup */ + spec_create, /* create */ + spec_mknod, /* mknod */ + spec_open, /* open */ + ufsspec_close, /* close */ + ufs_access, /* access */ + ufs_getattr, /* getattr */ + ufs_setattr, /* setattr */ + ufsspec_read, /* read */ + ufsspec_write, /* write */ + spec_ioctl, /* ioctl */ + spec_select, /* select */ + spec_mmap, /* mmap */ + spec_fsync, /* fsync */ + spec_seek, /* seek */ + spec_remove, /* remove */ + spec_link, /* link */ + spec_rename, /* rename */ + spec_mkdir, /* mkdir */ + spec_rmdir, /* rmdir */ + spec_symlink, /* symlink */ + spec_readdir, /* 
readdir */ + spec_readlink, /* readlink */ + spec_abortop, /* abortop */ + ufs_inactive, /* inactive */ + ufs_reclaim, /* reclaim */ + ufs_lock, /* lock */ + ufs_unlock, /* unlock */ + spec_bmap, /* bmap */ + spec_strategy, /* strategy */ + ufs_print, /* print */ + ufs_islocked, /* islocked */ + spec_advlock, /* advlock */ +}; + +#ifdef FIFO +struct vnodeops fifo_inodeops = { + fifo_lookup, /* lookup */ + fifo_create, /* create */ + fifo_mknod, /* mknod */ + fifo_open, /* open */ + ufsfifo_close, /* close */ + ufs_access, /* access */ + ufs_getattr, /* getattr */ + ufs_setattr, /* setattr */ + ufsfifo_read, /* read */ + ufsfifo_write, /* write */ + fifo_ioctl, /* ioctl */ + fifo_select, /* select */ + fifo_mmap, /* mmap */ + fifo_fsync, /* fsync */ + fifo_seek, /* seek */ + fifo_remove, /* remove */ + fifo_link, /* link */ + fifo_rename, /* rename */ + fifo_mkdir, /* mkdir */ + fifo_rmdir, /* rmdir */ + fifo_symlink, /* symlink */ + fifo_readdir, /* readdir */ + fifo_readlink, /* readlink */ + fifo_abortop, /* abortop */ + ufs_inactive, /* inactive */ + ufs_reclaim, /* reclaim */ + ufs_lock, /* lock */ + ufs_unlock, /* unlock */ + fifo_bmap, /* bmap */ + fifo_strategy, /* strategy */ + ufs_print, /* print */ + ufs_islocked, /* islocked */ + fifo_advlock, /* advlock */ +}; +#endif /* FIFO */ + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, IFREG, IFDIR, IFBLK, IFCHR, IFLNK, IFSOCK, IFIFO, IFMT, +}; diff --git a/sys/ufs/ufsmount.h b/sys/ufs/ufsmount.h new file mode 100644 index 000000000000..6eca1bd4e493 --- /dev/null +++ b/sys/ufs/ufsmount.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 1982, 1986, 1989 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufsmount.h 7.9 (Berkeley) 5/8/91 + * $Id: ufsmount.h,v 1.2 1993/10/16 18:18:05 rgrimes Exp $ + */ + +/* + * This structure describes the UFS specific mount structure data. 
 */
struct ufsmount {
	struct mount *um_mountp;		/* vfs structure for this filesystem */
	dev_t um_dev;				/* device mounted */
	struct vnode *um_devvp;			/* vnode for block device mounted */
	struct fs *um_fs;			/* pointer to superblock */
	/* The following arrays are indexed by quota type (MAXQUOTAS slots). */
	struct vnode *um_quotas[MAXQUOTAS];	/* pointer to quota files */
	struct ucred *um_cred[MAXQUOTAS];	/* cred for access to quota file */
	time_t um_btime[MAXQUOTAS];		/* block quota time limit */
	time_t um_itime[MAXQUOTAS];		/* inode quota time limit */
	char um_qflags[MAXQUOTAS];		/* quota specific flags, see below */
};
/*
 * Flags describing the state of quotas.
 */
#define	QTF_OPENING	0x01		/* Q_QUOTAON in progress */
#define	QTF_CLOSING	0x02		/* Q_QUOTAOFF in progress */

#ifdef KERNEL
/*
 * Convert mount ptr to ufsmount ptr.
 */
#define VFSTOUFS(mp)	((struct ufsmount *)((mp)->mnt_data))
#endif /* KERNEL */

/*
 * Prototypes for UFS mount operations
 */
int ufs_mount __P((struct mount *mp, char *path, caddr_t data,
	struct nameidata *ndp, struct proc *p));
int ufs_start __P((struct mount *mp, int flags, struct proc *p));
int ufs_unmount __P((struct mount *mp, int mntflags, struct proc *p));
int ufs_root __P((struct mount *mp, struct vnode **vpp));
int ufs_quotactl __P((struct mount *mp, int cmds, int uid, /* should be uid_t */
	caddr_t arg, struct proc *p));
int ufs_statfs __P((struct mount *mp, struct statfs *sbp, struct proc *p));
int ufs_sync __P((struct mount *mp, int waitfor));
int ufs_fhtovp __P((struct mount *mp, struct fid *fhp, struct vnode **vpp));
int ufs_vptofh __P((struct vnode *vp, struct fid *fhp));
/* XXX empty list inside __P — presumably should be __P((void)); confirm. */
int ufs_init __P(());
