aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/libufs/Makefile7
-rw-r--r--lib/libufs/cgroup.c140
-rw-r--r--lib/libufs/inode.c16
-rw-r--r--lib/libufs/libufs.h16
-rw-r--r--lib/libufs/sblock.c50
-rw-r--r--lib/libufs/type.c5
-rw-r--r--sbin/dumpfs/dumpfs.c4
-rw-r--r--sbin/fsck_ffs/Makefile3
-rw-r--r--sbin/fsck_ffs/fsck.h5
-rw-r--r--sbin/fsck_ffs/gjournal.c293
-rw-r--r--sbin/fsck_ffs/main.c27
-rw-r--r--sbin/fsck_ffs/pass5.c1
-rw-r--r--sbin/fsck_ffs/suj.c4699
-rw-r--r--sbin/fsdb/fsdb.c11
-rw-r--r--sbin/fsdb/fsdbutil.c26
-rw-r--r--sbin/tunefs/tunefs.89
-rw-r--r--sbin/tunefs/tunefs.c567
-rw-r--r--sys/kern/vfs_bio.c28
-rw-r--r--sys/kern/vfs_subr.c1
-rw-r--r--sys/sys/buf.h3
-rw-r--r--sys/sys/mount.h4
-rw-r--r--sys/ufs/ffs/ffs_alloc.c252
-rw-r--r--sys/ufs/ffs/ffs_balloc.c13
-rw-r--r--sys/ufs/ffs/ffs_extern.h24
-rw-r--r--sys/ufs/ffs/ffs_inode.c132
-rw-r--r--sys/ufs/ffs/ffs_snapshot.c66
-rw-r--r--sys/ufs/ffs/ffs_softdep.c7286
-rw-r--r--sys/ufs/ffs/ffs_subr.c130
-rw-r--r--sys/ufs/ffs/ffs_vfsops.c68
-rw-r--r--sys/ufs/ffs/ffs_vnops.c1
-rw-r--r--sys/ufs/ffs/fs.h135
-rw-r--r--sys/ufs/ffs/softdep.h446
-rw-r--r--sys/ufs/ufs/dinode.h9
-rw-r--r--sys/ufs/ufs/inode.h3
-rw-r--r--sys/ufs/ufs/ufs_dirhash.c2
-rw-r--r--sys/ufs/ufs/ufs_extern.h24
-rw-r--r--sys/ufs/ufs/ufs_lookup.c184
-rw-r--r--sys/ufs/ufs/ufs_vnops.c583
-rw-r--r--sys/ufs/ufs/ufsmount.h9
-rw-r--r--usr.sbin/makefs/ffs/ffs_bswap.c2
40 files changed, 13155 insertions, 2129 deletions
diff --git a/lib/libufs/Makefile b/lib/libufs/Makefile
index c9232ef54ad8..1dfc2429a1f2 100644
--- a/lib/libufs/Makefile
+++ b/lib/libufs/Makefile
@@ -3,7 +3,7 @@
LIB= ufs
SHLIBDIR?= /lib
-SRCS= block.c cgroup.c inode.c sblock.c type.c
+SRCS= block.c cgroup.c inode.c sblock.c type.c ffs_subr.c ffs_tables.c
INCS= libufs.h
MAN= bread.3 cgread.3 libufs.3 sbread.3 ufs_disk_close.3
@@ -16,8 +16,11 @@ MLINKS+= ufs_disk_close.3 ufs_disk_fillout.3
MLINKS+= ufs_disk_close.3 ufs_disk_fillout_blank.3
MLINKS+= ufs_disk_close.3 ufs_disk_write.3
-WARNS?= 3
+.PATH: ${.CURDIR}/../../sys/ufs/ffs
+WARNS?= 2
+
+DEBUG_FLAGS = -g
CFLAGS+= -D_LIBUFS
.if defined(LIBUFS_DEBUG)
CFLAGS+= -D_LIBUFS_DEBUGGING
diff --git a/lib/libufs/cgroup.c b/lib/libufs/cgroup.c
index 28e5ea87a430..b0cb4a7d63db 100644
--- a/lib/libufs/cgroup.c
+++ b/lib/libufs/cgroup.c
@@ -40,11 +40,143 @@ __FBSDID("$FreeBSD$");
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libufs.h>
+ufs2_daddr_t
+cgballoc(struct uufsd *disk)
+{
+ u_int8_t *blksfree;
+ struct cg *cgp;
+ struct fs *fs;
+ long bno;
+
+ fs = &disk->d_fs;
+ cgp = &disk->d_cg;
+ blksfree = cg_blksfree(cgp);
+ for (bno = 0; bno < fs->fs_fpg / fs->fs_frag; bno++)
+ if (ffs_isblock(fs, blksfree, bno))
+ goto gotit;
+ return (0);
+gotit:
+ fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
+ ffs_clrblock(fs, blksfree, (long)bno);
+ ffs_clusteracct(fs, cgp, bno, -1);
+ cgp->cg_cs.cs_nbfree--;
+ fs->fs_cstotal.cs_nbfree--;
+ fs->fs_fmod = 1;
+ return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, bno));
+}
+
+int
+cgbfree(struct uufsd *disk, ufs2_daddr_t bno, long size)
+{
+ u_int8_t *blksfree;
+ struct fs *fs;
+ struct cg *cgp;
+ ufs1_daddr_t fragno, cgbno;
+ int i, cg, blk, frags, bbase;
+
+ fs = &disk->d_fs;
+ cg = dtog(fs, bno);
+ if (cgread1(disk, cg) != 1)
+ return (-1);
+ cgp = &disk->d_cg;
+ cgbno = dtogd(fs, bno);
+ blksfree = cg_blksfree(cgp);
+ if (size == fs->fs_bsize) {
+ fragno = fragstoblks(fs, cgbno);
+ ffs_setblock(fs, blksfree, fragno);
+ ffs_clusteracct(fs, cgp, fragno, 1);
+ cgp->cg_cs.cs_nbfree++;
+ fs->fs_cstotal.cs_nbfree++;
+ fs->fs_cs(fs, cg).cs_nbfree++;
+ } else {
+ bbase = cgbno - fragnum(fs, cgbno);
+ /*
+ * decrement the counts associated with the old frags
+ */
+ blk = blkmap(fs, blksfree, bbase);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
+ /*
+ * deallocate the fragment
+ */
+ frags = numfrags(fs, size);
+ for (i = 0; i < frags; i++)
+ setbit(blksfree, cgbno + i);
+ cgp->cg_cs.cs_nffree += i;
+ fs->fs_cstotal.cs_nffree += i;
+ fs->fs_cs(fs, cg).cs_nffree += i;
+ /*
+ * add back in counts associated with the new frags
+ */
+ blk = blkmap(fs, blksfree, bbase);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
+ /*
+ * if a complete block has been reassembled, account for it
+ */
+ fragno = fragstoblks(fs, bbase);
+ if (ffs_isblock(fs, blksfree, fragno)) {
+ cgp->cg_cs.cs_nffree -= fs->fs_frag;
+ fs->fs_cstotal.cs_nffree -= fs->fs_frag;
+ fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
+ ffs_clusteracct(fs, cgp, fragno, 1);
+ cgp->cg_cs.cs_nbfree++;
+ fs->fs_cstotal.cs_nbfree++;
+ fs->fs_cs(fs, cg).cs_nbfree++;
+ }
+ }
+ return cgwrite(disk);
+}
+
+ino_t
+cgialloc(struct uufsd *disk)
+{
+ struct ufs2_dinode *dp2;
+ u_int8_t *inosused;
+ struct cg *cgp;
+ struct fs *fs;
+ ino_t ino;
+ int i;
+
+ fs = &disk->d_fs;
+ cgp = &disk->d_cg;
+ inosused = cg_inosused(cgp);
+ for (ino = 0; ino < fs->fs_ipg / NBBY; ino++)
+ if (isclr(inosused, ino))
+ goto gotit;
+ return (0);
+gotit:
+ if (fs->fs_magic == FS_UFS2_MAGIC &&
+ ino + INOPB(fs) > cgp->cg_initediblk &&
+ cgp->cg_initediblk < cgp->cg_niblk) {
+ char block[MAXBSIZE];
+ bzero(block, (int)fs->fs_bsize);
+ dp2 = (struct ufs2_dinode *)&block;
+ for (i = 0; i < INOPB(fs); i++) {
+ dp2->di_gen = arc4random() / 2 + 1;
+ dp2++;
+ }
+ if (bwrite(disk, ino_to_fsba(fs,
+ cgp->cg_cgx * fs->fs_ipg + cgp->cg_initediblk),
+ block, fs->fs_bsize))
+ return (0);
+ cgp->cg_initediblk += INOPB(fs);
+ }
+
+ setbit(inosused, ino);
+ cgp->cg_irotor = ino;
+ cgp->cg_cs.cs_nifree--;
+ fs->fs_cstotal.cs_nifree--;
+ fs->fs_cs(fs, cgp->cg_cgx).cs_nifree--;
+ fs->fs_fmod = 1;
+
+ return (ino + (cgp->cg_cgx * fs->fs_ipg));
+}
+
int
cgread(struct uufsd *disk)
{
@@ -55,14 +187,12 @@ int
cgread1(struct uufsd *disk, int c)
{
struct fs *fs;
- off_t ccg;
fs = &disk->d_fs;
if ((unsigned)c >= fs->fs_ncg) {
return (0);
}
- ccg = fsbtodb(fs, cgtod(fs, c)) * disk->d_bsize;
if (bread(disk, fsbtodb(fs, cgtod(fs, c)), disk->d_cgunion.d_buf,
fs->fs_bsize) == -1) {
ERROR(disk, "unable to read cylinder group");
@@ -73,6 +203,12 @@ cgread1(struct uufsd *disk, int c)
}
int
+cgwrite(struct uufsd *disk)
+{
+ return (cgwrite1(disk, disk->d_lcg));
+}
+
+int
cgwrite1(struct uufsd *disk, int c)
{
struct fs *fs;
diff --git a/lib/libufs/inode.c b/lib/libufs/inode.c
index d8bef61f8917..6d94582b3e54 100644
--- a/lib/libufs/inode.c
+++ b/lib/libufs/inode.c
@@ -93,3 +93,19 @@ gotit: switch (disk->d_ufs) {
ERROR(disk, "unknown UFS filesystem type");
return (-1);
}
+
+int
+putino(struct uufsd *disk)
+{
+ struct fs *fs;
+
+ fs = &disk->d_fs;
+ if (disk->d_inoblock == NULL) {
+ ERROR(disk, "No inode block allocated");
+ return (-1);
+ }
+ if (bwrite(disk, fsbtodb(fs, ino_to_fsba(&disk->d_fs, disk->d_inomin)),
+ disk->d_inoblock, disk->d_fs.fs_bsize) <= 0)
+ return (-1);
+ return (0);
+}
diff --git a/lib/libufs/libufs.h b/lib/libufs/libufs.h
index 42a64f76e2b0..c3541a0c8e6b 100644
--- a/lib/libufs/libufs.h
+++ b/lib/libufs/libufs.h
@@ -71,6 +71,7 @@ struct uufsd {
int d_fd; /* raw device file descriptor */
long d_bsize; /* device bsize */
ufs2_daddr_t d_sblock; /* superblock location */
+ struct csum *d_sbcsum; /* Superblock summary info */
caddr_t d_inoblock; /* inode block */
ino_t d_inomin; /* low inode */
ino_t d_inomax; /* high inode */
@@ -109,14 +110,19 @@ int berase(struct uufsd *, ufs2_daddr_t, ufs2_daddr_t);
/*
* cgroup.c
*/
+ufs2_daddr_t cgballoc(struct uufsd *);
+int cgbfree(struct uufsd *, ufs2_daddr_t, long);
+ino_t cgialloc(struct uufsd *);
int cgread(struct uufsd *);
int cgread1(struct uufsd *, int);
+int cgwrite(struct uufsd *);
int cgwrite1(struct uufsd *, int);
/*
* inode.c
*/
int getino(struct uufsd *, void **, ino_t, int *);
+int putino(struct uufsd *);
/*
* sblock.c
@@ -132,6 +138,16 @@ int ufs_disk_fillout(struct uufsd *, const char *);
int ufs_disk_fillout_blank(struct uufsd *, const char *);
int ufs_disk_write(struct uufsd *);
+/*
+ * ffs_subr.c
+ */
+void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
+void ffs_fragacct(struct fs *, int, int32_t [], int);
+int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
+int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
+void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
+
__END_DECLS
#endif /* __LIBUFS_H__ */
diff --git a/lib/libufs/sblock.c b/lib/libufs/sblock.c
index 8986290039fb..d6bec3ed182e 100644
--- a/lib/libufs/sblock.c
+++ b/lib/libufs/sblock.c
@@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <errno.h>
#include <stdio.h>
#include <string.h>
+#include <stdlib.h>
#include <unistd.h>
#include <libufs.h>
@@ -49,8 +50,11 @@ static int superblocks[] = SBLOCKSEARCH;
int
sbread(struct uufsd *disk)
{
+ uint8_t block[MAXBSIZE];
struct fs *fs;
int sb, superblock;
+ int i, size, blks;
+ uint8_t *space;
ERROR(disk, NULL);
@@ -86,6 +90,34 @@ sbread(struct uufsd *disk)
}
disk->d_bsize = fs->fs_fsize / fsbtodb(fs, 1);
disk->d_sblock = superblock / disk->d_bsize;
+ /*
+ * Read in the superblock summary information.
+ */
+ size = fs->fs_cssize;
+ blks = howmany(size, fs->fs_fsize);
+ size += fs->fs_ncg * sizeof(int32_t);
+ space = malloc(size);
+ if (space == NULL) {
+ ERROR(disk, "failed to allocate space for summary information");
+ return (-1);
+ }
+ fs->fs_csp = (struct csum *)space;
+ for (i = 0; i < blks; i += fs->fs_frag) {
+ size = fs->fs_bsize;
+ if (i + fs->fs_frag > blks)
+ size = (blks - i) * fs->fs_fsize;
+ if (bread(disk, fsbtodb(fs, fs->fs_csaddr + i), block, size)
+ == -1) {
+ ERROR(disk, "Failed to read sb summary information");
+ free(fs->fs_csp);
+ return (-1);
+ }
+ bcopy(block, space, size);
+ space += size;
+ }
+ fs->fs_maxcluster = (uint32_t *)space;
+ disk->d_sbcsum = fs->fs_csp;
+
return (0);
}
@@ -93,6 +125,8 @@ int
sbwrite(struct uufsd *disk, int all)
{
struct fs *fs;
+ int blks, size;
+ uint8_t *space;
unsigned i;
ERROR(disk, NULL);
@@ -107,6 +141,22 @@ sbwrite(struct uufsd *disk, int all)
ERROR(disk, "failed to write superblock");
return (-1);
}
+ /*
+ * Write superblock summary information.
+ */
+ blks = howmany(fs->fs_cssize, fs->fs_fsize);
+ space = (uint8_t *)disk->d_sbcsum;
+ for (i = 0; i < blks; i += fs->fs_frag) {
+ size = fs->fs_bsize;
+ if (i + fs->fs_frag > blks)
+ size = (blks - i) * fs->fs_fsize;
+ if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + i), space, size)
+ == -1) {
+ ERROR(disk, "Failed to write sb summary information");
+ return (-1);
+ }
+ space += size;
+ }
if (all) {
for (i = 0; i < fs->fs_ncg; i++)
if (bwrite(disk, fsbtodb(fs, cgsblock(fs, i)),
diff --git a/lib/libufs/type.c b/lib/libufs/type.c
index 8a553bedbc97..05904b9e40b3 100644
--- a/lib/libufs/type.c
+++ b/lib/libufs/type.c
@@ -66,6 +66,10 @@ ufs_disk_close(struct uufsd *disk)
free((char *)(uintptr_t)disk->d_name);
disk->d_name = NULL;
}
+ if (disk->d_sbcsum != NULL) {
+ free(disk->d_sbcsum);
+ disk->d_sbcsum = NULL;
+ }
return (0);
}
@@ -156,6 +160,7 @@ again: if ((ret = stat(name, &st)) < 0) {
disk->d_mine = 0;
disk->d_ufs = 0;
disk->d_error = NULL;
+ disk->d_sbcsum = NULL;
if (oname != name) {
name = strdup(name);
diff --git a/sbin/dumpfs/dumpfs.c b/sbin/dumpfs/dumpfs.c
index e4b599504012..38c05f605ad8 100644
--- a/sbin/dumpfs/dumpfs.c
+++ b/sbin/dumpfs/dumpfs.c
@@ -238,7 +238,7 @@ dumpfs(const char *name)
if (fsflags & FS_UNCLEAN)
printf("unclean ");
if (fsflags & FS_DOSOFTDEP)
- printf("soft-updates ");
+ printf("soft-updates%s ", (fsflags & FS_SUJ) ? "+journal" : "");
if (fsflags & FS_NEEDSFSCK)
printf("needs fsck run ");
if (fsflags & FS_INDEXDIRS)
@@ -255,7 +255,7 @@ dumpfs(const char *name)
printf("nfsv4acls ");
fsflags &= ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK | FS_INDEXDIRS |
FS_ACLS | FS_MULTILABEL | FS_GJOURNAL | FS_FLAGS_UPDATED |
- FS_NFS4ACLS);
+ FS_NFS4ACLS | FS_SUJ);
if (fsflags != 0)
printf("unknown flags (%#x)", fsflags);
putchar('\n');
diff --git a/sbin/fsck_ffs/Makefile b/sbin/fsck_ffs/Makefile
index aaae685df149..db2930bea627 100644
--- a/sbin/fsck_ffs/Makefile
+++ b/sbin/fsck_ffs/Makefile
@@ -7,8 +7,7 @@ LINKS+= ${BINDIR}/fsck_ffs ${BINDIR}/fsck_4.2bsd
MAN= fsck_ffs.8
MLINKS= fsck_ffs.8 fsck_ufs.8 fsck_ffs.8 fsck_4.2bsd.8
SRCS= dir.c ea.c fsutil.c inode.c main.c pass1.c pass1b.c pass2.c pass3.c \
- pass4.c pass5.c setup.c utilities.c ffs_subr.c ffs_tables.c gjournal.c \
- getmntopts.c
+ pass4.c pass5.c setup.c suj.c utilities.c gjournal.c getmntopts.c
DPADD= ${LIBUFS}
LDADD= -lufs
WARNS?= 2
diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h
index ad7fe138ed63..6dad6b7318b7 100644
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@@ -347,10 +347,6 @@ void direrror(ino_t ino, const char *errmesg);
int dirscan(struct inodesc *);
int dofix(struct inodesc *, const char *msg);
int eascan(struct inodesc *, struct ufs2_dinode *dp);
-void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
-void ffs_fragacct(struct fs *, int, int32_t [], int);
-int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
-void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
void fileerror(ino_t cwd, ino_t ino, const char *errmesg);
int findino(struct inodesc *);
int findname(struct inodesc *);
@@ -392,3 +388,4 @@ void sblock_init(void);
void setinodebuf(ino_t);
int setup(char *dev);
void gjournal_check(const char *filesys);
+int suj_check(const char *filesys);
diff --git a/sbin/fsck_ffs/gjournal.c b/sbin/fsck_ffs/gjournal.c
index bd887cab850d..10c32c00c07c 100644
--- a/sbin/fsck_ffs/gjournal.c
+++ b/sbin/fsck_ffs/gjournal.c
@@ -96,27 +96,6 @@ struct ufs2_dinode ufs2_zino;
static void putcgs(void);
/*
- * Write current block of inodes.
- */
-static int
-putino(struct uufsd *disk, ino_t inode)
-{
- caddr_t inoblock;
- struct fs *fs;
- ssize_t ret;
-
- fs = &disk->d_fs;
- inoblock = disk->d_inoblock;
-
- assert(inoblock != NULL);
- assert(inode >= disk->d_inomin && inode <= disk->d_inomax);
- ret = bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, inode)), inoblock,
- fs->fs_bsize);
-
- return (ret == -1 ? -1 : 0);
-}
-
-/*
* Return cylinder group from the cache or load it if it is not in the
* cache yet.
* Don't cache more than MAX_CACHED_CGS cylinder groups.
@@ -242,13 +221,11 @@ cancelcgs(void)
#endif
/*
- * Open the given provider, load statistics.
+ * Open the given provider, load superblock.
*/
static void
-getdisk(void)
+opendisk(void)
{
- int i;
-
if (disk != NULL)
return;
disk = malloc(sizeof(*disk));
@@ -259,24 +236,6 @@ getdisk(void)
disk->d_error);
}
fs = &disk->d_fs;
- fs->fs_csp = malloc((size_t)fs->fs_cssize);
- if (fs->fs_csp == NULL)
- err(1, "malloc(%zu)", (size_t)fs->fs_cssize);
- bzero(fs->fs_csp, (size_t)fs->fs_cssize);
- for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) {
- if (bread(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)),
- (void *)(((char *)fs->fs_csp) + i),
- (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) {
- err(1, "bread: %s", disk->d_error);
- }
- }
- if (fs->fs_contigsumsize > 0) {
- fs->fs_maxcluster = malloc(fs->fs_ncg * sizeof(int32_t));
- if (fs->fs_maxcluster == NULL)
- err(1, "malloc(%zu)", fs->fs_ncg * sizeof(int32_t));
- for (i = 0; i < fs->fs_ncg; i++)
- fs->fs_maxcluster[i] = fs->fs_contigsumsize;
- }
}
/*
@@ -286,11 +245,6 @@ static void
closedisk(void)
{
- free(fs->fs_csp);
- if (fs->fs_contigsumsize > 0) {
- free(fs->fs_maxcluster);
- fs->fs_maxcluster = NULL;
- }
fs->fs_clean = 1;
if (sbwrite(disk, 0) == -1)
err(1, "sbwrite(%s)", devnam);
@@ -301,227 +255,6 @@ closedisk(void)
fs = NULL;
}
-/*
- * Write the statistics back, call closedisk().
- */
-static void
-putdisk(void)
-{
- int i;
-
- assert(disk != NULL && fs != NULL);
- for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) {
- if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)),
- (void *)(((char *)fs->fs_csp) + i),
- (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) {
- err(1, "bwrite: %s", disk->d_error);
- }
- }
- closedisk();
-}
-
-#if 0
-/*
- * Free memory, close the disk, but don't write anything back.
- */
-static void
-canceldisk(void)
-{
- int i;
-
- assert(disk != NULL && fs != NULL);
- free(fs->fs_csp);
- if (fs->fs_contigsumsize > 0)
- free(fs->fs_maxcluster);
- if (ufs_disk_close(disk) == -1)
- err(1, "ufs_disk_close(%s)", devnam);
- free(disk);
- disk = NULL;
- fs = NULL;
-}
-#endif
-
-static int
-isblock(unsigned char *cp, ufs1_daddr_t h)
-{
- unsigned char mask;
-
- switch ((int)fs->fs_frag) {
- case 8:
- return (cp[h] == 0xff);
- case 4:
- mask = 0x0f << ((h & 0x1) << 2);
- return ((cp[h >> 1] & mask) == mask);
- case 2:
- mask = 0x03 << ((h & 0x3) << 1);
- return ((cp[h >> 2] & mask) == mask);
- case 1:
- mask = 0x01 << (h & 0x7);
- return ((cp[h >> 3] & mask) == mask);
- default:
- assert(!"isblock: invalid number of fragments");
- }
- return (0);
-}
-
-/*
- * put a block into the map
- */
-static void
-setblock(unsigned char *cp, ufs1_daddr_t h)
-{
-
- switch ((int)fs->fs_frag) {
- case 8:
- cp[h] = 0xff;
- return;
- case 4:
- cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
- return;
- case 2:
- cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
- return;
- case 1:
- cp[h >> 3] |= (0x01 << (h & 0x7));
- return;
- default:
- assert(!"setblock: invalid number of fragments");
- }
-}
-
-/*
- * check if a block is free
- */
-static int
-isfreeblock(u_char *cp, ufs1_daddr_t h)
-{
-
- switch ((int)fs->fs_frag) {
- case 8:
- return (cp[h] == 0);
- case 4:
- return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
- case 2:
- return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
- case 1:
- return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
- default:
- assert(!"isfreeblock: invalid number of fragments");
- }
- return (0);
-}
-
-/*
- * Update the frsum fields to reflect addition or deletion
- * of some frags.
- */
-void
-fragacct(int fragmap, int32_t fraglist[], int cnt)
-{
- int inblk;
- int field, subfield;
- int siz, pos;
-
- inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
- fragmap <<= 1;
- for (siz = 1; siz < fs->fs_frag; siz++) {
- if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
- continue;
- field = around[siz];
- subfield = inside[siz];
- for (pos = siz; pos <= fs->fs_frag; pos++) {
- if ((fragmap & field) == subfield) {
- fraglist[siz] += cnt;
- pos += siz;
- field <<= siz;
- subfield <<= siz;
- }
- field <<= 1;
- subfield <<= 1;
- }
- }
-}
-
-static void
-clusteracct(struct cg *cgp, ufs1_daddr_t blkno)
-{
- int32_t *sump;
- int32_t *lp;
- u_char *freemapp, *mapp;
- int i, start, end, forw, back, map, bit;
-
- if (fs->fs_contigsumsize <= 0)
- return;
- freemapp = cg_clustersfree(cgp);
- sump = cg_clustersum(cgp);
- /*
- * Clear the actual block.
- */
- setbit(freemapp, blkno);
- /*
- * Find the size of the cluster going forward.
- */
- start = blkno + 1;
- end = start + fs->fs_contigsumsize;
- if (end >= cgp->cg_nclusterblks)
- end = cgp->cg_nclusterblks;
- mapp = &freemapp[start / NBBY];
- map = *mapp++;
- bit = 1 << (start % NBBY);
- for (i = start; i < end; i++) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != (NBBY - 1)) {
- bit <<= 1;
- } else {
- map = *mapp++;
- bit = 1;
- }
- }
- forw = i - start;
- /*
- * Find the size of the cluster going backward.
- */
- start = blkno - 1;
- end = start - fs->fs_contigsumsize;
- if (end < 0)
- end = -1;
- mapp = &freemapp[start / NBBY];
- map = *mapp--;
- bit = 1 << (start % NBBY);
- for (i = start; i > end; i--) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != 0) {
- bit >>= 1;
- } else {
- map = *mapp--;
- bit = 1 << (NBBY - 1);
- }
- }
- back = start - i;
- /*
- * Account for old cluster and the possibly new forward and
- * back clusters.
- */
- i = back + forw + 1;
- if (i > fs->fs_contigsumsize)
- i = fs->fs_contigsumsize;
- sump[i]++;
- if (back > 0)
- sump[back]--;
- if (forw > 0)
- sump[forw]--;
- /*
- * Update cluster summary information.
- */
- lp = &sump[fs->fs_contigsumsize];
- for (i = fs->fs_contigsumsize; i > 0; i--)
- if (*lp-- > 0)
- break;
- fs->fs_maxcluster[cgp->cg_cgx] = i;
-}
-
static void
blkfree(ufs2_daddr_t bno, long size)
{
@@ -539,10 +272,10 @@ blkfree(ufs2_daddr_t bno, long size)
blksfree = cg_blksfree(cgp);
if (size == fs->fs_bsize) {
fragno = fragstoblks(fs, cgbno);
- if (!isfreeblock(blksfree, fragno))
+ if (!ffs_isfreeblock(fs, blksfree, fragno))
assert(!"blkfree: freeing free block");
- setblock(blksfree, fragno);
- clusteracct(cgp, fragno);
+ ffs_setblock(fs, blksfree, fragno);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -552,7 +285,7 @@ blkfree(ufs2_daddr_t bno, long size)
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
- fragacct(blk, cgp->cg_frsum, -1);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
/*
* deallocate the fragment
*/
@@ -569,16 +302,16 @@ blkfree(ufs2_daddr_t bno, long size)
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
- fragacct(blk, cgp->cg_frsum, 1);
+ ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
/*
* if a complete block has been reassembled, account for it
*/
fragno = fragstoblks(fs, bbase);
- if (isblock(blksfree, fragno)) {
+ if (ffs_isblock(fs, blksfree, fragno)) {
cgp->cg_cs.cs_nffree -= fs->fs_frag;
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
- clusteracct(cgp, fragno);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -599,7 +332,7 @@ freeindir(ufs2_daddr_t blk, int level)
if (bread(disk, fsbtodb(fs, blk), (void *)&sblks, (size_t)fs->fs_bsize) == -1)
err(1, "bread: %s", disk->d_error);
blks = (ufs2_daddr_t *)&sblks;
- for (i = 0; i < howmany(fs->fs_bsize, sizeof(ufs2_daddr_t)); i++) {
+ for (i = 0; i < NINDIR(fs); i++) {
if (blks[i] == 0)
break;
if (level == 0)
@@ -671,7 +404,7 @@ gjournal_check(const char *filesys)
int cg, mode;
devnam = filesys;
- getdisk();
+ opendisk();
/* Are there any unreferenced inodes in this file system? */
if (fs->fs_unrefs == 0) {
//printf("No unreferenced inodes.\n");
@@ -747,7 +480,7 @@ gjournal_check(const char *filesys)
/* Zero-fill the inode. */
*dino = ufs2_zino;
/* Write the inode back. */
- if (putino(disk, ino) == -1)
+ if (putino(disk) == -1)
err(1, "putino(cg=%d ino=%d)", cg, ino);
if (cgp->cg_unrefs == 0) {
//printf("No more unreferenced inodes in cg=%d.\n", cg);
@@ -772,5 +505,5 @@ gjournal_check(const char *filesys)
/* Write back modified cylinder groups. */
putcgs();
/* Write back updated statistics and super-block. */
- putdisk();
+ closedisk();
}
diff --git a/sbin/fsck_ffs/main.c b/sbin/fsck_ffs/main.c
index 66edd63e21f1..e9a970431604 100644
--- a/sbin/fsck_ffs/main.c
+++ b/sbin/fsck_ffs/main.c
@@ -242,8 +242,9 @@ checkfilesys(char *filesys)
if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0)
exit(3); /* Cannot read superblock */
close(fsreadfd);
- if (sblock.fs_flags & FS_NEEDSFSCK)
- exit(4); /* Earlier background failed */
+ /* Earlier background failed or journaled */
+ if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ))
+ exit(4);
if ((sblock.fs_flags & FS_DOSOFTDEP) == 0)
exit(5); /* Not running soft updates */
size = MIBSIZE;
@@ -299,7 +300,7 @@ checkfilesys(char *filesys)
pfatal("MOUNTED READ-ONLY, CANNOT RUN IN BACKGROUND\n");
} else if ((fsreadfd = open(filesys, O_RDONLY)) >= 0) {
if (readsb(0) != 0) {
- if (sblock.fs_flags & FS_NEEDSFSCK) {
+ if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) {
bkgrdflag = 0;
pfatal("UNEXPECTED INCONSISTENCY, %s\n",
"CANNOT RUN IN BACKGROUND\n");
@@ -384,6 +385,26 @@ checkfilesys(char *filesys)
sblock.fs_cstotal.cs_nffree * 100.0 / sblock.fs_dsize);
return (0);
}
+ /*
+ * Determine if we can and should do journal recovery.
+ */
+ if ((sblock.fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == FS_SUJ) {
+ if (preen || reply("USE JOURNAL?")) {
+ if (suj_check(filesys) == 0) {
+ if (chkdoreload(mntp) == 0)
+ exit(0);
+ exit(4);
+ }
+ /* suj_check failed, fall through. */
+ }
+ printf("** Skipping journal, falling through to full fsck\n");
+ /*
+ * Write the superblock so we don't try to recover the
+ * journal on another pass.
+ */
+ sblock.fs_mtime = time(NULL);
+ sbdirty();
+ }
/*
* Cleared if any questions answered no. Used to decide if
diff --git a/sbin/fsck_ffs/pass5.c b/sbin/fsck_ffs/pass5.c
index 173156efaf51..639ce0f8abdc 100644
--- a/sbin/fsck_ffs/pass5.c
+++ b/sbin/fsck_ffs/pass5.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <inttypes.h>
#include <limits.h>
#include <string.h>
+#include <libufs.h>
#include "fsck.h"
diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c
new file mode 100644
index 000000000000..84608f164e42
--- /dev/null
+++ b/sbin/fsck_ffs/suj.c
@@ -0,0 +1,4699 @@
+/*-
+ * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ffs/fs.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <libufs.h>
+#include <string.h>
+#include <strings.h>
+#include <err.h>
+#include <assert.h>
+
+#include "fsck.h"
+
+#define DOTDOT_OFFSET DIRECTSIZ(1)
+#define SUJ_HASHSIZE 2048
+#define SUJ_HASHMASK (SUJ_HASHSIZE - 1)
+#define SUJ_HASH(x) ((x * 2654435761) & SUJ_HASHMASK)
+
+struct suj_seg {
+ TAILQ_ENTRY(suj_seg) ss_next;
+ struct jsegrec ss_rec;
+ uint8_t *ss_blk;
+};
+
+struct suj_rec {
+ TAILQ_ENTRY(suj_rec) sr_next;
+ union jrec *sr_rec;
+};
+TAILQ_HEAD(srechd, suj_rec);
+
+struct suj_ino {
+ LIST_ENTRY(suj_ino) si_next;
+ struct srechd si_recs;
+ struct srechd si_newrecs;
+ struct srechd si_movs;
+ struct jtrncrec *si_trunc;
+ ino_t si_ino;
+ char si_skipparent;
+ char si_hasrecs;
+ char si_blkadj;
+ char si_linkadj;
+ int si_mode;
+ nlink_t si_nlinkadj;
+ nlink_t si_nlink;
+ nlink_t si_dotlinks;
+};
+LIST_HEAD(inohd, suj_ino);
+
+struct suj_blk {
+ LIST_ENTRY(suj_blk) sb_next;
+ struct srechd sb_recs;
+ ufs2_daddr_t sb_blk;
+};
+LIST_HEAD(blkhd, suj_blk);
+
+struct data_blk {
+ LIST_ENTRY(data_blk) db_next;
+ uint8_t *db_buf;
+ ufs2_daddr_t db_blk;
+ int db_size;
+ int db_dirty;
+};
+
+struct ino_blk {
+ LIST_ENTRY(ino_blk) ib_next;
+ uint8_t *ib_buf;
+ int ib_dirty;
+ ufs2_daddr_t ib_blk;
+};
+LIST_HEAD(iblkhd, ino_blk);
+
+struct suj_cg {
+ LIST_ENTRY(suj_cg) sc_next;
+ struct blkhd sc_blkhash[SUJ_HASHSIZE];
+ struct inohd sc_inohash[SUJ_HASHSIZE];
+ struct iblkhd sc_iblkhash[SUJ_HASHSIZE];
+ struct ino_blk *sc_lastiblk;
+ struct suj_ino *sc_lastino;
+ struct suj_blk *sc_lastblk;
+ uint8_t *sc_cgbuf;
+ struct cg *sc_cgp;
+ int sc_dirty;
+ int sc_cgx;
+};
+
+LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE];
+LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE];
+struct suj_cg *lastcg;
+struct data_blk *lastblk;
+
+TAILQ_HEAD(seghd, suj_seg) allsegs;
+uint64_t oldseq;
+static struct uufsd *disk = NULL;
+static struct fs *fs = NULL;
+ino_t sujino;
+
+/*
+ * Summary statistics.
+ */
+uint64_t freefrags;
+uint64_t freeblocks;
+uint64_t freeinos;
+uint64_t freedir;
+uint64_t jbytes;
+uint64_t jrecs;
+
+typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int);
+static void ino_trunc(ino_t, off_t);
+static void ino_decr(ino_t);
+static void ino_adjust(struct suj_ino *);
+static void ino_build(struct suj_ino *);
+static int blk_isfree(ufs2_daddr_t);
+
+static void *
+errmalloc(size_t n)
+{
+ void *a;
+
+ a = malloc(n);
+ if (a == NULL)
+ errx(1, "malloc(%zu)", n);
+ return (a);
+}
+
+/*
+ * Open the given provider, load superblock.
+ */
+static void
+opendisk(const char *devnam)
+{
+ if (disk != NULL)
+ return;
+ disk = malloc(sizeof(*disk));
+ if (disk == NULL)
+ errx(1, "malloc(%zu)", sizeof(*disk));
+ if (ufs_disk_fillout(disk, devnam) == -1) {
+ err(1, "ufs_disk_fillout(%s) failed: %s", devnam,
+ disk->d_error);
+ }
+ fs = &disk->d_fs;
+}
+
+/*
+ * Mark file system as clean, write the super-block back, close the disk.
+ */
+static void
+closedisk(const char *devnam)
+{
+ struct csum *cgsum;
+ int i;
+
+ /*
+ * Recompute the fs summary info from correct cs summaries.
+ */
+ bzero(&fs->fs_cstotal, sizeof(struct csum_total));
+ for (i = 0; i < fs->fs_ncg; i++) {
+ cgsum = &fs->fs_cs(fs, i);
+ fs->fs_cstotal.cs_nffree += cgsum->cs_nffree;
+ fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree;
+ fs->fs_cstotal.cs_nifree += cgsum->cs_nifree;
+ fs->fs_cstotal.cs_ndir += cgsum->cs_ndir;
+ }
+ fs->fs_pendinginodes = 0;
+ fs->fs_pendingblocks = 0;
+ fs->fs_clean = 1;
+ fs->fs_time = time(NULL);
+ fs->fs_mtime = time(NULL);
+ if (sbwrite(disk, 0) == -1)
+ err(1, "sbwrite(%s)", devnam);
+ if (ufs_disk_close(disk) == -1)
+ err(1, "ufs_disk_close(%s)", devnam);
+ free(disk);
+ disk = NULL;
+ fs = NULL;
+}
+
+/*
+ * Lookup a cg by number in the hash so we can keep track of which cgs
+ * need stats rebuilt.
+ */
+static struct suj_cg *
+cg_lookup(int cgx)
+{
+ struct cghd *hd;
+ struct suj_cg *sc;
+
+ if (cgx < 0 || cgx >= fs->fs_ncg) {
+ abort();
+ errx(1, "Bad cg number %d", cgx);
+ }
+ if (lastcg && lastcg->sc_cgx == cgx)
+ return (lastcg);
+ hd = &cghash[SUJ_HASH(cgx)];
+ LIST_FOREACH(sc, hd, sc_next)
+ if (sc->sc_cgx == cgx) {
+ lastcg = sc;
+ return (sc);
+ }
+ sc = errmalloc(sizeof(*sc));
+ bzero(sc, sizeof(*sc));
+ sc->sc_cgbuf = errmalloc(fs->fs_bsize);
+ sc->sc_cgp = (struct cg *)sc->sc_cgbuf;
+ sc->sc_cgx = cgx;
+ LIST_INSERT_HEAD(hd, sc, sc_next);
+ if (bread(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf,
+ fs->fs_bsize) == -1)
+ err(1, "Unable to read cylinder group %d", sc->sc_cgx);
+
+ return (sc);
+}
+
+/*
+ * Lookup an inode number in the hash and allocate a suj_ino if it does
+ * not exist.
+ */
+static struct suj_ino *
+ino_lookup(ino_t ino, int creat)
+{
+ struct suj_ino *sino;
+ struct inohd *hd;
+ struct suj_cg *sc;
+
+ sc = cg_lookup(ino_to_cg(fs, ino));
+ if (sc->sc_lastino && sc->sc_lastino->si_ino == ino)
+ return (sc->sc_lastino);
+ hd = &sc->sc_inohash[SUJ_HASH(ino)];
+ LIST_FOREACH(sino, hd, si_next)
+ if (sino->si_ino == ino)
+ return (sino);
+ if (creat == 0)
+ return (NULL);
+ sino = errmalloc(sizeof(*sino));
+ bzero(sino, sizeof(*sino));
+ sino->si_ino = ino;
+ TAILQ_INIT(&sino->si_recs);
+ TAILQ_INIT(&sino->si_newrecs);
+ TAILQ_INIT(&sino->si_movs);
+ LIST_INSERT_HEAD(hd, sino, si_next);
+
+ return (sino);
+}
+
+/*
+ * Lookup a block number in the hash and allocate a suj_blk if it does
+ * not exist.  With 'creat' zero this is a pure lookup and may return
+ * NULL.
+ */
+static struct suj_blk *
+blk_lookup(ufs2_daddr_t blk, int creat)
+{
+	struct suj_blk *sblk;
+	struct suj_cg *sc;
+	struct blkhd *hd;
+
+	/* Block hashes are kept in the cg that owns the block. */
+	sc = cg_lookup(dtog(fs, blk));
+	if (sc->sc_lastblk && sc->sc_lastblk->sb_blk == blk)
+		return (sc->sc_lastblk);
+	hd = &sc->sc_blkhash[SUJ_HASH(fragstoblks(fs, blk))];
+	LIST_FOREACH(sblk, hd, sb_next)
+		if (sblk->sb_blk == blk)
+			return (sblk);
+	if (creat == 0)
+		return (NULL);
+	sblk = errmalloc(sizeof(*sblk));
+	bzero(sblk, sizeof(*sblk));
+	sblk->sb_blk = blk;
+	TAILQ_INIT(&sblk->sb_recs);
+	LIST_INSERT_HEAD(hd, sblk, sb_next);
+
+	return (sblk);
+}
+
+/*
+ * Lookup a data block in the global hash, allocating an empty tracking
+ * structure (no buffer until dblk_read()) if it is not present.
+ */
+static struct data_blk *
+dblk_lookup(ufs2_daddr_t blk)
+{
+	struct data_blk *dblk;
+	struct dblkhd *hd;
+
+	hd = &dbhash[SUJ_HASH(fragstoblks(fs, blk))];
+	/* One-entry cache catches repeated lookups of the same block. */
+	if (lastblk && lastblk->db_blk == blk)
+		return (lastblk);
+	LIST_FOREACH(dblk, hd, db_next)
+		if (dblk->db_blk == blk)
+			return (dblk);
+	/*
+	 * The data block wasn't located, allocate a new one.
+	 */
+	dblk = errmalloc(sizeof(*dblk));
+	bzero(dblk, sizeof(*dblk));
+	LIST_INSERT_HEAD(hd, dblk, db_next);
+	dblk->db_blk = blk;
+	return (dblk);
+}
+
+/*
+ * Return a cached in-core copy of 'size' bytes at block 'blk', reading
+ * from disk on first access or whenever the cached size differs.  A new
+ * dblk starts with db_size 0, so the first call always reads.
+ */
+static uint8_t *
+dblk_read(ufs2_daddr_t blk, int size)
+{
+	struct data_blk *dblk;
+
+	dblk = dblk_lookup(blk);
+	/*
+	 * I doubt size mismatches can happen in practice but it is trivial
+	 * to handle.
+	 */
+	if (size != dblk->db_size) {
+		if (dblk->db_buf)
+			free(dblk->db_buf);
+		dblk->db_buf = errmalloc(size);
+		dblk->db_size = size;
+		if (bread(disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1)
+			err(1, "Failed to read data block %jd", blk);
+	}
+	return (dblk->db_buf);
+}
+
+/*
+ * Mark a cached data block dirty so dblk_write() writes it back.
+ */
+static void
+dblk_dirty(ufs2_daddr_t blk)
+{
+	struct data_blk *dblk;
+
+	dblk = dblk_lookup(blk);
+	dblk->db_dirty = 1;
+}
+
+/*
+ * Write back every dirty cached data block.
+ */
+static void
+dblk_write(void)
+{
+	struct data_blk *dblk;
+	int i;
+
+	for (i = 0; i < SUJ_HASHSIZE; i++) {
+		LIST_FOREACH(dblk, &dbhash[i], db_next) {
+			/* db_size 0 means the block was never read in. */
+			if (dblk->db_dirty == 0 || dblk->db_size == 0)
+				continue;
+			if (bwrite(disk, fsbtodb(fs, dblk->db_blk),
+			    dblk->db_buf, dblk->db_size) == -1)
+				err(1, "Unable to write block %jd",
+				    dblk->db_blk);
+		}
+	}
+}
+
+/*
+ * Return a pointer to the in-core dinode for 'ino', reading the
+ * containing inode block through a per-cg block cache.  The returned
+ * pointer aliases the cached buffer and stays valid while the cache
+ * entry does.
+ */
+static union dinode *
+ino_read(ino_t ino)
+{
+	struct ino_blk *iblk;
+	struct iblkhd *hd;
+	struct suj_cg *sc;
+	ufs2_daddr_t blk;
+	int off;
+
+	blk = ino_to_fsba(fs, ino);
+	sc = cg_lookup(ino_to_cg(fs, ino));
+	iblk = sc->sc_lastiblk;
+	if (iblk && iblk->ib_blk == blk)
+		goto found;
+	hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))];
+	LIST_FOREACH(iblk, hd, ib_next)
+		if (iblk->ib_blk == blk)
+			goto found;
+	/*
+	 * The inode block wasn't located, allocate a new one.
+	 */
+	iblk = errmalloc(sizeof(*iblk));
+	bzero(iblk, sizeof(*iblk));
+	iblk->ib_buf = errmalloc(fs->fs_bsize);
+	iblk->ib_blk = blk;
+	LIST_INSERT_HEAD(hd, iblk, ib_next);
+	if (bread(disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1)
+		err(1, "Failed to read inode block %jd", blk);
+found:
+	sc->sc_lastiblk = iblk;
+	off = ino_to_fsbo(fs, ino);
+	if (fs->fs_magic == FS_UFS1_MAGIC)
+		return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off];
+	else
+		return (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off];
+}
+
+/*
+ * Mark the inode block containing 'ino' dirty.  If the block is not
+ * cached yet it is read in and the function recurses once; ino_read()
+ * caches the block, so the retry is guaranteed to find it.
+ */
+static void
+ino_dirty(ino_t ino)
+{
+	struct ino_blk *iblk;
+	struct iblkhd *hd;
+	struct suj_cg *sc;
+	ufs2_daddr_t blk;
+
+	blk = ino_to_fsba(fs, ino);
+	sc = cg_lookup(ino_to_cg(fs, ino));
+	iblk = sc->sc_lastiblk;
+	if (iblk && iblk->ib_blk == blk) {
+		iblk->ib_dirty = 1;
+		return;
+	}
+	hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))];
+	LIST_FOREACH(iblk, hd, ib_next) {
+		if (iblk->ib_blk == blk) {
+			iblk->ib_dirty = 1;
+			return;
+		}
+	}
+	/* Not cached: pull the block in, then retry (terminates). */
+	ino_read(ino);
+	ino_dirty(ino);
+}
+
+/*
+ * Write back one cached inode block if it has been dirtied.
+ */
+static void
+iblk_write(struct ino_blk *iblk)
+{
+
+	if (iblk->ib_dirty == 0)
+		return;
+	if (bwrite(disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf,
+	    fs->fs_bsize) == -1)
+		err(1, "Failed to write inode block %jd", iblk->ib_blk);
+}
+
+/*
+ * Return 1 if the frag run [start, start + frags) intersects the run
+ * described by the journal block record, 0 otherwise.
+ */
+static int
+blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags)
+{
+	ufs2_daddr_t recstart;
+	ufs2_daddr_t recend;
+
+	recstart = brec->jb_blkno + brec->jb_oldfrags;
+	recend = recstart + brec->jb_frags;
+	return (start < recend && start + frags > recstart);
+}
+
+/*
+ * Return 1 if the journal block record describes exactly the run of
+ * 'frags' frags starting at 'start' that belongs to (ino, lbn).
+ */
+static int
+blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start,
+    int frags)
+{
+
+	return (brec->jb_ino == ino && brec->jb_lbn == lbn &&
+	    brec->jb_blkno + brec->jb_oldfrags == start &&
+	    brec->jb_frags == frags);
+}
+
+/*
+ * Set the bits covered by the record's frag run in *mask.
+ */
+static void
+blk_setmask(struct jblkrec *brec, int *mask)
+{
+	int bit;
+	int last;
+
+	last = brec->jb_oldfrags + brec->jb_frags;
+	for (bit = brec->jb_oldfrags; bit < last; bit++)
+		*mask |= 1 << bit;
+}
+
+/*
+ * Determine whether a given block has been reallocated to a new location.
+ * Returns a mask of overlapping bits if any frags have been reused or
+ * zero if the block has not been re-used and the contents can be trusted.
+ *
+ * This is used to ensure that an orphaned pointer due to truncate is safe
+ * to be freed.  The mask value can be used to free partial blocks.
+ */
+static int
+blk_freemask(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags)
+{
+	struct suj_blk *sblk;
+	struct suj_rec *srec;
+	struct jblkrec *brec;
+	int mask;
+	int off;
+
+	/*
+	 * To be certain we're not freeing a reallocated block we lookup
+	 * this block in the blk hash and see if there is an allocation
+	 * journal record that overlaps with any fragments in the block
+	 * we're concerned with.  If any fragments have been reallocated
+	 * the block has already been freed and re-used for another purpose.
+	 */
+	mask = 0;
+	sblk = blk_lookup(blknum(fs, blk), 0);
+	if (sblk == NULL)
+		return (0);
+	/* 'off' is the frag offset of blk within the hashed block. */
+	off = blk - sblk->sb_blk;
+	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
+		brec = (struct jblkrec *)srec->sr_rec;
+		/*
+		 * If the block overlaps but does not match
+		 * exactly it's a new allocation.  If it matches
+		 * exactly this record refers to the current
+		 * location.
+		 */
+		if (blk_overlaps(brec, blk, frags) == 0)
+			continue;
+		if (blk_equals(brec, ino, lbn, blk, frags) == 1)
+			mask = 0;
+		else
+			blk_setmask(brec, &mask);
+	}
+	if (debug)
+		printf("blk_freemask: blk %jd sblk %jd off %d mask 0x%X\n",
+		    blk, sblk->sb_blk, off, mask);
+	return (mask >> off);
+}
+
+/*
+ * Determine whether it is safe to follow an indirect.  It is not safe
+ * if any part of the indirect has been reallocated or the last journal
+ * entry was an allocation.  Just allocated indirects may not have valid
+ * pointers yet and all of their children will have their own records.
+ * It is also not safe to follow an indirect if the cg bitmap has been
+ * cleared as a new allocation may write to the block prior to the journal
+ * being written.
+ *
+ * Returns 1 if it's safe to follow the indirect and 0 otherwise.
+ */
+static int
+blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn)
+{
+	struct suj_blk *sblk;
+	struct jblkrec *brec;
+
+	sblk = blk_lookup(blk, 0);
+	if (sblk == NULL)
+		return (1);
+	if (TAILQ_EMPTY(&sblk->sb_recs))
+		return (1);
+	/* Only the most recent journal record for this block matters. */
+	brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec;
+	if (blk_equals(brec, ino, lbn, blk, fs->fs_frag))
+		if (brec->jb_op == JOP_FREEBLK)
+			return (!blk_isfree(blk));
+	return (0);
+}
+
+/*
+ * Clear an inode from the cg bitmap.  If the inode was already clear return
+ * 0 so the caller knows it does not have to check the inode contents.
+ */
+static int
+ino_free(ino_t ino, int mode)
+{
+	struct suj_cg *sc;
+	uint8_t *inosused;
+	struct cg *cgp;
+	int cg;
+
+	cg = ino_to_cg(fs, ino);
+	/* Convert to a cg-relative inode number for the bitmap. */
+	ino = ino % fs->fs_ipg;
+	sc = cg_lookup(cg);
+	cgp = sc->sc_cgp;
+	inosused = cg_inosused(cgp);
+	/*
+	 * The bitmap may never have made it to the disk so we have to
+	 * conditionally clear.  We can avoid writing the cg in this case.
+	 */
+	if (isclr(inosused, ino))
+		return (0);
+	freeinos++;
+	clrbit(inosused, ino);
+	if (ino < cgp->cg_irotor)
+		cgp->cg_irotor = ino;
+	cgp->cg_cs.cs_nifree++;
+	if ((mode & IFMT) == IFDIR) {
+		freedir++;
+		cgp->cg_cs.cs_ndir--;
+	}
+	sc->sc_dirty = 1;
+
+	return (1);
+}
+
+/*
+ * Free 'frags' frags starting at filesystem block 'bno' skipping any frags
+ * set in the mask.  Mask bits mark frags that were reallocated elsewhere
+ * and must be left alone.
+ */
+static void
+blk_free(ufs2_daddr_t bno, int mask, int frags)
+{
+	ufs1_daddr_t fragno, cgbno;
+	struct suj_cg *sc;
+	struct cg *cgp;
+	int i, cg;
+	uint8_t *blksfree;
+
+	if (debug)
+		printf("Freeing %d frags at blk %jd\n", frags, bno);
+	cg = dtog(fs, bno);
+	sc = cg_lookup(cg);
+	cgp = sc->sc_cgp;
+	cgbno = dtogd(fs, bno);
+	blksfree = cg_blksfree(cgp);
+
+	/*
+	 * If it's not allocated we only wrote the journal entry
+	 * and never the bitmaps.  Here we unconditionally clear and
+	 * resolve the cg summary later.
+	 */
+	if (frags == fs->fs_frag && mask == 0) {
+		/* Whole block with no surviving frags: use block macros. */
+		fragno = fragstoblks(fs, cgbno);
+		ffs_setblock(fs, blksfree, fragno);
+		freeblocks++;
+	} else {
+		/*
+		 * deallocate the fragment
+		 */
+		for (i = 0; i < frags; i++)
+			if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) {
+				freefrags++;
+				setbit(blksfree, cgbno + i);
+			}
+	}
+	sc->sc_dirty = 1;
+}
+
+/*
+ * Returns 1 if the whole block starting at 'bno' is marked free and 0
+ * otherwise.  Consults the cached copy of the owning cg's bitmap.
+ */
+static int
+blk_isfree(ufs2_daddr_t bno)
+{
+	struct suj_cg *sc;
+
+	sc = cg_lookup(dtog(fs, bno));
+	return ffs_isblock(fs, cg_blksfree(sc->sc_cgp), dtogd(fs, bno));
+}
+
+/*
+ * Fetch an indirect block to find the block at a given lbn.  The lbn
+ * may be negative to fetch a specific indirect block pointer or positive
+ * to fetch a specific block.  'cur' is the (negative) lbn of the indirect
+ * block 'blk' itself; the function recurses down one level per call.
+ */
+static ufs2_daddr_t
+indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn)
+{
+	ufs2_daddr_t *bap2;
+	ufs2_daddr_t *bap1;
+	ufs_lbn_t lbnadd;
+	ufs_lbn_t base;
+	int level;
+	int i;
+
+	if (blk == 0)
+		return (0);
+	level = lbn_level(cur);
+	if (level == -1)
+		errx(1, "Invalid indir lbn %jd", lbn);
+	if (level == 0 && lbn < 0)
+		errx(1, "Invalid lbn %jd", lbn);
+	bap2 = (void *)dblk_read(blk, fs->fs_bsize);
+	bap1 = (void *)bap2;
+	/* lbnadd is the number of data lbns spanned by one slot here. */
+	lbnadd = 1;
+	base = -(cur + level);
+	for (i = level; i > 0; i--)
+		lbnadd *= NINDIR(fs);
+	if (lbn > 0)
+		i = (lbn - base) / lbnadd;
+	else
+		i = (-lbn - base) / lbnadd;
+	if (i < 0 || i >= NINDIR(fs))
+		errx(1, "Invalid indirect index %d produced by lbn %jd",
+		    i, lbn);
+	/* Compute the lbn named by slot i: data lbn or child indirect. */
+	if (level == 0)
+		cur = base + (i * lbnadd);
+	else
+		cur = -(base + (i * lbnadd)) - (level - 1);
+	if (fs->fs_magic == FS_UFS1_MAGIC)
+		blk = bap1[i];
+	else
+		blk = bap2[i];
+	if (cur == lbn)
+		return (blk);
+	/*
+	 * At level 0 the slot must have matched; the abort() that used to
+	 * precede this errx() made the diagnostic unreachable.
+	 */
+	if (level == 0)
+		errx(1, "Invalid lbn %jd at level 0", lbn);
+	return indir_blkatoff(blk, ino, cur, lbn);
+}
+
+/*
+ * Finds the disk block address at the specified lbn within the inode
+ * specified by ip.  This follows the whole tree and honors di_size and
+ * di_extsize so it is a true test of reachability.  The lbn may be
+ * negative if an extattr or indirect block is requested.  The frag count
+ * of the found block is returned via *frags.
+ */
+static ufs2_daddr_t
+ino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags)
+{
+	ufs_lbn_t tmpval;
+	ufs_lbn_t cur;
+	ufs_lbn_t next;
+	int i;
+
+	/*
+	 * Handle extattr blocks first.
+	 */
+	if (lbn < 0 && lbn >= -NXADDR) {
+		lbn = -1 - lbn;
+		if (lbn > lblkno(fs, ip->dp2.di_extsize - 1))
+			return (0);
+		*frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn));
+		return (ip->dp2.di_extb[lbn]);
+	}
+	/*
+	 * Now direct and indirect.
+	 *
+	 * NOTE(review): ino_visit() masks di_mode with IFMT before its
+	 * IFLNK compare but this compares the raw mode, which also holds
+	 * permission bits -- confirm short symlinks are detected here.
+	 */
+	if (DIP(ip, di_mode) == IFLNK &&
+	    DIP(ip, di_size) < fs->fs_maxsymlinklen)
+		return (0);
+	if (lbn >= 0 && lbn < NDADDR) {
+		*frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn));
+		return (DIP(ip, di_db[lbn]));
+	}
+	*frags = fs->fs_frag;
+
+	for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++,
+	    tmpval *= NINDIR(fs), cur = next) {
+		next = cur + tmpval;
+		/* -cur - i is the negative lbn of this indirect root. */
+		if (lbn == -cur - i)
+			return (DIP(ip, di_ib[i]));
+		/*
+		 * Determine whether the lbn in question is within this tree.
+		 */
+		if (lbn < 0 && -lbn >= next)
+			continue;
+		if (lbn > 0 && lbn >= next)
+			continue;
+		return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn);
+	}
+	errx(1, "lbn %jd not in ino", lbn);
+}
+
+/*
+ * Determine whether a block exists at a particular lbn in an inode.
+ * Returns 1 if found, 0 if not.  lbn may be negative for indirects
+ * or ext blocks.  *frags receives the frag count from ino_blkatoff().
+ */
+static int
+blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags)
+{
+	union dinode *ip;
+	ufs2_daddr_t nblk;
+
+	ip = ino_read(ino);
+
+	/* An unlinked or cleared inode cannot reference blocks. */
+	if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0)
+		return (0);
+	nblk = ino_blkatoff(ip, ino, lbn, frags);
+
+	return (nblk == blk);
+}
+
+/*
+ * Determines whether a pointer to an inode exists within a directory
+ * at a specified offset.  Returns the mode of the found entry via *mode
+ * and sets *isdot for "." and ".." entries.
+ */
+static int
+ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot)
+{
+	union dinode *dip;
+	struct direct *dp;
+	ufs2_daddr_t blk;
+	uint8_t *block;
+	ufs_lbn_t lbn;
+	int blksize;
+	int frags;
+	int dpoff;
+	int doff;
+
+	*isdot = 0;
+	dip = ino_read(parent);
+	*mode = DIP(dip, di_mode);
+	if ((*mode & IFMT) != IFDIR) {
+		if (debug) {
+			/*
+			 * This can happen if the parent inode
+			 * was reallocated.
+			 */
+			if (*mode != 0)
+				printf("Directory %d has bad mode %o\n",
+				    parent, *mode);
+			else
+				printf("Directory %d zero inode\n", parent);
+		}
+		return (0);
+	}
+	lbn = lblkno(fs, diroff);
+	doff = blkoff(fs, diroff);
+	blksize = sblksize(fs, DIP(dip, di_size), lbn);
+	/* The smallest possible entry must fit within di_size. */
+	if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) {
+		if (debug)
+			printf("ino %d absent from %d due to offset %jd"
+			    " exceeding size %jd\n",
+			    child, parent, diroff, DIP(dip, di_size));
+		return (0);
+	}
+	blk = ino_blkatoff(dip, parent, lbn, &frags);
+	if (blk <= 0) {
+		if (debug)
+			printf("Sparse directory %d", parent);
+		return (0);
+	}
+	block = dblk_read(blk, blksize);
+	/*
+	 * Walk through the records from the start of the block to be
+	 * certain we hit a valid record and not some junk in the middle
+	 * of a file name.  Stop when we reach or pass the expected offset.
+	 */
+	dpoff = (doff / DIRBLKSIZ) * DIRBLKSIZ;
+	do {
+		dp = (struct direct *)&block[dpoff];
+		if (dpoff == doff)
+			break;
+		/* A zero reclen would loop forever; treat as corruption. */
+		if (dp->d_reclen == 0)
+			break;
+		dpoff += dp->d_reclen;
+	} while (dpoff <= doff);
+	if (dpoff > fs->fs_bsize)
+		errx(1, "Corrupt directory block in dir ino %d", parent);
+	/* Not found. */
+	if (dpoff != doff) {
+		if (debug)
+			printf("ino %d not found in %d, lbn %jd, dpoff %d\n",
+			    child, parent, lbn, dpoff);
+		return (0);
+	}
+	/*
+	 * We found the item in question.  Record the mode and whether it's
+	 * a . or .. link for the caller.
+	 */
+	if (dp->d_ino == child) {
+		if (child == parent)
+			*isdot = 1;
+		else if (dp->d_namlen == 2 &&
+		    dp->d_name[0] == '.' && dp->d_name[1] == '.')
+			*isdot = 1;
+		*mode = DTTOIF(dp->d_type);
+		return (1);
+	}
+	if (debug)
+		printf("ino %d doesn't match dirent ino %d in parent %d\n",
+		    child, dp->d_ino, parent);
+	return (0);
+}
+
+#define	VISIT_INDIR	0x0001	/* Call the visitor on indirect blocks too. */
+#define	VISIT_EXT	0x0002	/* Visit external attribute blocks. */
+#define	VISIT_ROOT	0x0004	/* Operation came via root & valid pointers. */
+
+/*
+ * Read an indirect level which may or may not be linked into an inode.
+ * Each reachable block is passed to 'visitor' and *frags accumulates the
+ * frag count of all visited blocks.
+ */
+static void
+indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags,
+    ino_visitor visitor, int flags)
+{
+	ufs2_daddr_t *bap2;
+	ufs1_daddr_t *bap1;
+	ufs_lbn_t lbnadd;
+	ufs2_daddr_t nblk;
+	ufs_lbn_t nlbn;
+	int level;
+	int i;
+
+	/*
+	 * Don't visit indirect blocks with contents we can't trust.  This
+	 * should only happen when indir_visit() is called to complete a
+	 * truncate that never finished and not when a pointer is found via
+	 * an inode.
+	 */
+	if (blk == 0)
+		return;
+	level = lbn_level(lbn);
+	if (level == -1)
+		errx(1, "Invalid level for lbn %jd", lbn);
+	if ((flags & VISIT_ROOT) == 0 && blk_isindir(blk, ino, lbn) == 0) {
+		if (debug)
+			printf("blk %jd ino %d lbn %jd(%d) is not indir.\n",
+			    blk, ino, lbn, level);
+		goto out;
+	}
+	/* lbnadd is the span of data lbns covered by one slot. */
+	lbnadd = 1;
+	for (i = level; i > 0; i--)
+		lbnadd *= NINDIR(fs);
+	bap1 = (void *)dblk_read(blk, fs->fs_bsize);
+	bap2 = (void *)bap1;
+	for (i = 0; i < NINDIR(fs); i++) {
+		if (fs->fs_magic == FS_UFS1_MAGIC)
+			nblk = *bap1++;
+		else
+			nblk = *bap2++;
+		if (nblk == 0)
+			continue;
+		if (level == 0) {
+			/* Negative 'lbn' encodes this indirect's position. */
+			nlbn = -lbn + i * lbnadd;
+			(*frags) += fs->fs_frag;
+			visitor(ino, nlbn, nblk, fs->fs_frag);
+		} else {
+			nlbn = (lbn + 1) - (i * lbnadd);
+			indir_visit(ino, nlbn, nblk, frags, visitor, flags);
+		}
+	}
+out:
+	if (flags & VISIT_INDIR) {
+		(*frags) += fs->fs_frag;
+		visitor(ino, lbn, blk, fs->fs_frag);
+	}
+}
+
+/*
+ * Visit each block in an inode as specified by 'flags' and call a
+ * callback function.  The callback may inspect or free blocks.  The
+ * count of frags found according to the size in the file is returned.
+ * This is not valid for sparse files but may be used to determine
+ * the correct di_blocks for a file.
+ */
+static uint64_t
+ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags)
+{
+	ufs_lbn_t nextlbn;
+	ufs_lbn_t tmpval;
+	ufs_lbn_t lbn;
+	uint64_t size;
+	uint64_t fragcnt;
+	int mode;
+	int frags;
+	int i;
+
+	size = DIP(ip, di_size);
+	mode = DIP(ip, di_mode) & IFMT;
+	fragcnt = 0;
+	/* Extattr blocks exist only on UFS2. */
+	if ((flags & VISIT_EXT) &&
+	    fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) {
+		for (i = 0; i < NXADDR; i++) {
+			if (ip->dp2.di_extb[i] == 0)
+				continue;
+			frags = sblksize(fs, ip->dp2.di_extsize, i);
+			frags = numfrags(fs, frags);
+			fragcnt += frags;
+			visitor(ino, -1 - i, ip->dp2.di_extb[i], frags);
+		}
+	}
+	/* Skip datablocks for short links and devices. */
+	if (mode == IFBLK || mode == IFCHR ||
+	    (mode == IFLNK && size < fs->fs_maxsymlinklen))
+		return (fragcnt);
+	for (i = 0; i < NDADDR; i++) {
+		if (DIP(ip, di_db[i]) == 0)
+			continue;
+		frags = sblksize(fs, size, i);
+		frags = numfrags(fs, frags);
+		fragcnt += frags;
+		visitor(ino, i, DIP(ip, di_db[i]), frags);
+	}
+	/*
+	 * We know the following indirects are real as we're following
+	 * real pointers to them.
+	 */
+	flags |= VISIT_ROOT;
+	for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++,
+	    lbn = nextlbn) {
+		nextlbn = lbn + tmpval;
+		tmpval *= NINDIR(fs);
+		if (DIP(ip, di_ib[i]) == 0)
+			continue;
+		indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor,
+		    flags);
+	}
+	return (fragcnt);
+}
+
+/*
+ * Null visitor function used when we just want to count blocks and
+ * record the lbn.
+ */
+ufs_lbn_t visitlbn;	/* Last positive lbn seen by null_visit(). */
+static void
+null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+	/* Only data lbns are tracked; negative lbns name indirects. */
+	if (lbn > 0)
+		visitlbn = lbn;
+}
+
+/*
+ * Recalculate di_blocks when we discover that a block allocation or
+ * free was not successfully completed.  The kernel does not roll this back
+ * because it would be too expensive to compute which indirects were
+ * reachable at the time the inode was written.
+ */
+static void
+ino_adjblks(struct suj_ino *sino)
+{
+	union dinode *ip;
+	uint64_t blocks;
+	uint64_t frags;
+	off_t isize;
+	off_t size;
+	ino_t ino;
+
+	ino = sino->si_ino;
+	ip = ino_read(ino);
+	/* No need to adjust zero'd inodes. */
+	if (DIP(ip, di_mode) == 0)
+		return;
+	/*
+	 * Visit all blocks and count them as well as recording the last
+	 * valid lbn in the file.  If the file size doesn't agree with the
+	 * last lbn we need to truncate to fix it.  Otherwise just adjust
+	 * the blocks count.
+	 */
+	visitlbn = 0;
+	frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT);
+	blocks = fsbtodb(fs, frags);
+	/*
+	 * We assume the size and direct block list is kept coherent by
+	 * softdep.  For files that have extended into indirects we truncate
+	 * to the size in the inode or the maximum size permitted by
+	 * populated indirects.
+	 */
+	if (visitlbn >= NDADDR) {
+		isize = DIP(ip, di_size);
+		size = lblktosize(fs, visitlbn + 1);
+		if (isize > size)
+			isize = size;
+		/* Always truncate to free any unpopulated indirects. */
+		ino_trunc(sino->si_ino, isize);
+		return;
+	}
+	if (blocks == DIP(ip, di_blocks))
+		return;
+	if (debug)
+		printf("ino %d adjusting block count from %jd to %jd\n",
+		    ino, DIP(ip, di_blocks), blocks);
+	DIP_SET(ip, di_blocks, blocks);
+	ino_dirty(ino);
+}
+
+/*
+ * Visitor that frees a block run, honoring the reallocation mask so
+ * frags reused elsewhere are left alone.
+ */
+static void
+blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+	int mask;
+
+	mask = blk_freemask(blk, ino, lbn, frags);
+	if (debug)
+		printf("blk %jd freemask 0x%X\n", blk, mask);
+	blk_free(blk, mask, frags);
+}
+
+/*
+ * Free a block or tree of blocks that was previously rooted in ino at
+ * the given lbn.  If the lbn is an indirect all children are freed
+ * recursively.
+ */
+static void
+blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow)
+{
+	uint64_t resid;
+	int mask;
+
+	mask = blk_freemask(blk, ino, lbn, frags);
+	if (debug)
+		printf("blk %jd freemask 0x%X\n", blk, mask);
+	resid = 0;
+	/* Only descend into an indirect when no frag was reallocated. */
+	if (lbn <= -NDADDR && follow && mask == 0)
+		indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR);
+	else
+		blk_free(blk, mask, frags);
+}
+
+/*
+ * If the directory 'sino' still has a ".." entry pointing at 'parent',
+ * mark it so removal does not adjust the parent link again.
+ */
+static void
+ino_setskip(struct suj_ino *sino, ino_t parent)
+{
+	int isdot;
+	int mode;
+
+	if (ino_isat(sino->si_ino, DOTDOT_OFFSET, parent, &mode, &isdot))
+		sino->si_skipparent = 1;
+}
+
+/*
+ * Free the children of a directory when the directory is discarded.
+ * Called as a visitor on each data block of the directory.
+ */
+static void
+ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+	struct suj_ino *sino;
+	struct suj_rec *srec;
+	struct jrefrec *rrec;
+	struct direct *dp;
+	off_t diroff;
+	uint8_t *block;
+	int skipparent;
+	int isparent;
+	int dpoff;
+	int size;
+
+	sino = ino_lookup(ino, 0);
+	if (sino)
+		skipparent = sino->si_skipparent;
+	else
+		skipparent = 0;
+	size = lfragtosize(fs, frags);
+	block = dblk_read(blk, size);
+	dp = (struct direct *)&block[0];
+	for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) {
+		dp = (struct direct *)&block[dpoff];
+		if (dp->d_ino == 0 || dp->d_ino == WINO)
+			continue;
+		/* "." refers to this directory itself; skip it. */
+		if (dp->d_namlen == 1 && dp->d_name[0] == '.')
+			continue;
+		isparent = dp->d_namlen == 2 && dp->d_name[0] == '.' &&
+		    dp->d_name[1] == '.';
+		if (isparent && skipparent == 1)
+			continue;
+		if (debug)
+			printf("Directory %d removing ino %d name %s\n",
+			    ino, dp->d_ino, dp->d_name);
+		/*
+		 * Lookup this inode to see if we have a record for it.
+		 * If not, we've already adjusted it assuming this path
+		 * was valid and we have to adjust once more.
+		 */
+		sino = ino_lookup(dp->d_ino, 0);
+		if (sino == NULL || sino->si_hasrecs == 0) {
+			ino_decr(ino);
+			continue;
+		}
+		/*
+		 * Use ino_adjust() so if we lose the last non-dot reference
+		 * to a directory it can be discarded.
+		 */
+		if (sino->si_linkadj) {
+			sino->si_nlink--;
+			if (isparent)
+				sino->si_dotlinks--;
+			ino_adjust(sino);
+		}
+		/*
+		 * Tell any child directories we've already removed their
+		 * parent.  Don't try to adjust our link down again.
+		 */
+		if (isparent == 0)
+			ino_setskip(sino, ino);
+		/*
+		 * If we haven't yet processed this inode we need to make
+		 * sure we will successfully discover the lost path.  If not
+		 * use nlinkadj to remember.
+		 */
+		diroff = lblktosize(fs, lbn) + dpoff;
+		TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
+			rrec = (struct jrefrec *)srec->sr_rec;
+			if (rrec->jr_parent == ino &&
+			    rrec->jr_diroff == diroff)
+				break;
+		}
+		if (srec == NULL)
+			sino->si_nlinkadj++;
+	}
+}
+
+/*
+ * Reclaim an inode, freeing all blocks and decrementing all children's
+ * link counts.  Free the inode back to the cg.
+ */
+static void
+ino_reclaim(union dinode *ip, ino_t ino, int mode)
+{
+	uint32_t gen;
+
+	if (ino == ROOTINO)
+		errx(1, "Attempting to free ROOTINO");
+	if (debug)
+		printf("Truncating and freeing ino %d, nlink %d, mode %o\n",
+		    ino, DIP(ip, di_nlink), DIP(ip, di_mode));
+
+	/* We are freeing an inode or directory. */
+	if ((DIP(ip, di_mode) & IFMT) == IFDIR)
+		ino_visit(ip, ino, ino_free_children, 0);
+	DIP_SET(ip, di_nlink, 0);
+	ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR);
+	/* Here we have to clear the inode and release any blocks it holds. */
+	gen = DIP(ip, di_gen);
+	if (fs->fs_magic == FS_UFS1_MAGIC)
+		bzero(ip, sizeof(struct ufs1_dinode));
+	else
+		bzero(ip, sizeof(struct ufs2_dinode));
+	/* Preserve the generation number for the next user of this inode. */
+	DIP_SET(ip, di_gen, gen);
+	ino_dirty(ino);
+	ino_free(ino, mode);
+	return;
+}
+
+/*
+ * Adjust an inode's link count down by one when a directory goes away.
+ * Reclaims the inode if it drops below the minimum viable link count.
+ */
+static void
+ino_decr(ino_t ino)
+{
+	union dinode *ip;
+	int reqlink;
+	int nlink;
+	int mode;
+
+	ip = ino_read(ino);
+	nlink = DIP(ip, di_nlink);
+	mode = DIP(ip, di_mode);
+	if (nlink < 1)
+		errx(1, "Inode %d link count %d invalid", ino, nlink);
+	if (mode == 0)
+		errx(1, "Inode %d has a link of %d with 0 mode.", ino, nlink);
+	nlink--;
+	/* A directory needs "." plus its parent's entry to survive. */
+	if ((mode & IFMT) == IFDIR)
+		reqlink = 2;
+	else
+		reqlink = 1;
+	if (nlink < reqlink) {
+		if (debug)
+			printf("ino %d not enough links to live %d < %d\n",
+			    ino, nlink, reqlink);
+		ino_reclaim(ip, ino, mode);
+		return;
+	}
+	DIP_SET(ip, di_nlink, nlink);
+	ino_dirty(ino);
+}
+
+/*
+ * Adjust the inode link count to 'nlink'.  If the count reaches zero
+ * free it.
+ */
+static void
+ino_adjust(struct suj_ino *sino)
+{
+	struct jrefrec *rrec;
+	struct suj_rec *srec;
+	struct suj_ino *stmp;
+	union dinode *ip;
+	nlink_t nlink;
+	int reqlink;
+	int mode;
+	ino_t ino;
+
+	nlink = sino->si_nlink;
+	ino = sino->si_ino;
+	/*
+	 * If it's a directory with no real names pointing to it go ahead
+	 * and truncate it.  This will free any children.
+	 */
+	if ((sino->si_mode & IFMT) == IFDIR &&
+	    nlink - sino->si_dotlinks == 0) {
+		sino->si_nlink = nlink = 0;
+		/*
+		 * Mark any .. links so they know not to free this inode
+		 * when they are removed.
+		 */
+		TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
+			rrec = (struct jrefrec *)srec->sr_rec;
+			if (rrec->jr_diroff == DOTDOT_OFFSET) {
+				stmp = ino_lookup(rrec->jr_parent, 0);
+				if (stmp)
+					ino_setskip(stmp, ino);
+			}
+		}
+	}
+	ip = ino_read(ino);
+	mode = DIP(ip, di_mode) & IFMT;
+	/* Sanity-check: journal math must never exceed LINK_MAX. */
+	if (nlink > LINK_MAX)
+		errx(1,
+		    "ino %d nlink manipulation error, new link %d, old link %d",
+		    ino, nlink, DIP(ip, di_nlink));
+	if (debug)
+		printf("Adjusting ino %d, nlink %d, old link %d lastmode %o\n",
+		    ino, nlink, DIP(ip, di_nlink), sino->si_mode);
+	if (mode == 0) {
+		if (debug)
+			printf("ino %d, zero inode freeing bitmap\n", ino);
+		ino_free(ino, sino->si_mode);
+		return;
+	}
+	/* XXX Should be an assert? */
+	if (mode != sino->si_mode && debug)
+		printf("ino %d, mode %o != %o\n", ino, mode, sino->si_mode);
+	if ((mode & IFMT) == IFDIR)
+		reqlink = 2;
+	else
+		reqlink = 1;
+	/* If the inode doesn't have enough links to live, free it. */
+	if (nlink < reqlink) {
+		if (debug)
+			printf("ino %d not enough links to live %d < %d\n",
+			    ino, nlink, reqlink);
+		ino_reclaim(ip, ino, mode);
+		return;
+	}
+	/* If required write the updated link count. */
+	if (DIP(ip, di_nlink) == nlink) {
+		if (debug)
+			printf("ino %d, link matches, skipping.\n", ino);
+		return;
+	}
+	DIP_SET(ip, di_nlink, nlink);
+	ino_dirty(ino);
+}
+
+/*
+ * Truncate some or all blocks in an indirect, freeing any that are required
+ * and zeroing the indirect.  'lastlbn' is the first lbn to be preserved.
+ */
+static void
+indir_trunc(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, ufs_lbn_t lastlbn)
+{
+	ufs2_daddr_t *bap2;
+	ufs1_daddr_t *bap1;
+	ufs_lbn_t lbnadd;
+	ufs2_daddr_t nblk;
+	ufs_lbn_t next;
+	ufs_lbn_t nlbn;
+	int dirty;
+	int level;
+	int i;
+
+	if (blk == 0)
+		return;
+	dirty = 0;
+	level = lbn_level(lbn);
+	if (level == -1)
+		errx(1, "Invalid level for lbn %jd", lbn);
+	lbnadd = 1;
+	for (i = level; i > 0; i--)
+		lbnadd *= NINDIR(fs);
+	bap1 = (void *)dblk_read(blk, fs->fs_bsize);
+	bap2 = (void *)bap1;
+	for (i = 0; i < NINDIR(fs); i++) {
+		if (fs->fs_magic == FS_UFS1_MAGIC)
+			nblk = *bap1++;
+		else
+			nblk = *bap2++;
+		if (nblk == 0)
+			continue;
+		if (level != 0) {
+			nlbn = (lbn + 1) - (i * lbnadd);
+			/*
+			 * Calculate the lbn of the next indirect to
+			 * determine if any of this indirect must be
+			 * reclaimed.
+			 */
+			next = -(lbn + level) + ((i+1) * lbnadd);
+			if (next <= lastlbn)
+				continue;
+			indir_trunc(ino, nlbn, nblk, lastlbn);
+			/* If all of this indirect was reclaimed, free it. */
+			nlbn = next - lbnadd;
+			if (nlbn < lastlbn)
+				continue;
+		} else {
+			nlbn = -lbn + i * lbnadd;
+			if (nlbn < lastlbn)
+				continue;
+		}
+		dirty = 1;
+		blk_free(nblk, 0, fs->fs_frag);
+		/* bap points one past the slot just read; zero that slot. */
+		if (fs->fs_magic == FS_UFS1_MAGIC)
+			*(bap1 - 1) = 0;
+		else
+			*(bap2 - 1) = 0;
+	}
+	if (dirty)
+		dblk_dirty(blk);
+}
+
+/*
+ * Truncate an inode to the minimum of the given size or the last populated
+ * block after any over size have been discarded.  The kernel would allocate
+ * the last block in the file but fsck does not and neither do we.  This
+ * code never extends files, only shrinks them.
+ */
+static void
+ino_trunc(ino_t ino, off_t size)
+{
+	union dinode *ip;
+	ufs2_daddr_t bn;
+	uint64_t totalfrags;
+	ufs_lbn_t nextlbn;
+	ufs_lbn_t lastlbn;
+	ufs_lbn_t tmpval;
+	ufs_lbn_t lbn;
+	ufs_lbn_t i;
+	int frags;
+	off_t cursize;
+	off_t off;
+	int mode;
+
+	ip = ino_read(ino);
+	mode = DIP(ip, di_mode) & IFMT;
+	cursize = DIP(ip, di_size);
+	if (debug)
+		printf("Truncating ino %d, mode %o to size %jd from size %jd\n",
+		    ino, mode, size, cursize);
+
+	/* Skip datablocks for short links and devices. */
+	if (mode == 0 || mode == IFBLK || mode == IFCHR ||
+	    (mode == IFLNK && cursize < fs->fs_maxsymlinklen))
+		return;
+	/* Don't extend. */
+	if (size > cursize)
+		size = cursize;
+	lastlbn = lblkno(fs, blkroundup(fs, size));
+	for (i = lastlbn; i < NDADDR; i++) {
+		if (DIP(ip, di_db[i]) == 0)
+			continue;
+		frags = sblksize(fs, cursize, i);
+		frags = numfrags(fs, frags);
+		blk_free(DIP(ip, di_db[i]), 0, frags);
+		DIP_SET(ip, di_db[i], 0);
+	}
+	/*
+	 * Follow indirect blocks, freeing anything required.
+	 */
+	for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++,
+	    lbn = nextlbn) {
+		nextlbn = lbn + tmpval;
+		tmpval *= NINDIR(fs);
+		/* If we're not freeing any in this indirect range skip it. */
+		if (lastlbn >= nextlbn)
+			continue;
+		if (DIP(ip, di_ib[i]) == 0)
+			continue;
+		indir_trunc(ino, -lbn - i, DIP(ip, di_ib[i]), lastlbn);
+		/* If we freed everything in this indirect free the indir. */
+		if (lastlbn > lbn)
+			continue;
+		/*
+		 * An indirect always occupies a full block; 'frags' from
+		 * the direct-block loop above may be stale or even
+		 * uninitialized here, so use fs_frag explicitly.
+		 */
+		blk_free(DIP(ip, di_ib[i]), 0, fs->fs_frag);
+		DIP_SET(ip, di_ib[i], 0);
+	}
+	ino_dirty(ino);
+	/*
+	 * Now that we've freed any whole blocks that exceed the desired
+	 * truncation size, figure out how many blocks remain and what the
+	 * last populated lbn is.  We will set the size to this last lbn
+	 * rather than worrying about allocating the final lbn as the kernel
+	 * would've done.  This is consistent with normal fsck behavior.
+	 */
+	visitlbn = 0;
+	totalfrags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT);
+	if (size > lblktosize(fs, visitlbn + 1))
+		size = lblktosize(fs, visitlbn + 1);
+	/*
+	 * If we're truncating direct blocks we have to adjust frags
+	 * accordingly.
+	 */
+	if (visitlbn < NDADDR && totalfrags) {
+		long oldspace, newspace;
+
+		bn = DIP(ip, di_db[visitlbn]);
+		if (bn == 0)
+			errx(1, "Bad blk at ino %d lbn %jd\n", ino, visitlbn);
+		oldspace = sblksize(fs, cursize, visitlbn);
+		newspace = sblksize(fs, size, visitlbn);
+		if (oldspace != newspace) {
+			/* Free the tail frags no longer covered by size. */
+			bn += numfrags(fs, newspace);
+			frags = numfrags(fs, oldspace - newspace);
+			blk_free(bn, 0, frags);
+			totalfrags -= frags;
+		}
+	}
+	DIP_SET(ip, di_blocks, fsbtodb(fs, totalfrags));
+	DIP_SET(ip, di_size, size);
+	/*
+	 * If we've truncated into the middle of a block or frag we have
+	 * to zero it here.  Otherwise the file could extend into
+	 * uninitialized space later.
+	 */
+	off = blkoff(fs, size);
+	if (off) {
+		uint8_t *buf;
+		long clrsize;
+
+		bn = ino_blkatoff(ip, ino, visitlbn, &frags);
+		if (bn == 0)
+			errx(1, "Block missing from ino %d at lbn %jd\n",
+			    ino, visitlbn);
+		clrsize = frags * fs->fs_fsize;
+		buf = dblk_read(bn, clrsize);
+		clrsize -= off;
+		buf += off;
+		bzero(buf, clrsize);
+		dblk_dirty(bn);
+	}
+	return;
+}
+
+/*
+ * Process records available for one inode and determine whether the
+ * link count is correct or needs adjusting.
+ */
+static void
+ino_check(struct suj_ino *sino)
+{
+	struct suj_rec *srec;
+	struct jrefrec *rrec;
+	nlink_t dotlinks;
+	int newlinks;
+	int removes;
+	int nlink;
+	ino_t ino;
+	int isdot;
+	int isat;
+	int mode;
+
+	if (sino->si_hasrecs == 0)
+		return;
+	ino = sino->si_ino;
+	/* Start from the link count recorded in the first journal record. */
+	rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec;
+	nlink = rrec->jr_nlink;
+	newlinks = 0;
+	dotlinks = 0;
+	removes = sino->si_nlinkadj;
+	TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
+		rrec = (struct jrefrec *)srec->sr_rec;
+		isat = ino_isat(rrec->jr_parent, rrec->jr_diroff,
+		    rrec->jr_ino, &mode, &isdot);
+		if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT))
+			errx(1, "Inode mode/directory type mismatch %o != %o",
+			    mode, rrec->jr_mode);
+		if (debug)
+			printf("jrefrec: op %d ino %d, nlink %d, parent %d, "
+			    "diroff %jd, mode %o, isat %d, isdot %d\n",
+			    rrec->jr_op, rrec->jr_ino, rrec->jr_nlink,
+			    rrec->jr_parent, rrec->jr_diroff, rrec->jr_mode,
+			    isat, isdot);
+		mode = rrec->jr_mode & IFMT;
+		if (rrec->jr_op == JOP_REMREF)
+			removes++;
+		newlinks += isat;
+		if (isdot)
+			dotlinks += isat;
+	}
+	/*
+	 * The number of links that remain are the starting link count
+	 * subtracted by the total number of removes with the total
+	 * links discovered back in.  An incomplete remove thus
+	 * makes no change to the link count but an add increases
+	 * by one.
+	 */
+	if (debug)
+		printf("ino %d nlink %d newlinks %d removes %d dotlinks %d\n",
+		    ino, nlink, newlinks, removes, dotlinks);
+	nlink += newlinks;
+	nlink -= removes;
+	sino->si_linkadj = 1;
+	sino->si_nlink = nlink;
+	sino->si_dotlinks = dotlinks;
+	/* 'mode' holds the IFMT bits of the last record examined. */
+	sino->si_mode = mode;
+	ino_adjust(sino);
+}
+
+/*
+ * Process records available for one block and determine whether it is
+ * still allocated and whether the owning inode needs to be updated or
+ * a free completed.
+ */
+static void
+blk_check(struct suj_blk *sblk)
+{
+	struct suj_rec *srec;
+	struct jblkrec *brec;
+	struct suj_ino *sino;
+	ufs2_daddr_t blk;
+	int mask;
+	int frags;
+	int isat;
+
+	/*
+	 * Each suj_blk actually contains records for any fragments in that
+	 * block.  As a result we must evaluate each record individually.
+	 */
+	sino = NULL;
+	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
+		brec = (struct jblkrec *)srec->sr_rec;
+		frags = brec->jb_frags;
+		/* jb_oldfrags is the offset of this record in the block. */
+		blk = brec->jb_blkno + brec->jb_oldfrags;
+		/* On return frags holds the matched contiguous count. */
+		isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags);
+		/* Cache the sino lookup; records for one ino tend to run. */
+		if (sino == NULL || sino->si_ino != brec->jb_ino) {
+			sino = ino_lookup(brec->jb_ino, 1);
+			sino->si_blkadj = 1;
+		}
+		if (debug)
+			printf("op %d blk %jd ino %d lbn %jd frags %d isat %d (%d)\n",
+			    brec->jb_op, blk, brec->jb_ino, brec->jb_lbn,
+			    brec->jb_frags, isat, frags);
+		/*
+		 * If we found the block at this address we still have to
+		 * determine if we need to free the tail end that was
+		 * added by adding contiguous fragments from the same block.
+		 */
+		if (isat == 1) {
+			if (frags == brec->jb_frags)
+				continue;
+			mask = blk_freemask(blk, brec->jb_ino, brec->jb_lbn,
+			    brec->jb_frags);
+			mask >>= frags;
+			blk += frags;
+			frags = brec->jb_frags - frags;
+			blk_free(blk, mask, frags);
+			continue;
+		}
+		/*
+	 	 * The block wasn't found, attempt to free it.  It won't be
+	 	 * freed if it was actually reallocated.  If this was an
+	 	 * allocation we don't want to follow indirects as they
+	 	 * may not be written yet.  Any children of the indirect will
+	 	 * have their own records.  If it's a free we need to
+	 	 * recursively free children.
+	 	 */
+		blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags,
+		    brec->jb_op == JOP_FREEBLK);
+	}
+}
+
+/*
+ * Walk the list of inode records for this cg and resolve moved and duplicate
+ * inode references now that we have a complete picture.
+ */
+static void
+cg_build(struct suj_cg *sc)
+{
+	struct suj_ino *sino;
+	int bucket;
+
+	/* Visit every hash bucket; per-bucket order is irrelevant here. */
+	for (bucket = 0; bucket < SUJ_HASHSIZE; bucket++) {
+		LIST_FOREACH(sino, &sc->sc_inohash[bucket], si_next)
+			ino_build(sino);
+	}
+}
+
+/*
+ * Handle inodes requiring truncation.  This must be done prior to
+ * looking up any inodes in directories.
+ */
+static void
+cg_trunc(struct suj_cg *sc)
+{
+	struct suj_ino *sino;
+	int bucket;
+
+	/* Apply and then clear any pending truncation record. */
+	for (bucket = 0; bucket < SUJ_HASHSIZE; bucket++) {
+		LIST_FOREACH(sino, &sc->sc_inohash[bucket], si_next) {
+			if (sino->si_trunc == NULL)
+				continue;
+			ino_trunc(sino->si_ino, sino->si_trunc->jt_size);
+			sino->si_trunc = NULL;
+		}
+	}
+}
+
+/*
+ * Free any partially allocated blocks and then resolve inode block
+ * counts.
+ */
+static void
+cg_check_blk(struct suj_cg *sc)
+{
+	struct suj_blk *sblk;
+	struct suj_ino *sino;
+	int bucket;
+
+	/* First pass: evaluate every block record hashed into this cg. */
+	for (bucket = 0; bucket < SUJ_HASHSIZE; bucket++) {
+		LIST_FOREACH(sblk, &sc->sc_blkhash[bucket], sb_next)
+			blk_check(sblk);
+	}
+	/*
+	 * Now that we've freed blocks which are not referenced we
+	 * make a second pass over all inodes to adjust their block
+	 * counts.
+	 */
+	for (bucket = 0; bucket < SUJ_HASHSIZE; bucket++) {
+		LIST_FOREACH(sino, &sc->sc_inohash[bucket], si_next) {
+			if (sino->si_blkadj)
+				ino_adjblks(sino);
+		}
+	}
+}
+
+/*
+ * Walk the list of inode records for this cg, recovering any
+ * changes which were not complete at the time of crash.
+ */
+static void
+cg_check_ino(struct suj_cg *sc)
+{
+	struct suj_ino *sino;
+	int bucket;
+
+	for (bucket = 0; bucket < SUJ_HASHSIZE; bucket++) {
+		LIST_FOREACH(sino, &sc->sc_inohash[bucket], si_next)
+			ino_check(sino);
+	}
+}
+
+/*
+ * Write a potentially dirty cg.  Recalculate the summary information and
+ * update the superblock summary.
+ */
+static void
+cg_write(struct suj_cg *sc)
+{
+	ufs1_daddr_t fragno, cgbno, maxbno;
+	u_int8_t *blksfree;
+	struct cg *cgp;
+	int blk;
+	int i;
+
+	if (sc->sc_dirty == 0)
+		return;
+	/*
+	 * Fix the frag and cluster summary.
+	 */
+	cgp = sc->sc_cgp;
+	cgp->cg_cs.cs_nbfree = 0;
+	cgp->cg_cs.cs_nffree = 0;
+	bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum));
+	maxbno = fragstoblks(fs, fs->fs_fpg);
+	if (fs->fs_contigsumsize > 0) {
+		for (i = 1; i <= fs->fs_contigsumsize; i++)
+			cg_clustersum(cgp)[i] = 0;
+		bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT));
+	}
+	/* Rebuild the counts by scanning the (already correct) bitmap. */
+	blksfree = cg_blksfree(cgp);
+	for (cgbno = 0; cgbno < maxbno; cgbno++) {
+		/* Fully allocated block: nothing to count. */
+		if (ffs_isfreeblock(fs, blksfree, cgbno))
+			continue;
+		/* Fully free block: counts toward nbfree and clusters. */
+		if (ffs_isblock(fs, blksfree, cgbno)) {
+			ffs_clusteracct(fs, cgp, cgbno, 1);
+			cgp->cg_cs.cs_nbfree++;
+			continue;
+		}
+		/* Partially free block: account individual fragments. */
+		fragno = blkstofrags(fs, cgbno);
+		blk = blkmap(fs, blksfree, fragno);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
+		for (i = 0; i < fs->fs_frag; i++)
+			if (isset(blksfree, fragno + i))
+				cgp->cg_cs.cs_nffree++;
+	}
+	/*
+	 * Update the superblock cg summary from our now correct values
+	 * before writing the block.
+	 */
+	fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs;
+	if (bwrite(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf,
+	    fs->fs_bsize) == -1)
+		err(1, "Unable to write cylinder group %d", sc->sc_cgx);
+}
+
+/*
+ * Write out any modified inodes.
+ */
+static void
+cg_write_inos(struct suj_cg *sc)
+{
+	struct ino_blk *iblk;
+	int bucket;
+
+	/* Flush only the inode blocks that were marked dirty. */
+	for (bucket = 0; bucket < SUJ_HASHSIZE; bucket++) {
+		LIST_FOREACH(iblk, &sc->sc_iblkhash[bucket], ib_next) {
+			if (iblk->ib_dirty)
+				iblk_write(iblk);
+		}
+	}
+}
+
+/*
+ * Invoke a callback on every cylinder group present in the hash.
+ */
+static void
+cg_apply(void (*apply)(struct suj_cg *))
+{
+	struct suj_cg *sc;
+	int bucket;
+
+	for (bucket = 0; bucket < SUJ_HASHSIZE; bucket++) {
+		LIST_FOREACH(sc, &cghash[bucket], sc_next)
+			apply(sc);
+	}
+}
+
+/*
+ * Process the unlinked but referenced file list.  Freeing all inodes.
+ *
+ * Walks the singly linked list headed by fs_sujfree and chained through
+ * di_freelink, reclaiming any inode whose link count reached zero.
+ */
+static void
+ino_unlinked(void)
+{
+	union dinode *ip;
+	uint16_t mode;
+	ino_t inon;
+	ino_t ino;
+
+	ino = fs->fs_sujfree;
+	/* The list is consumed here; clear the head in the superblock. */
+	fs->fs_sujfree = 0;
+	while (ino != 0) {
+		ip = ino_read(ino);
+		mode = DIP(ip, di_mode) & IFMT;
+		/* Save the next pointer before clearing it on disk. */
+		inon = DIP(ip, di_freelink);
+		DIP_SET(ip, di_freelink, 0);
+		/*
+		 * XXX Should this be an errx?
+		 */
+		if (DIP(ip, di_nlink) == 0) {
+			if (debug)
+				printf("Freeing unlinked ino %d mode %o\n",
+				    ino, mode);
+			ino_reclaim(ip, ino, mode);
+		} else if (debug)
+			printf("Skipping ino %d mode %o with link %d\n",
+			    ino, mode, DIP(ip, di_nlink));
+		ino = inon;
+	}
+}
+
+/*
+ * Append a new record to the list of records requiring processing.
+ *
+ * Records are queued on si_newrecs in journal order; ino_build() later
+ * resolves them into si_recs/si_movs.
+ */
+static void
+ino_append(union jrec *rec)
+{
+	struct jrefrec *refrec;
+	struct jmvrec *mvrec;
+	struct suj_ino *sino;
+	struct suj_rec *srec;
+
+	mvrec = &rec->rec_jmvrec;
+	refrec = &rec->rec_jrefrec;
+	if (debug && mvrec->jm_op == JOP_MVREF)
+		printf("ino move: ino %d, parent %d, diroff %jd, oldoff %jd\n",
+		    mvrec->jm_ino, mvrec->jm_parent, mvrec->jm_newoff,
+		    mvrec->jm_oldoff);
+	else if (debug &&
+	    (refrec->jr_op == JOP_ADDREF || refrec->jr_op == JOP_REMREF))
+		printf("ino ref: op %d, ino %d, nlink %d, "
+		    "parent %d, diroff %jd\n",
+		    refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
+		    refrec->jr_parent, refrec->jr_diroff);
+	/*
+	 * Lookup the ino and clear truncate if one is found.  Partial
+	 * truncates are always done synchronously so if we discover
+	 * an operation that requires a lock the truncation has completed
+	 * and can be discarded.
+	 */
+	sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1);
+	sino->si_trunc = NULL;
+	sino->si_hasrecs = 1;
+	srec = errmalloc(sizeof(*srec));
+	srec->sr_rec = rec;
+	TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next);
+}
+
+/*
+ * Add a reference adjustment to the sino list and eliminate dups.  The
+ * primary loop in ino_build_ref() checks for dups but new ones may be
+ * created as a result of offset adjustments.
+ */
+static void
+ino_add_ref(struct suj_ino *sino, struct suj_rec *srec)
+{
+	struct jrefrec *refrec;
+	struct suj_rec *srn;
+	struct jrefrec *rrn;
+
+	refrec = (struct jrefrec *)srec->sr_rec;
+	/*
+	 * We walk backwards so that the oldest link count is preserved.  If
+	 * an add record conflicts with a remove keep the remove.  Redundant
+	 * removes are eliminated in ino_build_ref.  Otherwise we keep the
+	 * oldest record at a given location.
+	 */
+	for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn;
+	    srn = TAILQ_PREV(srn, srechd, sr_next)) {
+		rrn = (struct jrefrec *)srn->sr_rec;
+		if (rrn->jr_parent != refrec->jr_parent ||
+		    rrn->jr_diroff != refrec->jr_diroff)
+			continue;
+		/* Existing remove, or incoming add: keep the old record. */
+		if (rrn->jr_op == JOP_REMREF || refrec->jr_op == JOP_ADDREF) {
+			rrn->jr_mode = refrec->jr_mode;
+			return;
+		}
+		/*
+		 * Adding a remove.
+		 *
+		 * Replace the record in place with the old nlink in case
+		 * we replace the head of the list.  Abandon srec as a dup.
+		 * NOTE(review): the abandoned suj_rec wrapper is never
+		 * freed — presumably acceptable in a one-shot fsck run.
+		 */
+		refrec->jr_nlink = rrn->jr_nlink;
+		srn->sr_rec = srec->sr_rec;
+		return;
+	}
+	TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next);
+}
+
+/*
+ * Create a duplicate of a reference at a previous location.
+ */
+static void
+ino_dup_ref(struct suj_ino *sino, struct jrefrec *refrec, off_t diroff)
+{
+	struct jrefrec *copy;
+	struct suj_rec *srec;
+
+	/* Clone the reference, forcing it to be an add at the old offset. */
+	copy = errmalloc(sizeof(*copy));
+	*copy = *refrec;
+	copy->jr_op = JOP_ADDREF;
+	copy->jr_diroff = diroff;
+	srec = errmalloc(sizeof(*srec));
+	srec->sr_rec = (union jrec *)copy;
+	ino_add_ref(sino, srec);
+}
+
+/*
+ * Add a reference to the list at all known locations.  We follow the offset
+ * changes for a single instance and create duplicate add refs at each so
+ * that we can tolerate any version of the directory block.  Eliminate
+ * removes which collide with adds that are seen in the journal.  They should
+ * not adjust the link count down.
+ */
+static void
+ino_build_ref(struct suj_ino *sino, struct suj_rec *srec)
+{
+	struct jrefrec *refrec;
+	struct jmvrec *mvrec;
+	struct suj_rec *srp;
+	struct suj_rec *srn;
+	struct jrefrec *rrn;
+	off_t diroff;
+
+	refrec = (struct jrefrec *)srec->sr_rec;
+	/*
+	 * Search for a mvrec that matches this offset.  Whether it's an add
+	 * or a remove we can delete the mvref after creating a dup record in
+	 * the old location.
+	 */
+	if (!TAILQ_EMPTY(&sino->si_movs)) {
+		diroff = refrec->jr_diroff;
+		/* Walk backwards so moves chain newest-to-oldest offset. */
+		for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; srn = srp) {
+			srp = TAILQ_PREV(srn, srechd, sr_next);
+			mvrec = (struct jmvrec *)srn->sr_rec;
+			if (mvrec->jm_parent != refrec->jr_parent ||
+			    mvrec->jm_newoff != diroff)
+				continue;
+			diroff = mvrec->jm_oldoff;
+			TAILQ_REMOVE(&sino->si_movs, srn, sr_next);
+			ino_dup_ref(sino, refrec, diroff);
+		}
+	}
+	/*
+	 * If a remove wasn't eliminated by an earlier add just append it to
+	 * the list.
+	 */
+	if (refrec->jr_op == JOP_REMREF) {
+		ino_add_ref(sino, srec);
+		return;
+	}
+	/*
+	 * Walk the list of records waiting to be added to the list.  We
+	 * must check for moves that apply to our current offset and remove
+	 * them from the list.  Remove any duplicates to eliminate removes
+	 * with corresponding adds.
+	 */
+	TAILQ_FOREACH_SAFE(srn, &sino->si_newrecs, sr_next, srp) {
+		switch (srn->sr_rec->rec_jrefrec.jr_op) {
+		case JOP_ADDREF:
+			/*
+			 * This should actually be an error we should
+			 * have a remove for every add journaled.
+			 */
+			rrn = (struct jrefrec *)srn->sr_rec;
+			if (rrn->jr_parent != refrec->jr_parent ||
+			    rrn->jr_diroff != refrec->jr_diroff)
+				break;
+			TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next);
+			break;
+		case JOP_REMREF:
+			/*
+			 * Once we remove the current iteration of the
+			 * record at this address we're done.
+			 */
+			rrn = (struct jrefrec *)srn->sr_rec;
+			if (rrn->jr_parent != refrec->jr_parent ||
+			    rrn->jr_diroff != refrec->jr_diroff)
+				break;
+			TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next);
+			ino_add_ref(sino, srec);
+			return;
+		case JOP_MVREF:
+			/*
+			 * Update our diroff based on any moves that match
+			 * and remove the move.  A dup is left at the old
+			 * offset before the record is retargeted.
+			 */
+			mvrec = (struct jmvrec *)srn->sr_rec;
+			if (mvrec->jm_parent != refrec->jr_parent ||
+			    mvrec->jm_oldoff != refrec->jr_diroff)
+				break;
+			ino_dup_ref(sino, refrec, mvrec->jm_oldoff);
+			refrec->jr_diroff = mvrec->jm_newoff;
+			TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next);
+			break;
+		default:
+			errx(1, "ino_build_ref: Unknown op %d",
+			    srn->sr_rec->rec_jrefrec.jr_op);
+		}
+	}
+	ino_add_ref(sino, srec);
+}
+
+/*
+ * Walk the list of new records and add them in-order resolving any
+ * dups and adjusted offsets.
+ *
+ * Drains si_newrecs: refs go through ino_build_ref() into si_recs,
+ * moves are parked on si_movs until a matching ref consumes them.
+ */
+static void
+ino_build(struct suj_ino *sino)
+{
+	struct suj_rec *srec;
+
+	while ((srec = TAILQ_FIRST(&sino->si_newrecs)) != NULL) {
+		TAILQ_REMOVE(&sino->si_newrecs, srec, sr_next);
+		switch (srec->sr_rec->rec_jrefrec.jr_op) {
+		case JOP_ADDREF:
+		case JOP_REMREF:
+			ino_build_ref(sino, srec);
+			break;
+		case JOP_MVREF:
+			/*
+			 * Add this mvrec to the queue of pending mvs.
+			 */
+			TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next);
+			break;
+		default:
+			errx(1, "ino_build: Unknown op %d",
+			    srec->sr_rec->rec_jrefrec.jr_op);
+		}
+	}
+	/* If everything cancelled out there is nothing left to check. */
+	if (TAILQ_EMPTY(&sino->si_recs))
+		sino->si_hasrecs = 0;
+}
+
+/*
+ * Modify journal records so they refer to the base block number
+ * and a start and end frag range.  This is to facilitate the discovery
+ * of overlapping fragment allocations.
+ */
+static void
+blk_build(struct jblkrec *blkrec)
+{
+	struct suj_rec *srec;
+	struct suj_blk *sblk;
+	struct jblkrec *blkrn;
+	struct suj_ino *sino;
+	ufs2_daddr_t blk;
+	off_t foff;
+	int frag;
+
+	if (debug)
+		printf("blk_build: op %d blkno %jd frags %d oldfrags %d "
+		    "ino %d lbn %jd\n",
+		    blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags,
+		    blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn);
+
+	/*
+	 * Look up the inode and clear the truncate if any lbns after the
+	 * truncate lbn are freed or allocated.
+	 */
+	sino = ino_lookup(blkrec->jb_ino, 0);
+	if (sino && sino->si_trunc) {
+		foff = lblktosize(fs, blkrec->jb_lbn);
+		foff += lfragtosize(fs, blkrec->jb_frags);
+		if (foff > sino->si_trunc->jt_size)
+			sino->si_trunc = NULL;
+	}
+	/* Split the address into base block and fragment offset. */
+	blk = blknum(fs, blkrec->jb_blkno);
+	frag = fragnum(fs, blkrec->jb_blkno);
+	sblk = blk_lookup(blk, 1);
+	/*
+	 * Rewrite the record using oldfrags to indicate the offset into
+	 * the block.  Leave jb_frags as the actual allocated count.
+	 */
+	blkrec->jb_blkno -= frag;
+	blkrec->jb_oldfrags = frag;
+	if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag)
+		errx(1, "Invalid fragment count %d oldfrags %d",
+		    blkrec->jb_frags, frag);
+	/*
+	 * Detect dups.  If we detect a dup we always discard the oldest
+	 * record as it is superseded by the new record.  This speeds up
+	 * later stages but also eliminates free records which are used
+	 * to indicate that the contents of indirects can be trusted.
+	 */
+	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
+		blkrn = (struct jblkrec *)srec->sr_rec;
+		if (blkrn->jb_ino != blkrec->jb_ino ||
+		    blkrn->jb_lbn != blkrec->jb_lbn ||
+		    blkrn->jb_blkno != blkrec->jb_blkno ||
+		    blkrn->jb_frags != blkrec->jb_frags ||
+		    blkrn->jb_oldfrags != blkrec->jb_oldfrags)
+			continue;
+		if (debug)
+			printf("Removed dup.\n");
+		/* Discard the free which is a dup with an alloc. */
+		if (blkrec->jb_op == JOP_FREEBLK)
+			return;
+		TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next);
+		free(srec);
+		break;
+	}
+	/* The record points into the journal segment buffer; no copy. */
+	srec = errmalloc(sizeof(*srec));
+	srec->sr_rec = (union jrec *)blkrec;
+	TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next);
+}
+
+/*
+ * Remember the most recent truncation record seen for an inode.
+ */
+static void
+ino_build_trunc(struct jtrncrec *rec)
+{
+	struct suj_ino *sino;
+
+	if (debug)
+		printf("ino_build_trunc: ino %d, size %jd\n",
+		    rec->jt_ino, rec->jt_size);
+	sino = ino_lookup(rec->jt_ino, 1);
+	/* Later records simply overwrite earlier pending truncations. */
+	sino->si_trunc = rec;
+}
+
+/*
+ * Build up tables of the operations we need to recover.
+ */
+static void
+suj_build(void)
+{
+	struct suj_seg *seg;
+	union jrec *rec;
+	int off;
+	int i;
+
+	TAILQ_FOREACH(seg, &allsegs, ss_next) {
+		if (debug)
+			printf("seg %jd has %d records, oldseq %jd.\n",
+			    seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt,
+			    seg->ss_rec.jsr_oldest);
+		off = 0;
+		rec = (union jrec *)seg->ss_blk;
+		/*
+		 * Note: i is advanced at the bottom of the body, not in
+		 * the for clause, so that the segrec slot at the head of
+		 * each DEV_BSIZE sector is skipped without being counted
+		 * against jsr_cnt.
+		 */
+		for (i = 0; i < seg->ss_rec.jsr_cnt; off += JREC_SIZE, rec++) {
+			/* skip the segrec. */
+			if ((off % DEV_BSIZE) == 0)
+				continue;
+			switch (rec->rec_jrefrec.jr_op) {
+			case JOP_ADDREF:
+			case JOP_REMREF:
+			case JOP_MVREF:
+				ino_append(rec);
+				break;
+			case JOP_NEWBLK:
+			case JOP_FREEBLK:
+				blk_build((struct jblkrec *)rec);
+				break;
+			case JOP_TRUNC:
+				ino_build_trunc((struct jtrncrec *)rec);
+				break;
+			default:
+				errx(1, "Unknown journal operation %d (%d)",
+				    rec->rec_jrefrec.jr_op, off);
+			}
+			i++;
+		}
+	}
+}
+
+/*
+ * Prune the journal segments to those we care about based on the
+ * oldest sequence in the newest segment.  Order the segment list
+ * based on sequence number.
+ */
+static void
+suj_prune(void)
+{
+	struct suj_seg *seg;
+	struct suj_seg *segn;
+	uint64_t newseq;
+	int discard;
+
+	if (debug)
+		printf("Pruning up to %jd\n", oldseq);
+	/* First free the expired segments. */
+	TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) {
+		if (seg->ss_rec.jsr_seq >= oldseq)
+			continue;
+		TAILQ_REMOVE(&allsegs, seg, ss_next);
+		free(seg->ss_blk);
+		free(seg);
+	}
+	/* Next ensure that segments are ordered properly. */
+	seg = TAILQ_FIRST(&allsegs);
+	if (seg == NULL) {
+		if (debug)
+			printf("Empty journal\n");
+		return;
+	}
+	/*
+	 * The journal is circular, so the sequence may wrap around the
+	 * buffer.  Rotate out-of-order tail segments to the head until
+	 * the last segment's sequence is >= the running minimum.
+	 */
+	newseq = seg->ss_rec.jsr_seq;
+	for (;;) {
+		seg = TAILQ_LAST(&allsegs, seghd);
+		if (seg->ss_rec.jsr_seq >= newseq)
+			break;
+		TAILQ_REMOVE(&allsegs, seg, ss_next);
+		TAILQ_INSERT_HEAD(&allsegs, seg, ss_next);
+		newseq = seg->ss_rec.jsr_seq;
+
+	}
+	if (newseq != oldseq)
+		errx(1, "Journal file sequence mismatch %jd != %jd",
+		    newseq, oldseq);
+	/*
+	 * The kernel may asynchronously write segments which can create
+	 * gaps in the sequence space.  Throw away any segments after the
+	 * gap as the kernel guarantees only those that are contiguously
+	 * reachable are marked as completed.
+	 */
+	discard = 0;
+	TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) {
+		if (!discard && newseq++ == seg->ss_rec.jsr_seq) {
+			jrecs += seg->ss_rec.jsr_cnt;
+			jbytes += seg->ss_rec.jsr_blocks * DEV_BSIZE;
+			continue;
+		}
+		discard = 1;
+		if (debug)
+			printf("Journal order mismatch %jd != %jd pruning\n",
+			    newseq-1, seg->ss_rec.jsr_seq);
+		TAILQ_REMOVE(&allsegs, seg, ss_next);
+		free(seg->ss_blk);
+		free(seg);
+	}
+	if (debug)
+		printf("Processing journal segments from %jd to %jd\n",
+		    oldseq, newseq-1);
+}
+
+/*
+ * Verify the journal inode before attempting to read records.
+ *
+ * Returns 0 if the inode looks like a valid SU+J journal, -1 otherwise
+ * (a message describing the first failed check is printed).
+ */
+static int
+suj_verifyino(union dinode *ip)
+{
+
+	/* The journal must be referenced only by its directory entry. */
+	if (DIP(ip, di_nlink) != 1) {
+		printf("Invalid link count %d for journal inode %d\n",
+		    DIP(ip, di_nlink), sujino);
+		return (-1);
+	}
+
+	/* tunefs creates the journal immutable and non-unlinkable. */
+	if ((DIP(ip, di_flags) & (SF_IMMUTABLE | SF_NOUNLINK)) !=
+	    (SF_IMMUTABLE | SF_NOUNLINK)) {
+		printf("Invalid flags 0x%X for journal inode %d\n",
+		    DIP(ip, di_flags), sujino);
+		return (-1);
+	}
+
+	if (DIP(ip, di_mode) != (IFREG | IREAD)) {
+		printf("Invalid mode %o for journal inode %d\n",
+		    DIP(ip, di_mode), sujino);
+		return (-1);
+	}
+
+	if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) {
+		printf("Invalid size %jd for journal inode %d\n",
+		    DIP(ip, di_size), sujino);
+		return (-1);
+	}
+
+	/* di_modrev ties the journal contents to the last mount. */
+	if (DIP(ip, di_modrev) != fs->fs_mtime) {
+		printf("Journal timestamp does not match fs mount time\n");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * The journal's physical location is kept as a sorted-by-insertion list
+ * of disk extents so its contents can be read sequentially without
+ * consulting the journal inode again.
+ */
+struct jblocks {
+	struct jextent *jb_extent;	/* Extent array. */
+	int	jb_avail;	/* Available extents. */
+	int	jb_used;	/* Last used extent. */
+	int	jb_head;	/* Allocator head. */
+	int	jb_off;		/* Allocator extent offset. */
+};
+struct jextent {
+	ufs2_daddr_t je_daddr;	/* Disk block address. */
+	int	je_blocks;	/* Disk block count. */
+};
+
+/* Extent map of the journal built by suj_add_block(). */
+struct jblocks *suj_jblocks;
+
+/*
+ * Allocate and initialize an empty extent map.
+ */
+static struct jblocks *
+jblocks_create(void)
+{
+	struct jblocks *jblocks;
+	int bytes;
+
+	/* Start with room for ten extents; jblocks_add() grows the array. */
+	jblocks = errmalloc(sizeof(*jblocks));
+	jblocks->jb_head = 0;
+	jblocks->jb_off = 0;
+	jblocks->jb_used = 0;
+	jblocks->jb_avail = 10;
+	bytes = sizeof(struct jextent) * jblocks->jb_avail;
+	jblocks->jb_extent = errmalloc(bytes);
+	bzero(jblocks->jb_extent, bytes);
+
+	return (jblocks);
+}
+
+/*
+ * Return the next available disk block and the amount of contiguous
+ * free space it contains.
+ *
+ * Returns 0 when the extent map is exhausted; in that case *actual is
+ * left untouched, so callers must check the return value first.
+ */
+static ufs2_daddr_t
+jblocks_next(struct jblocks *jblocks, int bytes, int *actual)
+{
+	struct jextent *jext;
+	ufs2_daddr_t daddr;
+	int freecnt;
+	int blocks;
+
+	blocks = bytes / DEV_BSIZE;
+	jext = &jblocks->jb_extent[jblocks->jb_head];
+	freecnt = jext->je_blocks - jblocks->jb_off;
+	/* Current extent consumed: move to the next one, if any. */
+	if (freecnt == 0) {
+		jblocks->jb_off = 0;
+		if (++jblocks->jb_head > jblocks->jb_used)
+			return (0);
+		jext = &jblocks->jb_extent[jblocks->jb_head];
+		freecnt = jext->je_blocks;
+	}
+	/* Cap the answer at the caller's request. */
+	if (freecnt > blocks)
+		freecnt = blocks;
+	*actual = freecnt * DEV_BSIZE;
+	daddr = jext->je_daddr + jblocks->jb_off;
+
+	return (daddr);
+}
+
+/*
+ * Advance the allocation head by a specified number of bytes, consuming
+ * one journal segment.
+ */
+static void
+jblocks_advance(struct jblocks *jblocks, int bytes)
+{
+
+	/* Consumption stays within the current extent. */
+	jblocks->jb_off += bytes / DEV_BSIZE;
+}
+
+/*
+ * Release an extent map and its backing array.
+ */
+static void
+jblocks_destroy(struct jblocks *jblocks)
+{
+
+	free(jblocks->jb_extent);
+	free(jblocks);
+}
+
+/*
+ * Append a run of disk blocks to the extent map, coalescing with the
+ * last extent when the run is physically contiguous.  The array doubles
+ * when it fills.
+ */
+static void
+jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks)
+{
+	struct jextent *jext;
+	int size;
+
+	jext = &jblocks->jb_extent[jblocks->jb_used];
+	/* Adding the first block. */
+	if (jext->je_daddr == 0) {
+		jext->je_daddr = daddr;
+		jext->je_blocks = blocks;
+		return;
+	}
+	/* Extending the last extent. */
+	if (jext->je_daddr + jext->je_blocks == daddr) {
+		jext->je_blocks += blocks;
+		return;
+	}
+	/* Adding a new extent. */
+	if (++jblocks->jb_used == jblocks->jb_avail) {
+		jblocks->jb_avail *= 2;
+		size = sizeof(struct jextent) * jblocks->jb_avail;
+		jext = errmalloc(size);
+		bzero(jext, size);
+		bcopy(jblocks->jb_extent, jext,
+		    sizeof(struct jextent) * jblocks->jb_used);
+		free(jblocks->jb_extent);
+		jblocks->jb_extent = jext;
+	}
+	jext = &jblocks->jb_extent[jblocks->jb_used];
+	jext->je_daddr = daddr;
+	jext->je_blocks = blocks;
+
+	return;
+}
+
+/*
+ * Add a file block from the journal to the extent map.  We can't read
+ * each file block individually because the kernel treats it as a circular
+ * buffer and segments may span mutliple contiguous blocks.
+ *
+ * ino and lbn are unused; the signature matches the ino_visitor type
+ * required by ino_visit().
+ */
+static void
+suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+
+	jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags));
+}
+
+/*
+ * Scan the journal extents for valid segment records and queue them,
+ * in disk order, on the allsegs list.
+ */
+static void
+suj_read(void)
+{
+	/*
+	 * NOTE(review): 1MB of automatic storage — presumably fine for a
+	 * standalone fsck process, but worth confirming against the
+	 * platform's default stack limits.
+	 */
+	uint8_t block[1 * 1024 * 1024];
+	struct suj_seg *seg;
+	struct jsegrec *recn;
+	struct jsegrec *rec;
+	ufs2_daddr_t blk;
+	int readsize;
+	int blocks;
+	int recsize;
+	int size;
+	int i;
+
+	/*
+	 * Read records until we exhaust the journal space.  If we find
+	 * an invalid record we start searching for a valid segment header
+	 * at the next block.  This is because we don't have a head/tail
+	 * pointer and must recover the information indirectly.  At the gap
+	 * between the head and tail we won't necessarily have a valid
+	 * segment.
+	 */
+restart:
+	for (;;) {
+		size = sizeof(block);
+		blk = jblocks_next(suj_jblocks, size, &readsize);
+		if (blk == 0)
+			return;
+		size = readsize;
+		/*
+		 * Read 1MB at a time and scan for records within this block.
+		 */
+		if (bread(disk, blk, &block, size) == -1)
+			err(1, "Error reading journal block %jd",
+			    (intmax_t)blk);
+		for (rec = (void *)block; size; size -= recsize,
+		    rec = (struct jsegrec *)((uintptr_t)rec + recsize)) {
+			recsize = DEV_BSIZE;
+			/* Stale timestamp: not written during last mount. */
+			if (rec->jsr_time != fs->fs_mtime) {
+				if (debug)
+					printf("Rec time %jd != fs mtime %jd\n",
+					    rec->jsr_time, fs->fs_mtime);
+				jblocks_advance(suj_jblocks, recsize);
+				continue;
+			}
+			if (rec->jsr_cnt == 0) {
+				if (debug)
+					printf("Found illegal count %d\n",
+					    rec->jsr_cnt);
+				jblocks_advance(suj_jblocks, recsize);
+				continue;
+			}
+			blocks = rec->jsr_blocks;
+			recsize = blocks * DEV_BSIZE;
+			if (recsize > size) {
+				/*
+				 * We may just have run out of buffer, restart
+				 * the loop to re-read from this spot.
+				 */
+				if (size < fs->fs_bsize &&
+				    size != readsize &&
+				    recsize <= fs->fs_bsize)
+					goto restart;
+				if (debug)
+					printf("Found invalid segsize %d > %d\n",
+					    recsize, size);
+				recsize = DEV_BSIZE;
+				jblocks_advance(suj_jblocks, recsize);
+				continue;
+			}
+			/*
+			 * Verify that all blocks in the segment are present.
+			 */
+			for (i = 1; i < blocks; i++) {
+				recn = (void *)
+				    ((uintptr_t)rec) + i * DEV_BSIZE;
+				if (recn->jsr_seq == rec->jsr_seq &&
+				    recn->jsr_time == rec->jsr_time)
+					continue;
+				if (debug)
+					printf("Incomplete record %jd (%d)\n",
+					    rec->jsr_seq, i);
+				recsize = i * DEV_BSIZE;
+				jblocks_advance(suj_jblocks, recsize);
+				goto restart;
+			}
+			/* Copy the whole segment; records point into it. */
+			seg = errmalloc(sizeof(*seg));
+			seg->ss_blk = errmalloc(recsize);
+			seg->ss_rec = *rec;
+			bcopy((void *)rec, seg->ss_blk, recsize);
+			if (rec->jsr_oldest > oldseq)
+				oldseq = rec->jsr_oldest;
+			TAILQ_INSERT_TAIL(&allsegs, seg, ss_next);
+			jblocks_advance(suj_jblocks, recsize);
+		}
+	}
+}
+
+/*
+ * Search a directory block for the SUJ_FILE.
+ *
+ * ino_visit() callback over ROOTINO's data blocks; sets the global
+ * sujino on the first match and becomes a no-op thereafter.
+ */
+static void
+suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+	char block[MAXBSIZE];
+	struct direct *dp;
+	int bytes;
+	int off;
+
+	/* Already found on an earlier block. */
+	if (sujino)
+		return;
+	bytes = lfragtosize(fs, frags);
+	if (bread(disk, fsbtodb(fs, blk), block, bytes) <= 0)
+		err(1, "Failed to read ROOTINO directory block %jd", blk);
+	for (off = 0; off < bytes; off += dp->d_reclen) {
+		dp = (struct direct *)&block[off];
+		/* A zero reclen would loop forever; treat as end. */
+		if (dp->d_reclen == 0)
+			break;
+		if (dp->d_ino == 0)
+			continue;
+		if (dp->d_namlen != strlen(SUJ_FILE))
+			continue;
+		if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
+			continue;
+		sujino = dp->d_ino;
+		return;
+	}
+}
+
+/*
+ * Orchestrate the verification of a filesystem via the softupdates journal.
+ *
+ * Returns 0 on success (including a user-declined write) and -1 when the
+ * journal inode fails verification.
+ */
+int
+suj_check(const char *filesys)
+{
+	union dinode *jip;
+	union dinode *ip;
+	uint64_t blocks;
+
+	opendisk(filesys);
+	TAILQ_INIT(&allsegs);
+	/*
+	 * Find the journal inode.
+	 */
+	ip = ino_read(ROOTINO);
+	sujino = 0;
+	ino_visit(ip, ROOTINO, suj_find, 0);
+	if (sujino == 0)
+		errx(1, "Journal inode removed.  Use tunefs to re-create.");
+	/*
+	 * Fetch the journal inode and verify it.
+	 */
+	jip = ino_read(sujino);
+	printf("** SU+J Recovering %s\n", filesys);
+	if (suj_verifyino(jip) != 0)
+		return (-1);
+	/*
+	 * Build a list of journal blocks in jblocks before parsing the
+	 * available journal blocks in with suj_read().
+	 */
+	printf("** Reading %jd byte journal from inode %d.\n",
+	    DIP(jip, di_size), sujino);
+	suj_jblocks = jblocks_create();
+	blocks = ino_visit(jip, sujino, suj_add_block, 0);
+	/* The journal must be fully allocated to be read linearly. */
+	if (blocks != numfrags(fs, DIP(jip, di_size)))
+		errx(1, "Sparse journal inode %d.\n", sujino);
+	suj_read();
+	jblocks_destroy(suj_jblocks);
+	suj_jblocks = NULL;
+	if (preen || reply("RECOVER")) {
+		printf("** Building recovery table.\n");
+		suj_prune();
+		suj_build();
+		cg_apply(cg_build);
+		printf("** Resolving unreferenced inode list.\n");
+		ino_unlinked();
+		printf("** Processing journal entries.\n");
+		cg_apply(cg_trunc);
+		cg_apply(cg_check_blk);
+		cg_apply(cg_check_ino);
+	}
+	if (preen == 0 && reply("WRITE CHANGES") == 0)
+		return (0);
+	/*
+	 * To remain idempotent with partial truncations the free bitmaps
+	 * must be written followed by indirect blocks and lastly inode
+	 * blocks.  This preserves access to the modified pointers until
+	 * they are freed.
+	 */
+	cg_apply(cg_write);
+	dblk_write();
+	cg_apply(cg_write_inos);
+	/* Write back superblock. */
+	closedisk(filesys);
+	printf("** %jd journal records in %jd bytes for %.2f%% utilization\n",
+	    jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100);
+	printf("** Freed %jd inodes (%jd dirs) %jd blocks, and %jd frags.\n",
+	    freeinos, freedir, freeblocks, freefrags);
+
+	return (0);
+}
+/*-
+ * Copyright (c) 2009 Jeffrey W. Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ffs/fs.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <libufs.h>
+#include <strings.h>
+#include <err.h>
+#include <assert.h>
+
+#include "fsck.h"
+
+static void	ino_decr(ino_t);
+
+/*
+ * All hashes below are multiplicative (Knuth's 2654435761) masked down
+ * to SUJ_HASHSIZE buckets.
+ */
+#define	SUJ_HASHSIZE	128
+#define	SUJ_HASHMASK	(SUJ_HASHSIZE - 1)
+#define	SUJ_HASH(x)	((x * 2654435761) & SUJ_HASHMASK)
+
+/* One journal segment: its header record plus a copy of its raw blocks. */
+struct suj_seg {
+	TAILQ_ENTRY(suj_seg) ss_next;
+	struct jsegrec	ss_rec;
+	uint8_t		*ss_blk;
+};
+
+/* Generic wrapper threading a journal record onto a list. */
+struct suj_rec {
+	TAILQ_ENTRY(suj_rec) sr_next;
+	union jrec	*sr_rec;
+};
+TAILQ_HEAD(srechd, suj_rec);
+
+/* Per-inode state: resolved records, pending moves, and adjust flags. */
+struct suj_ino {
+	LIST_ENTRY(suj_ino)	si_next;
+	struct srechd		si_recs;
+	struct srechd		si_movs;
+	ino_t			si_ino;
+	int			si_nlinkadj;
+	int			si_skipparent;
+	int			si_linkadj;
+	int			si_hasrecs;
+	int			si_blkadj;
+};
+LIST_HEAD(inohd, suj_ino);
+
+/* Per-block record list keyed by the fragment-aligned base address. */
+struct suj_blk {
+	LIST_ENTRY(suj_blk)	sb_next;
+	struct srechd		sb_recs;
+	ufs2_daddr_t		sb_blk;
+};
+LIST_HEAD(blkhd, suj_blk);
+
+/* Cached copy of an arbitrary data/indirect block. */
+struct data_blk {
+	LIST_ENTRY(data_blk)	db_next;
+	uint8_t			*db_buf;
+	ufs2_daddr_t		db_blk;
+	int			db_size;
+};
+
+/* Cached copy of an inode block, written back only if dirtied. */
+struct ino_blk {
+	LIST_ENTRY(ino_blk)	ib_next;
+	uint8_t			*ib_buf;
+	int			ib_dirty;
+	ufs2_daddr_t		ib_blk;
+};
+LIST_HEAD(iblkhd, ino_blk);
+
+/* Per-cylinder-group caches of blocks, inodes, and the cg itself. */
+struct suj_cg {
+	LIST_ENTRY(suj_cg)	sc_next;
+	struct blkhd		sc_blkhash[SUJ_HASHSIZE];
+	struct inohd		sc_inohash[SUJ_HASHSIZE];
+	struct iblkhd		sc_iblkhash[SUJ_HASHSIZE];
+	struct ino_blk		*sc_lastiblk;
+	uint8_t			*sc_cgbuf;
+	struct cg		*sc_cgp;
+	int			sc_dirty;
+	int			sc_cgx;
+};
+
+LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE];
+LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE];
+
+TAILQ_HEAD(seghd, suj_seg) allsegs;
+uint64_t oldseq;
+static struct uufsd *disk = NULL;
+static struct fs *fs = NULL;
+
+/*
+ * Summary statistics.
+ */
+uint64_t freefrags;
+uint64_t freeblocks;
+uint64_t freeinos;
+uint64_t freedir;
+uint64_t jbytes;
+uint64_t jrecs;
+
+/* Callback signature for ino_visit(): (ino, lbn, blk, frags). */
+typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int);
+
+/*
+ * malloc() that exits with a diagnostic instead of returning NULL.
+ */
+static void *
+errmalloc(size_t n)
+{
+	void *p;
+
+	if ((p = malloc(n)) == NULL)
+		errx(1, "malloc(%zu)", n);
+	return (p);
+}
+
+/*
+ * Open the given provider, load superblock.
+ *
+ * Idempotent: a second call with the disk already open is a no-op.
+ */
+static void
+opendisk(const char *devnam)
+{
+	if (disk != NULL)
+		return;
+	disk = malloc(sizeof(*disk));
+	if (disk == NULL)
+		errx(1, "malloc(%zu)", sizeof(*disk));
+	/*
+	 * NOTE(review): err() appends strerror(errno) in addition to the
+	 * libufs d_error string — confirm errno is meaningful here.
+	 */
+	if (ufs_disk_fillout(disk, devnam) == -1) {
+		err(1, "ufs_disk_fillout(%s) failed: %s", devnam,
+		    disk->d_error);
+	}
+	fs = &disk->d_fs;
+	/*
+	 * Setup a few things so reply() can work.
+	 */
+	bcopy(fs, &sblock, sizeof(sblock));
+	fsreadfd = disk->d_fd;
+	fswritefd = disk->d_fd;
+}
+
+/*
+ * Mark file system as clean, write the super-block back, close the disk.
+ */
+static void
+closedisk(const char *devnam)
+{
+ struct csum *cgsum;
+ int i;
+
+ /*
+ * Recompute the fs summary info from correct cs summaries.
+ */
+ bzero(&fs->fs_cstotal, sizeof(struct csum_total));
+ for (i = 0; i < fs->fs_ncg; i++) {
+ cgsum = &fs->fs_cs(fs, i);
+ fs->fs_cstotal.cs_nffree += cgsum->cs_nffree;
+ fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree;
+ fs->fs_cstotal.cs_nifree += cgsum->cs_nifree;
+ fs->fs_cstotal.cs_ndir += cgsum->cs_ndir;
+ }
+ /* XXX Don't set clean for now, we don't trust the journal. */
+ /* fs->fs_clean = 1; */
+ fs->fs_time = time(NULL);
+ fs->fs_mtime = time(NULL);
+ if (sbwrite(disk, 0) == -1)
+ err(1, "sbwrite(%s)", devnam);
+ if (ufs_disk_close(disk) == -1)
+ err(1, "ufs_disk_close(%s)", devnam);
+ free(disk);
+ disk = NULL;
+ fs = NULL;
+ fsreadfd = -1;
+ fswritefd = -1;
+}
+
+/*
+ * Lookup a cg by number in the hash so we can keep track of which cgs
+ * need stats rebuilt.  On a miss the cg block is read from disk and
+ * cached for the remainder of the run.  A bad cg number or read
+ * failure is fatal.
+ */
+static struct suj_cg *
+cg_lookup(int cgx)
+{
+	struct cghd *hd;
+	struct suj_cg *sc;
+
+	if (cgx < 0 || cgx >= fs->fs_ncg)
+		errx(1, "Bad cg number %d", cgx);
+	hd = &cghash[SUJ_HASH(cgx)];
+	LIST_FOREACH(sc, hd, sc_next)
+		if (sc->sc_cgx == cgx)
+			return (sc);
+	/* Not cached yet; allocate, link into the hash, and read it. */
+	sc = errmalloc(sizeof(*sc));
+	bzero(sc, sizeof(*sc));
+	sc->sc_cgbuf = errmalloc(fs->fs_bsize);
+	sc->sc_cgp = (struct cg *)sc->sc_cgbuf;
+	sc->sc_cgx = cgx;
+	LIST_INSERT_HEAD(hd, sc, sc_next);
+	if (bread(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf,
+	    fs->fs_bsize) == -1)
+		err(1, "Unable to read cylinder group %d", sc->sc_cgx);
+
+	return (sc);
+}
+
+/*
+ * Lookup an inode number in the hash and allocate a suj_ino if it does
+ * not exist.
+ *
+ * If 'creat' is zero, return NULL on a miss instead of allocating.
+ */
+static struct suj_ino *
+ino_lookup(ino_t ino, int creat)
+{
+	struct suj_ino *sino;
+	struct inohd *hd;
+	struct suj_cg *sc;
+
+	/* The hash lives in the cg owning the inode. */
+	sc = cg_lookup(ino_to_cg(fs, ino));
+	hd = &sc->sc_inohash[SUJ_HASH(ino)];
+	LIST_FOREACH(sino, hd, si_next)
+		if (sino->si_ino == ino)
+			return (sino);
+	if (creat == 0)
+		return (NULL);
+	sino = errmalloc(sizeof(*sino));
+	bzero(sino, sizeof(*sino));
+	sino->si_ino = ino;
+	sino->si_nlinkadj = 0;
+	TAILQ_INIT(&sino->si_recs);
+	TAILQ_INIT(&sino->si_movs);
+	LIST_INSERT_HEAD(hd, sino, si_next);
+
+	return (sino);
+}
+
+/*
+ * Lookup a block number in the hash and allocate a suj_blk if it does
+ * not exist.
+ *
+ * If 'creat' is zero, return NULL on a miss instead of allocating.
+ */
+static struct suj_blk *
+blk_lookup(ufs2_daddr_t blk, int creat)
+{
+	struct suj_blk *sblk;
+	struct suj_cg *sc;
+	struct blkhd *hd;
+
+	/* The hash lives in the cg owning the block. */
+	sc = cg_lookup(dtog(fs, blk));
+	hd = &sc->sc_blkhash[SUJ_HASH(blk)];
+	LIST_FOREACH(sblk, hd, sb_next)
+		if (sblk->sb_blk == blk)
+			return (sblk);
+	if (creat == 0)
+		return (NULL);
+	sblk = errmalloc(sizeof(*sblk));
+	bzero(sblk, sizeof(*sblk));
+	sblk->sb_blk = blk;
+	TAILQ_INIT(&sblk->sb_recs);
+	LIST_INSERT_HEAD(hd, sblk, sb_next);
+
+	return (sblk);
+}
+
+/*
+ * Return a cached copy of the data block at 'blk', reading it from
+ * disk on first use.  The cache is keyed on block number only; the
+ * buffer is re-read if a caller asks for a different size.
+ */
+static uint8_t *
+dblk_read(ufs2_daddr_t blk, int size)
+{
+	struct data_blk *dblk;
+	struct dblkhd *hd;
+
+	hd = &dbhash[SUJ_HASH(blk)];
+	LIST_FOREACH(dblk, hd, db_next)
+		if (dblk->db_blk == blk)
+			goto found;
+	/*
+	 * The data block wasn't located, allocate a new one.
+	 */
+	dblk = errmalloc(sizeof(*dblk));
+	bzero(dblk, sizeof(*dblk));
+	LIST_INSERT_HEAD(hd, dblk, db_next);
+	dblk->db_blk = blk;
+found:
+	/*
+	 * I doubt size mismatches can happen in practice but it is trivial
+	 * to handle.
+	 */
+	if (size != dblk->db_size) {
+		if (dblk->db_buf)
+			free(dblk->db_buf);
+		dblk->db_buf = errmalloc(size);
+		dblk->db_size = size;
+		if (bread(disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1)
+			err(1, "Failed to read data block %jd", blk);
+	}
+	return (dblk->db_buf);
+}
+
+/*
+ * Return a pointer to the on-disk inode 'ino' within its cached inode
+ * block, reading the block on first use.  The returned pointer aliases
+ * the cache; callers must mark changes via ino_dirty().
+ */
+static union dinode *
+ino_read(ino_t ino)
+{
+	struct ino_blk *iblk;
+	struct iblkhd *hd;
+	struct suj_cg *sc;
+	ufs2_daddr_t blk;
+	int off;
+
+	blk = ino_to_fsba(fs, ino);
+	sc = cg_lookup(ino_to_cg(fs, ino));
+	hd = &sc->sc_iblkhash[SUJ_HASH(blk)];
+	LIST_FOREACH(iblk, hd, ib_next)
+		if (iblk->ib_blk == blk)
+			goto found;
+	/*
+	 * The inode block wasn't located, allocate a new one.
+	 */
+	iblk = errmalloc(sizeof(*iblk));
+	bzero(iblk, sizeof(*iblk));
+	iblk->ib_buf = errmalloc(fs->fs_bsize);
+	iblk->ib_blk = blk;
+	LIST_INSERT_HEAD(hd, iblk, ib_next);
+	if (bread(disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1)
+		err(1, "Failed to read inode block %jd", blk);
+found:
+	/* Remember the hit so ino_dirty() can avoid a hash walk. */
+	sc->sc_lastiblk = iblk;
+	off = ino_to_fsbo(fs, ino);
+	if (fs->fs_magic == FS_UFS1_MAGIC)
+		return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off];
+	else
+		return (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off];
+}
+
+/*
+ * Mark the inode block containing 'ino' dirty so iblk_write() flushes
+ * it.  If the block is not yet cached it is faulted in via ino_read()
+ * and the function recurses once to mark it.
+ */
+static void
+ino_dirty(ino_t ino)
+{
+	struct ino_blk *iblk;
+	struct iblkhd *hd;
+	struct suj_cg *sc;
+	ufs2_daddr_t blk;
+
+	blk = ino_to_fsba(fs, ino);
+	sc = cg_lookup(ino_to_cg(fs, ino));
+	/* Fast path: the block ino_read() touched last. */
+	iblk = sc->sc_lastiblk;
+	if (iblk && iblk->ib_blk == blk) {
+		iblk->ib_dirty = 1;
+		return;
+	}
+	hd = &sc->sc_iblkhash[SUJ_HASH(blk)];
+	LIST_FOREACH(iblk, hd, ib_next) {
+		if (iblk->ib_blk == blk) {
+			iblk->ib_dirty = 1;
+			return;
+		}
+	}
+	/* Not cached; read it in and retry (recursion terminates). */
+	ino_read(ino);
+	ino_dirty(ino);
+}
+
+/* Flush one cached inode block if it has been modified. */
+static void
+iblk_write(struct ino_blk *iblk)
+{
+
+	if (iblk->ib_dirty == 0)
+		return;
+	if (bwrite(disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf,
+	    fs->fs_bsize) == -1)
+		err(1, "Failed to write inode block %jd", iblk->ib_blk);
+}
+
+/*
+ * Return 1 if the inode was free and 0 if it is allocated.
+ */
+static int
+ino_isfree(ino_t ino)
+{
+	struct suj_cg *sc;
+	uint8_t *inosused;
+	struct cg *cgp;
+	int cg;
+
+	cg = ino_to_cg(fs, ino);
+	ino = ino % fs->fs_ipg;		/* Bitmap index within the cg. */
+	sc = cg_lookup(cg);
+	cgp = sc->sc_cgp;
+	inosused = cg_inosused(cgp);
+	return isclr(inosused, ino);
+}
+
+/*
+ * Return 1 if the journal block record overlaps the fragment run
+ * [start, start + frags), 0 otherwise.
+ */
+static int
+blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags)
+{
+	ufs2_daddr_t bstart;
+	ufs2_daddr_t bend;
+	ufs2_daddr_t end;
+
+	end = start + frags;
+	bstart = brec->jb_blkno + brec->jb_oldfrags;
+	bend = bstart + brec->jb_frags;
+	if (start < bend && end > bstart)
+		return (1);
+	return (0);
+}
+
+/*
+ * Return 1 if the record describes exactly this (ino, lbn, start,
+ * frags) allocation, 0 otherwise.
+ */
+static int
+blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start,
+    int frags)
+{
+
+	if (brec->jb_ino != ino || brec->jb_lbn != lbn)
+		return (0);
+	if (brec->jb_blkno + brec->jb_oldfrags != start)
+		return (0);
+	if (brec->jb_frags != frags)
+		return (0);
+	return (1);
+}
+
+/* OR the record's fragment range into the bit mask at *mask. */
+static void
+blk_setmask(struct jblkrec *brec, int *mask)
+{
+	int i;
+
+	for (i = brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++)
+		*mask |= 1 << i;
+}
+
+/*
+ * Determine whether a given block has been reallocated to a new location.
+ * Returns a mask of overlapping bits if any frags have been reused or
+ * zero if the block has not been re-used and the contents can be trusted.
+ *
+ * This is used to ensure that an orphaned pointer due to truncate is safe
+ * to be freed.  The mask value can be used to free partial blocks.
+ */
+static int
+blk_isfree(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags)
+{
+	struct suj_blk *sblk;
+	struct suj_rec *srec;
+	struct jblkrec *brec;
+	int mask;
+	int off;
+
+	/*
+	 * To be certain we're not freeing a reallocated block we lookup
+	 * this block in the blk hash and see if there is an allocation
+	 * journal record that overlaps with any fragments in the block
+	 * we're concerned with.  If any fragments have been reallocated
+	 * the block has already been freed and re-used for another purpose.
+	 */
+	mask = 0;
+	sblk = blk_lookup(blknum(fs, blk), 0);
+	if (sblk == NULL)
+		return (0);
+	off = blk - sblk->sb_blk;
+	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
+		brec = (struct jblkrec *)srec->sr_rec;
+		/*
+		 * If the block overlaps but does not match
+		 * exactly it's a new allocation.  If it matches
+		 * exactly this record refers to the current
+		 * location.
+		 */
+		if (blk_overlaps(brec, blk, frags) == 0)
+			continue;
+		if (blk_equals(brec, ino, lbn, blk, frags) == 1)
+			mask = 0;
+		else
+			blk_setmask(brec, &mask);
+	}
+	if (debug)
+		printf("blk_isfree: blk %jd sblk %jd off %d mask 0x%X\n",
+		    blk, sblk->sb_blk, off, mask);
+	/* Shift so bit 0 corresponds to the first frag of 'blk'. */
+	return (mask >> off);
+}
+
+/*
+ * Determine whether it is safe to follow an indirect.  It is not safe
+ * if any part of the indirect has been reallocated or the last journal
+ * entry was an allocation.  Just allocated indirects may not have valid
+ * pointers yet and all of their children will have their own records.
+ *
+ * Returns 1 if it's safe to follow the indirect and 0 otherwise.
+ */
+static int
+blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn)
+{
+	struct suj_blk *sblk;
+	struct jblkrec *brec;
+
+	sblk = blk_lookup(blk, 0);
+	if (sblk == NULL)
+		return (1);
+	if (TAILQ_EMPTY(&sblk->sb_recs))
+		return (1);
+	/* Only the most recent record decides trustworthiness. */
+	brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec;
+	if (blk_equals(brec, ino, lbn, blk, fs->fs_frag))
+		if (brec->jb_op == JOP_FREEBLK)
+			return (1);
+	return (0);
+}
+
+/*
+ * Clear an inode from the cg bitmap.  If the inode was already clear return
+ * 0 so the caller knows it does not have to check the inode contents.
+ */
+static int
+ino_free(ino_t ino, int mode)
+{
+	struct suj_cg *sc;
+	uint8_t *inosused;
+	struct cg *cgp;
+	int cg;
+
+	cg = ino_to_cg(fs, ino);
+	ino = ino % fs->fs_ipg;		/* Bitmap index within the cg. */
+	sc = cg_lookup(cg);
+	cgp = sc->sc_cgp;
+	inosused = cg_inosused(cgp);
+	/*
+	 * The bitmap may never have made it to the disk so we have to
+	 * conditionally clear.  We can avoid writing the cg in this case.
+	 */
+	if (isclr(inosused, ino))
+		return (0);
+	freeinos++;
+	clrbit(inosused, ino);
+	/* Keep the allocation rotor behind the lowest free inode. */
+	if (ino < cgp->cg_irotor)
+		cgp->cg_irotor = ino;
+	cgp->cg_cs.cs_nifree++;
+	if ((mode & IFMT) == IFDIR) {
+		freedir++;
+		cgp->cg_cs.cs_ndir--;
+	}
+	sc->sc_dirty = 1;
+
+	return (1);
+}
+
+/*
+ * Free 'frags' frags starting at filesystem block 'bno' skipping any frags
+ * set in the mask.
+ */
+static void
+blk_free(ufs2_daddr_t bno, int mask, int frags)
+{
+	ufs1_daddr_t fragno, cgbno;
+	struct suj_cg *sc;
+	struct cg *cgp;
+	int i, cg;
+	uint8_t *blksfree;
+
+	if (debug)
+		printf("Freeing %d frags at blk %jd\n", frags, bno);
+	cg = dtog(fs, bno);
+	sc = cg_lookup(cg);
+	cgp = sc->sc_cgp;
+	cgbno = dtogd(fs, bno);
+	blksfree = cg_blksfree(cgp);
+
+	/*
+	 * If it's not allocated we only wrote the journal entry
+	 * and never the bitmaps.  Here we unconditionally clear and
+	 * resolve the cg summary later.
+	 */
+	if (frags == fs->fs_frag && mask == 0) {
+		/* Whole block: mark the entire block free at once. */
+		fragno = fragstoblks(fs, cgbno);
+		ffs_setblock(fs, blksfree, fragno);
+		freeblocks++;
+	} else {
+		/*
+		 * deallocate the fragment
+		 */
+		for (i = 0; i < frags; i++)
+			if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) {
+				freefrags++;
+				setbit(blksfree, cgbno + i);
+			}
+	}
+	sc->sc_dirty = 1;
+}
+
+/*
+ * Fetch an indirect block to find the block at a given lbn.  The lbn
+ * may be negative to fetch a specific indirect block pointer or positive
+ * to fetch a specific block.  'cur' is the lbn this indirect covers and
+ * 'level' the remaining depth; the walk recurses one level per call.
+ * Inconsistent lbn/level combinations are fatal.
+ */
+static ufs2_daddr_t
+indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn, int level)
+{
+	ufs2_daddr_t *bap2;
+	ufs2_daddr_t *bap1;
+	ufs_lbn_t lbnadd;
+	ufs_lbn_t base;
+	int i;
+
+	if (blk == 0)
+		return (0);
+	if (cur == lbn)
+		return (blk);
+	if (level == 0 && lbn < 0)
+		errx(1, "Invalid lbn %jd", lbn);
+	bap2 = (void *)dblk_read(blk, fs->fs_bsize);
+	bap1 = (void *)bap2;
+	/* Number of data lbns addressed per slot at this level. */
+	lbnadd = 1;
+	base = -(cur + level);
+	for (i = level; i > 0; i--)
+		lbnadd *= NINDIR(fs);
+	if (lbn > 0)
+		i = (lbn - base) / lbnadd;
+	else
+		i = (-lbn - base) / lbnadd;
+	if (i < 0 || i >= NINDIR(fs))
+		errx(1, "Invalid indirect index %d produced by lbn %jd",
+		    i, lbn);
+	/* Compute the lbn covered by the chosen slot. */
+	if (level == 0)
+		cur = base + (i * lbnadd);
+	else
+		cur = -(base + (i * lbnadd)) - (level - 1);
+	if (fs->fs_magic == FS_UFS1_MAGIC)
+		blk = bap1[i];
+	else
+		blk = bap2[i];
+	if (cur == lbn)
+		return (blk);
+	if (level == 0)
+		errx(1, "Invalid lbn %jd at level 0", lbn);
+	return indir_blkatoff(blk, ino, cur, lbn, level - 1);
+}
+
+/*
+ * Finds the disk block address at the specified lbn within the inode
+ * specified by ip.  This follows the whole tree and honors di_size and
+ * di_extsize so it is a true test of reachability.  The lbn may be
+ * negative if an extattr or indirect block is requested.  *frags is set
+ * to the number of fragments at that lbn.
+ */
+static ufs2_daddr_t
+ino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags)
+{
+	ufs_lbn_t tmpval;
+	ufs_lbn_t cur;
+	ufs_lbn_t next;
+	int i;
+
+	/*
+	 * Handle extattr blocks first.
+	 */
+	if (lbn < 0 && lbn >= -NXADDR) {
+		lbn = -1 - lbn;		/* Map to ext block array index. */
+		if (lbn > lblkno(fs, ip->dp2.di_extsize - 1))
+			return (0);
+		*frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn));
+		return (ip->dp2.di_extb[lbn]);
+	}
+	/*
+	 * And now direct and indirect.  Verify that the lbn does not
+	 * exceed the size required to store the file by asking for
+	 * the lbn of the last byte.  These blocks should be 0 anyway
+	 * so this simply saves the traversal.
+	 */
+	if (lbn > 0 && lbn > lblkno(fs, DIP(ip, di_size) - 1))
+		return (0);
+	if (lbn < 0 && -lbn > lblkno(fs, DIP(ip, di_size) - 1))
+		return (0);
+	if (lbn >= 0 && lbn < NDADDR) {
+		*frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn));
+		return (DIP(ip, di_db[lbn]));
+	}
+	*frags = fs->fs_frag;
+
+	/* Walk the indirect trees; 'cur'..'next' is each tree's range. */
+	for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++,
+	    tmpval *= NINDIR(fs), cur = next) {
+		next = cur + tmpval;
+		if (lbn == -cur)
+			return (DIP(ip, di_ib[i]));
+		/*
+		 * Determine whether the lbn in question is within this tree.
+		 */
+		if (lbn < 0 && -lbn >= next)
+			continue;
+		if (lbn > 0 && lbn >= next)
+			continue;
+
+		return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn, i);
+	}
+	/* errx() does not return, so no fallthrough is possible here. */
+	errx(1, "lbn %jd not in ino", lbn);
+}
+
+/*
+ * Determine whether a block exists at a particular lbn in an inode.
+ * Returns 1 if found, 0 if not.  lbn may be negative for indirects
+ * or ext blocks.
+ */
+static int
+blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags)
+{
+	union dinode *ip;
+	ufs2_daddr_t nblk;
+
+	ip = ino_read(ino);
+
+	/* A freed or cleared inode cannot reference any blocks. */
+	if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0)
+		return (0);
+	nblk = ino_blkatoff(ip, ino, lbn, frags);
+
+	return (nblk == blk);
+}
+
+/*
+ * Determines whether a pointer to an inode exists within a directory
+ * at a specified offset.  Returns the mode of the found entry via
+ * *mode and sets *isdot for "." and ".." entries; the return value is
+ * 1 when the entry references 'child' and 0 otherwise.
+ */
+static int
+ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot)
+{
+	union dinode *dip;
+	struct direct *dp;
+	ufs2_daddr_t blk;
+	uint8_t *block;
+	ufs_lbn_t lbn;
+	int blksize;
+	int frags;
+	int dpoff;
+	int doff;
+
+	*isdot = 0;
+	dip = ino_read(parent);
+	*mode = DIP(dip, di_mode);
+	if ((*mode & IFMT) != IFDIR) {
+		if (debug) {
+			/* This can happen if the parent inode was reallocated. */
+			if (*mode != 0)
+				printf("Directory %d has bad mode %o\n",
+				    parent, *mode);
+			else
+				printf("Directory %d zero inode\n", parent);
+		}
+		return (0);
+	}
+	lbn = lblkno(fs, diroff);
+	doff = blkoff(fs, diroff);
+	blksize = sblksize(fs, DIP(dip, di_size), lbn);
+	/* Offset past EOF or past the block cannot hold an entry. */
+	if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) {
+		if (debug)
+			printf("ino %d absent from %d due to offset %jd"
+			    " exceeding size %jd\n",
+			    child, parent, diroff, DIP(dip, di_size));
+		return (0);
+	}
+	blk = ino_blkatoff(dip, parent, lbn, &frags);
+	if (blk <= 0) {
+		if (debug)
+			printf("Sparse directory %d", parent);
+		return (0);
+	}
+	block = dblk_read(blk, blksize);
+	/*
+	 * Walk through the records from the start of the block to be
+	 * certain we hit a valid record and not some junk in the middle
+	 * of a file name.  Stop when we reach or pass the expected offset.
+	 */
+	dpoff = 0;
+	do {
+		dp = (struct direct *)&block[dpoff];
+		if (dpoff == doff)
+			break;
+		if (dp->d_reclen == 0)
+			break;
+		dpoff += dp->d_reclen;
+	} while (dpoff <= doff);
+	if (dpoff > fs->fs_bsize)
+		errx(1, "Corrupt directory block in dir inode %d", parent);
+	/* Not found. */
+	if (dpoff != doff) {
+		if (debug)
+			printf("ino %d not found in %d, lbn %jd, dpoff %d\n",
+			    child, parent, lbn, dpoff);
+		return (0);
+	}
+	/*
+	 * We found the item in question.  Record the mode and whether it's
+	 * a . or .. link for the caller.
+	 */
+	if (dp->d_ino == child) {
+		if (child == parent)
+			*isdot = 1;
+		else if (dp->d_namlen == 2 &&
+		    dp->d_name[0] == '.' && dp->d_name[1] == '.')
+			*isdot = 1;
+		*mode = DTTOIF(dp->d_type);
+		return (1);
+	}
+	if (debug)
+		printf("ino %d doesn't match dirent ino %d in parent %d\n",
+		    child, dp->d_ino, parent);
+	return (0);
+}
+
+#define	VISIT_INDIR	0x0001	/* Also invoke visitor on indirects. */
+#define	VISIT_EXT	0x0002	/* Also visit external attribute blocks. */
+
+/*
+ * Read an indirect level which may or may not be linked into an inode.
+ * Recursively visits every reachable data block, counting frags into
+ * *frags and invoking 'visitor' on each; the indirect itself is visited
+ * only with VISIT_INDIR.
+ */
+static void
+indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags,
+    ino_visitor visitor, int flags)
+{
+	ufs2_daddr_t *bap2;
+	ufs1_daddr_t *bap1;
+	ufs_lbn_t lbnadd;
+	ufs2_daddr_t nblk;
+	ufs_lbn_t nlbn;
+	int level;
+	int i;
+
+	/*
+	 * Don't visit indirect blocks with contents we can't trust.  This
+	 * should only happen when indir_visit() is called to complete a
+	 * truncate that never finished and not when a pointer is found via
+	 * an inode.
+	 */
+	if (blk == 0)
+		return;
+	if (blk_isindir(blk, ino, lbn) == 0) {
+		if (debug)
+			printf("blk %jd ino %d lbn %jd is not indir.\n",
+			    blk, ino, lbn);
+		goto out;
+	}
+	level = lbn_level(lbn);
+	if (level == -1) {
+		/* NOTE(review): abort() leftover makes errx unreachable. */
+		abort();
+		errx(1, "Invalid level for lbn %jd", lbn);
+	}
+	lbnadd = 1;
+	for (i = level; i > 0; i--)
+		lbnadd *= NINDIR(fs);
+	bap1 = (void *)dblk_read(blk, fs->fs_bsize);
+	bap2 = (void *)bap1;
+	for (i = 0; i < NINDIR(fs); i++) {
+		if (fs->fs_magic == FS_UFS1_MAGIC)
+			nblk = *bap1++;
+		else
+			nblk = *bap2++;
+		if (nblk == 0)
+			continue;
+		if (level == 0) {
+			/* Leaf level: slots reference data blocks. */
+			nlbn = -lbn + i * lbnadd;
+			(*frags) += fs->fs_frag;
+			visitor(ino, nlbn, nblk, fs->fs_frag);
+		} else {
+			nlbn = (lbn + 1) - (i * lbnadd);
+			indir_visit(ino, nlbn, nblk, frags, visitor, flags);
+		}
+	}
+out:
+	if (flags & VISIT_INDIR) {
+		(*frags) += fs->fs_frag;
+		visitor(ino, lbn, blk, fs->fs_frag);
+	}
+}
+
+/*
+ * Visit each block in an inode as specified by 'flags' and call a
+ * callback function.  The callback may inspect or free blocks.  The
+ * count of frags found according to the size in the file is returned.
+ * This is not valid for sparse files but may be used to determine
+ * the correct di_blocks for a file.
+ */
+static uint64_t
+ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags)
+{
+	ufs_lbn_t tmpval;
+	ufs_lbn_t lbn;
+	uint64_t size;
+	uint64_t fragcnt;
+	int mode;
+	int frags;
+	int i;
+
+	size = DIP(ip, di_size);
+	mode = DIP(ip, di_mode) & IFMT;
+	fragcnt = 0;
+	/* External attributes exist only on UFS2. */
+	if ((flags & VISIT_EXT) &&
+	    fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) {
+		for (i = 0; i < NXADDR; i++) {
+			if (ip->dp2.di_extb[i] == 0)
+				continue;
+			frags = sblksize(fs, ip->dp2.di_extsize, i);
+			frags = numfrags(fs, frags);
+			fragcnt += frags;
+			visitor(ino, -1 - i, ip->dp2.di_extb[i], frags);
+		}
+	}
+	/* Skip datablocks for short links and devices. */
+	if (mode == IFBLK || mode == IFCHR ||
+	    (mode == IFLNK && size < fs->fs_maxsymlinklen))
+		return (fragcnt);
+	for (i = 0; i < NDADDR; i++) {
+		if (DIP(ip, di_db[i]) == 0)
+			continue;
+		frags = sblksize(fs, size, i);
+		frags = numfrags(fs, frags);
+		fragcnt += frags;
+		visitor(ino, i, DIP(ip, di_db[i]), frags);
+	}
+	/* Descend each indirect tree in turn. */
+	for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++,
+	    tmpval *= NINDIR(fs), lbn += tmpval) {
+		if (DIP(ip, di_ib[i]) == 0)
+			continue;
+		indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor,
+		    flags);
+	}
+	return (fragcnt);
+}
+
+/*
+ * Null visitor function used when we just want to count blocks.
+ */
+static void
+null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+}
+
+/*
+ * Recalculate di_blocks when we discover that a block allocation or
+ * free was not successfully completed.  The kernel does not roll this back
+ * because it would be too expensive to compute which indirects were
+ * reachable at the time the inode was written.
+ */
+static void
+ino_adjblks(ino_t ino)
+{
+	struct suj_ino *sino;
+	union dinode *ip;
+	uint64_t blocks;
+	uint64_t frags;
+
+	/* Only adjust each inode once per recovery pass. */
+	sino = ino_lookup(ino, 1);
+	if (sino->si_blkadj)
+		return;
+	sino->si_blkadj = 1;
+	ip = ino_read(ino);
+	/* No need to adjust zero'd inodes. */
+	if (DIP(ip, di_mode) == 0)
+		return;
+	frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT);
+	blocks = fsbtodb(fs, frags);
+	if (blocks == DIP(ip, di_blocks))
+		return;
+	if (debug)
+		printf("ino %d adjusting block count from %jd to %jd\n",
+		    ino, DIP(ip, di_blocks), blocks);
+	DIP_SET(ip, di_blocks, blocks);
+	ino_dirty(ino);
+}
+
+/* Visitor callback: free one block, honoring the reallocation mask. */
+static void
+blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+	int mask;
+
+	mask = blk_isfree(blk, ino, lbn, frags);
+	if (debug)
+		printf("blk %jd freemask 0x%X\n", blk, mask);
+	blk_free(blk, mask, frags);
+}
+
+/*
+ * Free a block or tree of blocks that was previously rooted in ino at
+ * the given lbn.  If the lbn is an indirect all children are freed
+ * recursively.
+ */
+static void
+blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow)
+{
+	uint64_t resid;
+	int mask;
+
+	mask = blk_isfree(blk, ino, lbn, frags);
+	if (debug)
+		printf("blk %jd freemask 0x%X\n", blk, mask);
+	resid = 0;
+	/* Only recurse into untouched indirects when asked to follow. */
+	if (lbn <= -NDADDR && follow && mask == 0)
+		indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR);
+	else
+		blk_free(blk, mask, frags);
+}
+
+/*
+ * Visitor for a directory being truncated: walk the directory block and
+ * drop one link from every named child, skipping "." and honoring
+ * si_skipparent for "..".  Children with pending journal records are
+ * handled via their own record processing instead of ino_decr().
+ */
+static void
+ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+	struct suj_ino *sino;
+	struct suj_rec *srec;
+	struct jrefrec *rrec;
+	struct direct *dp;
+	off_t diroff;
+	uint8_t *block;
+	int skipparent;
+	int isparent;
+	int dpoff;
+	int size;
+
+	sino = ino_lookup(ino, 0);
+	if (sino)
+		skipparent = sino->si_skipparent;
+	else
+		skipparent = 0;
+	size = lfragtosize(fs, frags);
+	block = dblk_read(blk, size);
+	dp = (struct direct *)&block[0];
+	for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) {
+		dp = (struct direct *)&block[dpoff];
+		if (dp->d_ino == 0 || dp->d_ino == WINO)
+			continue;
+		if (dp->d_namlen == 1 && dp->d_name[0] == '.')
+			continue;
+		isparent = dp->d_namlen == 2 && dp->d_name[0] == '.' &&
+		    dp->d_name[1] == '.';
+		if (isparent && skipparent == 1)
+			continue;
+		if (debug)
+			printf("Directory %d removing inode %d name %s\n",
+			    ino, dp->d_ino, dp->d_name);
+		/*
+		 * Lookup this inode to see if we have a record for it.
+		 * If not, we've already adjusted it assuming this path
+		 * was valid and we have to adjust once more.
+		 */
+		sino = ino_lookup(dp->d_ino, 0);
+		if (sino == NULL || sino->si_linkadj || sino->si_hasrecs == 0) {
+			ino_decr(dp->d_ino);
+			continue;
+		}
+		/*
+		 * Tell any child directories we've already removed their
+		 * parent.  Don't try to adjust our link down again.
+		 */
+		if (isparent == 0)
+			sino->si_skipparent = 1;
+		/*
+		 * If we haven't yet processed this inode we need to make
+		 * sure we will successfully discover the lost path.  If not
+		 * use nlinkadj to remember.
+		 */
+		diroff = lblktosize(fs, lbn) + dpoff;
+		TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
+			rrec = (struct jrefrec *)srec->sr_rec;
+			if (rrec->jr_parent == ino &&
+			    rrec->jr_diroff == diroff)
+				break;
+		}
+		if (srec == NULL)
+			sino->si_nlinkadj--;
+	}
+}
+
+/*
+ * Truncate an inode, freeing all blocks and decrementing all children's
+ * link counts.  Free the inode back to the cg.  The dinode is zeroed
+ * except for di_gen so a future allocation gets a fresh generation.
+ */
+static void
+ino_truncate(union dinode *ip, ino_t ino, int mode)
+{
+	uint32_t gen;
+
+	if (ino == ROOTINO)
+		errx(1, "Attempting to free ROOTINO");
+	if (debug)
+		printf("Truncating and freeing ino %d, nlink %d, mode %o\n",
+		    ino, DIP(ip, di_nlink), DIP(ip, di_mode));
+
+	/* We are freeing an inode or directory. */
+	if ((DIP(ip, di_mode) & IFMT) == IFDIR)
+		ino_visit(ip, ino, ino_free_children, 0);
+	DIP_SET(ip, di_nlink, 0);
+	ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR);
+	/* Here we have to clear the inode and release any blocks it holds. */
+	gen = DIP(ip, di_gen);
+	if (fs->fs_magic == FS_UFS1_MAGIC)
+		bzero(ip, sizeof(struct ufs1_dinode));
+	else
+		bzero(ip, sizeof(struct ufs2_dinode));
+	DIP_SET(ip, di_gen, gen);
+	ino_dirty(ino);
+	ino_free(ino, mode);
+	return;
+}
+
+/*
+ * Adjust an inode's link count down by one when a directory goes away.
+ * If the resulting count falls below what the inode needs to live
+ * (2 for a directory, 1 otherwise) the inode is truncated and freed.
+ */
+static void
+ino_decr(ino_t ino)
+{
+	union dinode *ip;
+	int reqlink;
+	int nlink;
+	int mode;
+
+	ip = ino_read(ino);
+	nlink = DIP(ip, di_nlink);
+	mode = DIP(ip, di_mode);
+	if (nlink < 1)
+		errx(1, "Inode %d link count %d invalid", ino, nlink);
+	if (mode == 0)
+		errx(1, "Inode %d has a link of %d with 0 mode.", ino, nlink);
+	nlink--;
+	if ((mode & IFMT) == IFDIR)
+		reqlink = 2;
+	else
+		reqlink = 1;
+	if (nlink < reqlink) {
+		if (debug)
+			printf("ino %d not enough links to live %d < %d\n",
+			    ino, nlink, reqlink);
+		ino_truncate(ip, ino, mode);
+		return;
+	}
+	DIP_SET(ip, di_nlink, nlink);
+	ino_dirty(ino);
+}
+
+/*
+ * Adjust the inode link count to 'nlink'.  If the count reaches zero
+ * free it.  'lastmode' is the mode from the final journal record and
+ * is used to free the bitmap entry when the dinode itself is zeroed.
+ */
+static void
+ino_adjust(ino_t ino, int lastmode, nlink_t nlink)
+{
+	union dinode *ip;
+	int reqlink;
+	int mode;
+
+	ip = ino_read(ino);
+	mode = DIP(ip, di_mode) & IFMT;
+	if (nlink > LINK_MAX)
+		errx(1,
+		    "ino %d nlink manipulation error, new link %d, old link %d",
+		    ino, nlink, DIP(ip, di_nlink));
+	if (debug)
+		printf("Adjusting ino %d, nlink %d, old link %d lastmode %o\n",
+		    ino, nlink, DIP(ip, di_nlink), lastmode);
+	if (mode == 0) {
+		if (debug)
+			printf("ino %d, zero inode freeing bitmap\n", ino);
+		ino_free(ino, lastmode);
+		return;
+	}
+	/* XXX Should be an assert? */
+	if (mode != lastmode && debug)
+		printf("ino %d, mode %o != %o\n", ino, mode, lastmode);
+	if ((mode & IFMT) == IFDIR)
+		reqlink = 2;
+	else
+		reqlink = 1;
+	/* If the inode doesn't have enough links to live, free it. */
+	if (nlink < reqlink) {
+		if (debug)
+			printf("ino %d not enough links to live %d < %d\n",
+			    ino, nlink, reqlink);
+		ino_truncate(ip, ino, mode);
+		return;
+	}
+	/* If required write the updated link count. */
+	if (DIP(ip, di_nlink) == nlink) {
+		if (debug)
+			printf("ino %d, link matches, skipping.\n", ino);
+		return;
+	}
+	DIP_SET(ip, di_nlink, nlink);
+	ino_dirty(ino);
+}
+
+#define	DOTDOT_OFFSET	DIRECTSIZ(1)	/* Byte offset of ".." in a dir. */
+
+/*
+ * Process records available for one inode and determine whether the
+ * link count is correct or needs adjusting.
+ *
+ * XXX Failed to fix zero length directory.  Shouldn't .. have been missing?
+ */
+static void
+ino_check(struct suj_ino *sino)
+{
+	struct suj_rec *srec;
+	struct jrefrec *rrec;
+	struct suj_ino *stmp;
+	nlink_t dotlinks;
+	int newlinks;
+	int removes;
+	int nlink;
+	ino_t ino;
+	int isdot;
+	int isat;
+	int mode;
+
+	if (sino->si_hasrecs == 0)
+		return;
+	ino = sino->si_ino;
+	/*
+	 * XXX ino_isfree currently is skipping initialized inodes
+	 * that are unreferenced.
+	 */
+	if (0 && ino_isfree(ino))
+		return;
+	/* The first record carries the starting link count. */
+	rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec;
+	nlink = rrec->jr_nlink;
+	newlinks = sino->si_nlinkadj;
+	dotlinks = 0;
+	removes = 0;
+	TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
+		rrec = (struct jrefrec *)srec->sr_rec;
+		isat = ino_isat(rrec->jr_parent, rrec->jr_diroff,
+		    rrec->jr_ino, &mode, &isdot);
+		if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT))
+			errx(1, "Inode mode/directory type mismatch %o != %o",
+			    mode, rrec->jr_mode);
+		if (debug)
+			printf("jrefrec: op %d ino %d, nlink %d, parent %d, "
+			    "diroff %jd, mode %o, isat %d, isdot %d\n",
+			    rrec->jr_op, rrec->jr_ino, rrec->jr_nlink,
+			    rrec->jr_parent, rrec->jr_diroff, rrec->jr_mode,
+			    isat, isdot);
+		mode = rrec->jr_mode & IFMT;
+		if (rrec->jr_op == JOP_REMREF)
+			removes++;
+		newlinks += isat;
+		if (isdot)
+			dotlinks += isat;
+	}
+	/*
+	 * The number of links that remain are the starting link count
+	 * subtracted by the total number of removes with the total
+	 * links discovered back in.  An incomplete remove thus
+	 * makes no change to the link count but an add increases
+	 * by one.
+	 */
+	nlink += newlinks;
+	nlink -= removes;
+	/*
+	 * If it's a directory with no real names pointing to it go ahead
+	 * and truncate it.  This will free any children.
+	 */
+	if ((mode & IFMT) == IFDIR && nlink - dotlinks == 0) {
+		nlink = 0;
+		/*
+		 * Mark any .. links so they know not to free this inode
+		 * when they are removed.
+		 */
+		TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
+			rrec = (struct jrefrec *)srec->sr_rec;
+			if (rrec->jr_diroff == DOTDOT_OFFSET) {
+				stmp = ino_lookup(rrec->jr_parent, 0);
+				if (stmp)
+					stmp->si_skipparent = 1;
+			}
+		}
+	}
+	sino->si_linkadj = 1;
+	ino_adjust(ino, mode, nlink);
+}
+
+/*
+ * Process records available for one block and determine whether it is
+ * still allocated and whether the owning inode needs to be updated or
+ * a free completed.
+ */
+static void
+blk_check(struct suj_blk *sblk)
+{
+	struct suj_rec *srec;
+	struct jblkrec *brec;
+	ufs2_daddr_t blk;
+	int mask;
+	int frags;
+	int isat;
+
+	/*
+	 * Each suj_blk actually contains records for any fragments in that
+	 * block.  As a result we must evaluate each record individually.
+	 */
+	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
+		brec = (struct jblkrec *)srec->sr_rec;
+		frags = brec->jb_frags;
+		blk = brec->jb_blkno + brec->jb_oldfrags;
+		/* blk_isat() rewrites 'frags' with the size found on disk. */
+		isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags);
+		if (debug)
+			printf("op %d blk %jd ino %d lbn %jd frags %d isat %d (%d)\n",
+			    brec->jb_op, blk, brec->jb_ino, brec->jb_lbn,
+			    brec->jb_frags, isat, frags);
+		/*
+		 * If we found the block at this address we still have to
+		 * determine if we need to free the tail end that was
+		 * added by adding contiguous fragments from the same block.
+		 */
+		if (isat == 1) {
+			if (frags == brec->jb_frags)
+				continue;
+			mask = blk_isfree(blk, brec->jb_ino, brec->jb_lbn,
+			    brec->jb_frags);
+			mask >>= frags;
+			blk += frags;
+			frags = brec->jb_frags - frags;
+			blk_free(blk, mask, frags);
+			ino_adjblks(brec->jb_ino);
+			continue;
+		}
+		/*
+		 * The block wasn't found, attempt to free it.  It won't be
+		 * freed if it was actually reallocated.  If this was an
+		 * allocation we don't want to follow indirects as they
+		 * may not be written yet.  Any children of the indirect will
+		 * have their own records.  If it's a free we need to
+		 * recursively free children.
+		 */
+		blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags,
+		    brec->jb_op == JOP_FREEBLK);
+		ino_adjblks(brec->jb_ino);
+	}
+}
+
+/*
+ * Walk the list of inode and block records for this cg, recovering any
+ * changes which were not complete at the time of crash.
+ */
+static void
+cg_check(struct suj_cg *sc)
+{
+	struct suj_blk *nextb;
+	struct suj_ino *nexti;
+	struct suj_ino *sino;
+	struct suj_blk *sblk;
+	int i;
+
+	if (debug)
+		printf("Recovering cg %d\n", sc->sc_cgx);
+
+	/* SAFE iteration: checks may insert/remove hash entries. */
+	for (i = 0; i < SUJ_HASHSIZE; i++)
+		LIST_FOREACH_SAFE(sino, &sc->sc_inohash[i], si_next, nexti)
+			ino_check(sino);
+
+	for (i = 0; i < SUJ_HASHSIZE; i++)
+		LIST_FOREACH_SAFE(sblk, &sc->sc_blkhash[i], sb_next, nextb)
+			blk_check(sblk);
+}
+
+/*
+ * Write a potentially dirty cg.  All inodes must be written before the
+ * cg maps are so that an allocated inode is never marked free, even if
+ * we crash during fsck.
+ */
+static void
+cg_write(struct suj_cg *sc)
+{
+	struct ino_blk *iblk;
+	ufs1_daddr_t fragno, cgbno, maxbno;
+	u_int8_t *blksfree;
+	struct cg *cgp;
+	int blk;
+	int i;
+
+	/* Flush inode blocks first; see ordering comment above. */
+	for (i = 0; i < SUJ_HASHSIZE; i++)
+		LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next)
+			iblk_write(iblk);
+	if (sc->sc_dirty == 0)
+		return;
+	/*
+	 * Fix the frag and cluster summary.
+	 */
+	cgp = sc->sc_cgp;
+	cgp->cg_cs.cs_nbfree = 0;
+	cgp->cg_cs.cs_nffree = 0;
+	bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum));
+	maxbno = fragstoblks(fs, fs->fs_fpg);
+	if (fs->fs_contigsumsize > 0) {
+		for (i = 1; i <= fs->fs_contigsumsize; i++)
+			cg_clustersum(cgp)[i] = 0;
+		bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT));
+	}
+	/* Rebuild summaries by scanning the (now correct) free bitmap. */
+	blksfree = cg_blksfree(cgp);
+	for (cgbno = 0; cgbno < maxbno; cgbno++) {
+		if (ffs_isfreeblock(fs, blksfree, cgbno))
+			continue;
+		if (ffs_isblock(fs, blksfree, cgbno)) {
+			ffs_clusteracct(fs, cgp, cgbno, 1);
+			cgp->cg_cs.cs_nbfree++;
+			continue;
+		}
+		fragno = blkstofrags(fs, cgbno);
+		blk = blkmap(fs, blksfree, fragno);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
+		for (i = 0; i < fs->fs_frag; i++)
+			if (isset(blksfree, fragno + i))
+				cgp->cg_cs.cs_nffree++;
+	}
+	/*
+	 * Update the superblock cg summary from our now correct values
+	 * before writing the block.
+	 */
+	fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs;
+	if (bwrite(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf,
+	    fs->fs_bsize) == -1)
+		err(1, "Unable to write cylinder group %d", sc->sc_cgx);
+}
+
+/* Invoke 'apply' on every cached cylinder group. */
+static void
+cg_apply(void (*apply)(struct suj_cg *))
+{
+	struct suj_cg *scg;
+	int i;
+
+	for (i = 0; i < SUJ_HASHSIZE; i++)
+		LIST_FOREACH(scg, &cghash[i], sc_next)
+			apply(scg);
+}
+
+/*
+ * Process the unlinked but referenced file list, freeing inodes whose link count is zero.
+ */
+static void
+ino_unlinked(void)
+{
+ union dinode *ip;
+ uint16_t mode;
+ ino_t inon;
+ ino_t ino;
+
+ ino = fs->fs_sujfree;
+ fs->fs_sujfree = 0;
+ while (ino != 0) {
+ ip = ino_read(ino);
+ mode = DIP(ip, di_mode) & IFMT;
+ inon = DIP(ip, di_freelink);
+ DIP_SET(ip, di_freelink, 0);
+ /*
+ * XXX Should this be an errx?
+ */
+ if (DIP(ip, di_nlink) == 0) {
+ if (debug)
+ printf("Freeing unlinked ino %d mode %o\n",
+ ino, mode);
+ ino_truncate(ip, ino, mode);
+ } else if (debug)
+ printf("Skipping ino %d mode %o with link %d\n",
+ ino, mode, DIP(ip, di_nlink));
+ ino = inon;
+ }
+}
+
+/*
+ * If we see two ops for the same inode to the same parent at the same
+ * offset we could miscount the link with ino_isat() returning twice.
+ * Keep only the first record because it has the valid link count but keep
+ * the mode from the final op as that should be the correct mode in case
+ * it changed.
+ */
+static void
+suj_build_ino(struct jrefrec *refrec)
+{
+ struct jmvrec *mvrec;
+ struct suj_rec *srec;
+ struct suj_ino *sino;
+ struct suj_rec *srn;
+ struct jrefrec *rrn;
+
+ if (debug)
+ printf("suj_build_ino: op %d, ino %d, nlink %d, parent %d, diroff %jd\n",
+ refrec->jr_op, refrec->jr_ino, refrec->jr_nlink, refrec->jr_parent,
+ refrec->jr_diroff);
+ sino = ino_lookup(refrec->jr_ino, 1);
+ /*
+ * Search for a mvrec that matches this offset. Whether it's an add
+ * or a remove we can delete the mvref. It no longer applies to this
+ * location.
+ *
+ * For removes, we have to find the original offset so we can create
+ * a remove that matches the earlier add so it can be abandoned
+ * if necessary. We create an add in the new location so we can
+ * tolerate the directory block as it existed before or after
+ * the move.
+ */
+ if (!TAILQ_EMPTY(&sino->si_movs)) {
+ for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn;
+ srn = TAILQ_PREV(srn, srechd, sr_next)) {
+ mvrec = (struct jmvrec *)srn->sr_rec;
+ if (mvrec->jm_parent != refrec->jr_parent ||
+ mvrec->jm_newoff != refrec->jr_diroff)
+ continue;
+ TAILQ_REMOVE(&sino->si_movs, srn, sr_next);
+ if (refrec->jr_op == JOP_REMREF) {
+ rrn = errmalloc(sizeof(*refrec));
+ *rrn = *refrec;
+ rrn->jr_op = JOP_ADDREF;
+ suj_build_ino(rrn);
+ refrec->jr_diroff = mvrec->jm_oldoff;
+ }
+ }
+ }
+ /*
+ * We walk backwards so that adds and removes are evaluated in the
+ * correct order.
+ */
+ for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn;
+ srn = TAILQ_PREV(srn, srechd, sr_next)) {
+ rrn = (struct jrefrec *)srn->sr_rec;
+ if (rrn->jr_parent != refrec->jr_parent ||
+ rrn->jr_diroff != refrec->jr_diroff)
+ continue;
+ if (debug)
+ printf("Discarding dup.\n");
+ rrn->jr_mode = refrec->jr_mode;
+ return;
+ }
+ sino->si_hasrecs = 1;
+ srec = errmalloc(sizeof(*srec));
+ srec->sr_rec = (union jrec *)refrec;
+ TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next);
+}
+
+/*
+ * Apply a move record to an inode. We must search for adds that precede us
+ * and add duplicates because we won't know which location to search first.
+ * Then we add movs to a queue that is maintained until the moved location
+ * is removed. If a single record is moved multiple times we only maintain
+ * one copy that contains the original and final diroffs.
+ */
+static void
+suj_move_ino(struct jmvrec *mvrec)
+{
+ struct jrefrec *refrec;
+ struct suj_ino *sino;
+ struct suj_rec *srec;
+ struct jmvrec *mvrn;
+ struct suj_rec *srn;
+ struct jrefrec *rrn;
+
+ if (debug)
+ printf("suj_move_ino: ino %d, parent %d, diroff %jd, oldoff %jd\n",
+ mvrec->jm_ino, mvrec->jm_parent, mvrec->jm_newoff,
+ mvrec->jm_oldoff);
+ sino = ino_lookup(mvrec->jm_ino, 0);
+ if (sino == NULL)
+ return;
+ /*
+ * We walk backwards so we only evaluate the most recent record at
+ * this offset.
+ */
+ for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn;
+ srn = TAILQ_PREV(srn, srechd, sr_next)) {
+ rrn = (struct jrefrec *)srn->sr_rec;
+ if (rrn->jr_op != JOP_ADDREF)
+ continue;
+ if (rrn->jr_parent != mvrec->jm_parent ||
+ rrn->jr_diroff != mvrec->jm_oldoff)
+ continue;
+ /*
+ * When an entry is moved we don't know whether the write
+ * to move has completed yet. To resolve this we create
+ * a new add dependency in the new location as if it were added
+ * twice. Only one will succeed.
+ */
+ refrec = errmalloc(sizeof(*refrec));
+ refrec->jr_op = JOP_ADDREF;
+ refrec->jr_ino = mvrec->jm_ino;
+ refrec->jr_parent = mvrec->jm_parent;
+ refrec->jr_diroff = mvrec->jm_newoff;
+ refrec->jr_mode = rrn->jr_mode;
+ refrec->jr_nlink = rrn->jr_nlink;
+ suj_build_ino(refrec);
+ break;
+ }
+ /*
+ * Add this mvrec to the queue of pending mvs.
+ */
+ for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn;
+ srn = TAILQ_PREV(srn, srechd, sr_next)) {
+ mvrn = (struct jmvrec *)srn->sr_rec;
+ if (mvrn->jm_parent != mvrec->jm_parent ||
+ mvrn->jm_newoff != mvrec->jm_oldoff)
+ continue;
+ mvrn->jm_newoff = mvrec->jm_newoff;
+ return;
+ }
+ srec = errmalloc(sizeof(*srec));
+ srec->sr_rec = (union jrec *)mvrec;
+ TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next);
+}
+
+/*
+ * Modify journal records so they refer to the base block number
+ * and a start and end frag range. This is to facilitate the discovery
+ * of overlapping fragment allocations.
+ */
+static void
+suj_build_blk(struct jblkrec *blkrec)
+{
+ struct suj_rec *srec;
+ struct suj_blk *sblk;
+ struct jblkrec *blkrn;
+ ufs2_daddr_t blk;
+ int frag;
+
+ if (debug)
+ printf("suj_build_blk: op %d blkno %jd frags %d oldfrags %d "
+ "ino %d lbn %jd\n",
+ blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags,
+ blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn);
+ blk = blknum(fs, blkrec->jb_blkno);
+ frag = fragnum(fs, blkrec->jb_blkno);
+ sblk = blk_lookup(blk, 1);
+ /*
+ * Rewrite the record using oldfrags to indicate the offset into
+ * the block. Leave jb_frags as the actual allocated count.
+ */
+ blkrec->jb_blkno -= frag;
+ blkrec->jb_oldfrags = frag;
+ if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag)
+ errx(1, "Invalid fragment count %d oldfrags %d",
+ blkrec->jb_frags, frag);
+ /*
+ * Detect dups. If we detect a dup we always discard the oldest
+ * record as it is superseded by the new record. This speeds up
+ * later stages but also eliminates free records which are used
+ * to indicate that the contents of indirects can be trusted.
+ */
+ TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
+ blkrn = (struct jblkrec *)srec->sr_rec;
+ if (blkrn->jb_ino != blkrec->jb_ino ||
+ blkrn->jb_lbn != blkrec->jb_lbn ||
+ blkrn->jb_blkno != blkrec->jb_blkno ||
+ blkrn->jb_frags != blkrec->jb_frags ||
+ blkrn->jb_oldfrags != blkrec->jb_oldfrags)
+ continue;
+ if (debug)
+ printf("Removed dup.\n");
+ /* Discard the free which is a dup with an alloc. */
+ if (blkrec->jb_op == JOP_FREEBLK)
+ return;
+ TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next);
+ free(srec);
+ break;
+ }
+ srec = errmalloc(sizeof(*srec));
+ srec->sr_rec = (union jrec *)blkrec;
+ TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next);
+}
+
+/*
+ * Build up tables of the operations we need to recover.
+ */
+static void
+suj_build(void)
+{
+ struct suj_seg *seg;
+ union jrec *rec;
+ int i;
+
+ TAILQ_FOREACH(seg, &allsegs, ss_next) {
+ rec = (union jrec *)seg->ss_blk;
+ rec++; /* skip the segrec. */
+ if (debug)
+ printf("seg %jd has %d records, oldseq %jd.\n",
+ seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt,
+ seg->ss_rec.jsr_oldest);
+ for (i = 0; i < seg->ss_rec.jsr_cnt; i++, rec++) {
+ switch (rec->rec_jrefrec.jr_op) {
+ case JOP_ADDREF:
+ case JOP_REMREF:
+ suj_build_ino((struct jrefrec *)rec);
+ break;
+ case JOP_MVREF:
+ suj_move_ino((struct jmvrec *)rec);
+ break;
+ case JOP_NEWBLK:
+ case JOP_FREEBLK:
+ suj_build_blk((struct jblkrec *)rec);
+ break;
+ default:
+ errx(1, "Unknown journal operation %d (%d)",
+ rec->rec_jrefrec.jr_op, i);
+ }
+ }
+ }
+}
+
+/*
+ * Prune the journal segments to those we care about based on the
+ * oldest sequence in the newest segment. Order the segment list
+ * based on sequence number.
+ */
+static void
+suj_prune(void)
+{
+ struct suj_seg *seg;
+ struct suj_seg *segn;
+ uint64_t newseq;
+ int discard;
+
+ if (debug)
+ printf("Pruning up to %jd\n", oldseq);
+ /* First free the expired segments. */
+ TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) {
+ if (seg->ss_rec.jsr_seq >= oldseq)
+ continue;
+ TAILQ_REMOVE(&allsegs, seg, ss_next);
+ free(seg->ss_blk);
+ free(seg);
+ }
+ /* Next ensure that segments are ordered properly. */
+ seg = TAILQ_FIRST(&allsegs);
+ if (seg == NULL) {
+ if (debug)
+ printf("Empty journal\n");
+ return;
+ }
+ newseq = seg->ss_rec.jsr_seq;
+ for (;;) {
+ seg = TAILQ_LAST(&allsegs, seghd);
+ if (seg->ss_rec.jsr_seq >= newseq)
+ break;
+ TAILQ_REMOVE(&allsegs, seg, ss_next);
+ TAILQ_INSERT_HEAD(&allsegs, seg, ss_next);
+ newseq = seg->ss_rec.jsr_seq;
+
+ }
+ if (newseq != oldseq)
+ errx(1, "Journal file sequence mismatch %jd != %jd",
+ newseq, oldseq);
+ /*
+ * The kernel may asynchronously write segments which can create
+ * gaps in the sequence space. Throw away any segments after the
+ * gap as the kernel guarantees only those that are contiguously
+ * reachable are marked as completed.
+ */
+ discard = 0;
+ TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) {
+ if (!discard && newseq++ == seg->ss_rec.jsr_seq)
+ continue;
+ discard = 1;
+ if (debug)
+ printf("Journal order mismatch %jd != %jd pruning\n",
+ newseq-1, seg->ss_rec.jsr_seq);
+ TAILQ_REMOVE(&allsegs, seg, ss_next);
+ free(seg->ss_blk);
+ free(seg);
+ }
+ if (debug)
+ printf("Processing journal segments from %jd to %jd\n",
+ oldseq, newseq-1);
+}
+
+/*
+ * Verify the journal inode before attempting to read records.
+ */
+static void
+suj_verifyino(union dinode *ip)
+{
+
+ if (DIP(ip, di_nlink) != 1)
+ errx(1, "Invalid link count %d for journal inode %d",
+ DIP(ip, di_nlink), fs->fs_sujournal);
+
+ if (DIP(ip, di_mode) != IFREG)
+ errx(1, "Invalid mode %d for journal inode %d",
+ DIP(ip, di_mode), fs->fs_sujournal);
+
+ if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX)
+ errx(1, "Invalid size %jd for journal inode %d",
+ DIP(ip, di_size), fs->fs_sujournal);
+
+ if (DIP(ip, di_modrev) != fs->fs_mtime)
+ errx(1, "Journal timestamp does not match fs mount time");
+ /* XXX Add further checks. */
+}
+
+struct jblocks {
+ struct jextent *jb_extent; /* Extent array. */
+ int jb_avail; /* Available extents. */
+ int jb_used; /* Last used extent. */
+ int jb_head; /* Allocator head. */
+ int jb_off; /* Allocator extent offset. */
+};
+struct jextent {
+ ufs2_daddr_t je_daddr; /* Disk block address. */
+ int je_blocks; /* Disk block count. */
+};
+
+struct jblocks *suj_jblocks;
+
+static struct jblocks *
+jblocks_create(void)
+{
+ struct jblocks *jblocks;
+ int size;
+
+ jblocks = errmalloc(sizeof(*jblocks));
+ jblocks->jb_avail = 10;
+ jblocks->jb_used = 0;
+ jblocks->jb_head = 0;
+ jblocks->jb_off = 0;
+ size = sizeof(struct jextent) * jblocks->jb_avail;
+ jblocks->jb_extent = errmalloc(size);
+ bzero(jblocks->jb_extent, size);
+
+ return (jblocks);
+}
+
+/*
+ * Return the next available disk block and the amount of contiguous
+ * free space it contains.
+ */
+static ufs2_daddr_t
+jblocks_next(struct jblocks *jblocks, int bytes, int *actual)
+{
+ struct jextent *jext;
+ ufs2_daddr_t daddr;
+ int freecnt;
+ int blocks;
+
+ blocks = bytes / DEV_BSIZE;
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks - jblocks->jb_off;
+ if (freecnt == 0) {
+ jblocks->jb_off = 0;
+ if (++jblocks->jb_head > jblocks->jb_used)
+ return (0);
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks;
+ }
+ if (freecnt > blocks)
+ freecnt = blocks;
+ *actual = freecnt * DEV_BSIZE;
+ daddr = jext->je_daddr + jblocks->jb_off;
+
+ return (daddr);
+}
+
+/*
+ * Advance the allocation head by a specified number of bytes, consuming
+ * one journal segment.
+ */
+static void
+jblocks_advance(struct jblocks *jblocks, int bytes)
+{
+
+ jblocks->jb_off += bytes / DEV_BSIZE;
+}
+
+static void
+jblocks_destroy(struct jblocks *jblocks)
+{
+
+ free(jblocks->jb_extent);
+ free(jblocks);
+}
+
+static void
+jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks)
+{
+ struct jextent *jext;
+ int size;
+
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ /* Adding the first block. */
+ if (jext->je_daddr == 0) {
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+ return;
+ }
+ /* Extending the last extent. */
+ if (jext->je_daddr + jext->je_blocks == daddr) {
+ jext->je_blocks += blocks;
+ return;
+ }
+ /* Adding a new extent. */
+ if (++jblocks->jb_used == jblocks->jb_avail) {
+ jblocks->jb_avail *= 2;
+ size = sizeof(struct jextent) * jblocks->jb_avail;
+ jext = errmalloc(size);
+ bzero(jext, size);
+ bcopy(jblocks->jb_extent, jext,
+ sizeof(struct jextent) * jblocks->jb_used);
+ free(jblocks->jb_extent);
+ jblocks->jb_extent = jext;
+ }
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+
+ return;
+}
+
+/*
+ * Add a file block from the journal to the extent map. We can't read
+ * each file block individually because the kernel treats it as a circular
+ * buffer and segments may span multiple contiguous blocks.
+ */
+static void
+suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+
+ jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags));
+}
+
+static void
+suj_read(void)
+{
+ uint8_t block[1 * 1024 * 1024];
+ struct suj_seg *seg;
+ struct jsegrec *rec;
+ ufs2_daddr_t blk;
+ int recsize;
+ int size;
+
+ /*
+ * Read records until we exhaust the journal space. If we find
+ * an invalid record we start searching for a valid segment header
+ * at the next block. This is because we don't have a head/tail
+ * pointer and must recover the information indirectly. At the gap
+ * between the head and tail we won't necessarily have a valid
+ * segment.
+ */
+ for (;;) {
+ size = sizeof(block);
+ blk = jblocks_next(suj_jblocks, size, &size);
+ if (blk == 0)
+ return;
+ /*
+ * Read 1MB at a time and scan for records within this block.
+ */
+ if (bread(disk, blk, &block, size) == -1)
+ err(1, "Error reading journal block %jd",
+ (intmax_t)blk);
+ for (rec = (void *)block; size; size -= recsize,
+ rec = (struct jsegrec *)((uintptr_t)rec + recsize)) {
+ recsize = DEV_BSIZE;
+ if (rec->jsr_time != fs->fs_mtime) {
+ if (debug)
+ printf("Rec time %jd != fs mtime %jd\n",
+ rec->jsr_time, fs->fs_mtime);
+ jblocks_advance(suj_jblocks, recsize);
+ continue;
+ }
+ if (rec->jsr_cnt == 0) {
+ if (debug)
+ printf("Found illegal count %d\n",
+ rec->jsr_cnt);
+ jblocks_advance(suj_jblocks, recsize);
+ continue;
+ }
+ recsize = roundup2((rec->jsr_cnt + 1) * JREC_SIZE,
+ DEV_BSIZE);
+ if (recsize > size) {
+ /*
+ * We may just have run out of buffer, restart
+ * the loop to re-read from this spot.
+ */
+ if (size < fs->fs_bsize &&
+ recsize <= fs->fs_bsize) {
+ recsize = size;
+ continue;
+ }
+ if (debug)
+ printf("Found invalid segsize %d > %d\n",
+ recsize, size);
+ recsize = DEV_BSIZE;
+ jblocks_advance(suj_jblocks, recsize);
+ continue;
+ }
+ seg = errmalloc(sizeof(*seg));
+ seg->ss_blk = errmalloc(recsize);
+ seg->ss_rec = *rec;
+ bcopy((void *)rec, seg->ss_blk, recsize);
+ if (rec->jsr_oldest > oldseq)
+ oldseq = rec->jsr_oldest;
+ TAILQ_INSERT_TAIL(&allsegs, seg, ss_next);
+ jrecs += rec->jsr_cnt;
+ jbytes += recsize;
+ jblocks_advance(suj_jblocks, recsize);
+ }
+ }
+}
+
+/*
+ * Orchestrate the verification of a filesystem via the softupdates journal.
+ */
+void
+suj_check(const char *filesys)
+{
+ union dinode *jip;
+ uint64_t blocks;
+
+ opendisk(filesys);
+ TAILQ_INIT(&allsegs);
+ /*
+ * Fetch the journal inode and verify it.
+ */
+ jip = ino_read(fs->fs_sujournal);
+ printf("SU+J Checking %s\n", filesys);
+ suj_verifyino(jip);
+ /*
+ * Build a list of journal blocks in jblocks before parsing the
+ * available journal blocks in with suj_read().
+ */
+ printf("Reading %jd byte journal from inode %d.\n",
+ DIP(jip, di_size), fs->fs_sujournal);
+ suj_jblocks = jblocks_create();
+ blocks = ino_visit(jip, fs->fs_sujournal, suj_add_block, 0);
+ if (blocks != numfrags(fs, DIP(jip, di_size)))
+ errx(1, "Sparse journal inode %d.\n", fs->fs_sujournal);
+ suj_read();
+ jblocks_destroy(suj_jblocks);
+ suj_jblocks = NULL;
+ if (reply("RECOVER")) {
+ printf("Building recovery table.\n");
+ suj_prune();
+ suj_build();
+ printf("Resolving unreferenced inode list.\n");
+ ino_unlinked();
+ printf("Processing journal entries.\n");
+ cg_apply(cg_check);
+ }
+ if (reply("WRITE CHANGES"))
+ cg_apply(cg_write);
+ printf("%jd journal records in %jd bytes for %.2f%% utilization\n",
+ jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100);
+ printf("Freed %jd inodes (%jd directories) %jd blocks and %jd frags.\n",
+ freeinos, freedir, freeblocks, freefrags);
+ /* Write back superblock. */
+ closedisk(filesys);
+}
diff --git a/sbin/fsdb/fsdb.c b/sbin/fsdb/fsdb.c
index f7354e86d9cc..5622cbb29196 100644
--- a/sbin/fsdb/fsdb.c
+++ b/sbin/fsdb/fsdb.c
@@ -396,7 +396,8 @@ const char *typename[] = {
"unregistered #13",
"whiteout",
};
-
+
+int diroff;
int slot;
int
@@ -404,9 +405,10 @@ scannames(struct inodesc *idesc)
{
struct direct *dirp = idesc->id_dirp;
- printf("slot %d ino %d reclen %d: %s, `%.*s'\n",
- slot++, dirp->d_ino, dirp->d_reclen, typename[dirp->d_type],
- dirp->d_namlen, dirp->d_name);
+ printf("slot %d off %d ino %d reclen %d: %s, `%.*s'\n",
+ slot++, diroff, dirp->d_ino, dirp->d_reclen,
+ typename[dirp->d_type], dirp->d_namlen, dirp->d_name);
+ diroff += dirp->d_reclen;
return (KEEPON);
}
@@ -416,6 +418,7 @@ CMDFUNCSTART(ls)
checkactivedir(); /* let it go on anyway */
slot = 0;
+ diroff = 0;
idesc.id_number = curinum;
idesc.id_func = scannames;
idesc.id_type = DATA;
diff --git a/sbin/fsdb/fsdbutil.c b/sbin/fsdb/fsdbutil.c
index d50c6c00d331..2c5710a80aa8 100644
--- a/sbin/fsdb/fsdbutil.c
+++ b/sbin/fsdb/fsdbutil.c
@@ -52,7 +52,7 @@ static const char rcsid[] =
#include "fsck.h"
static int charsperline(void);
-static int printindir(ufs2_daddr_t blk, int level, char *bufp);
+static void printindir(ufs2_daddr_t blk, int level, char *bufp);
static void printblocks(ino_t inum, union dinode *dp);
char **
@@ -226,7 +226,7 @@ charsperline(void)
/*
* Recursively print a list of indirect blocks.
*/
-static int
+static void
printindir(ufs2_daddr_t blk, int level, char *bufp)
{
struct bufarea buf, *bp;
@@ -234,6 +234,9 @@ printindir(ufs2_daddr_t blk, int level, char *bufp)
int i, j, cpl, charssofar;
ufs2_daddr_t blkno;
+ if (blk == 0)
+ return;
+ printf("%jd (%d) =>\n", (intmax_t)blk, level);
if (level == 0) {
/* for the final indirect level, don't use the cache */
bp = &buf;
@@ -251,11 +254,8 @@ printindir(ufs2_daddr_t blk, int level, char *bufp)
blkno = bp->b_un.b_indir1[i];
else
blkno = bp->b_un.b_indir2[i];
- if (blkno == 0) {
- if (level == 0)
- putchar('\n');
- return 0;
- }
+ if (blkno == 0)
+ continue;
j = sprintf(tempbuf, "%jd", (intmax_t)blkno);
if (level == 0) {
charssofar += j;
@@ -270,13 +270,14 @@ printindir(ufs2_daddr_t blk, int level, char *bufp)
charssofar += 2;
} else {
printf(" =>\n");
- if (printindir(blkno, level - 1, bufp) == 0)
- return 0;
+ printindir(blkno, level - 1, bufp);
+ printf("\n");
+ charssofar = 0;
}
}
if (level == 0)
putchar('\n');
- return 1;
+ return;
}
@@ -309,7 +310,7 @@ printblocks(ino_t inum, union dinode *dp)
}
}
putchar('\n');
- if (DIP(dp, di_ib[0]) == 0)
+ if (ndb == 0)
return;
bufp = malloc((unsigned int)sblock.fs_bsize);
@@ -317,8 +318,7 @@ printblocks(ino_t inum, union dinode *dp)
errx(EEXIT, "cannot allocate indirect block buffer");
printf("Indirect blocks:\n");
for (i = 0; i < NIADDR; i++)
- if (printindir(DIP(dp, di_ib[i]), i, bufp) == 0)
- break;
+ printindir(DIP(dp, di_ib[i]), i, bufp);
free(bufp);
}
diff --git a/sbin/tunefs/tunefs.8 b/sbin/tunefs/tunefs.8
index 53e463cfc705..a883cd4fc611 100644
--- a/sbin/tunefs/tunefs.8
+++ b/sbin/tunefs/tunefs.8
@@ -28,7 +28,7 @@
.\" @(#)tunefs.8 8.2 (Berkeley) 12/11/93
.\" $FreeBSD$
.\"
-.Dd October 21, 2009
+.Dd March 6, 2010
.Dt TUNEFS 8
.Os
.Sh NAME
@@ -40,6 +40,7 @@
.Op Fl a Cm enable | disable
.Op Fl e Ar maxbpg
.Op Fl f Ar avgfilesize
+.Op Fl j Cm enable | disable
.Op Fl J Cm enable | disable
.Op Fl L Ar volname
.Op Fl l Cm enable | disable
@@ -49,6 +50,7 @@
.Op Fl o Cm space | time
.Op Fl p
.Op Fl s Ar avgfpdir
+.Op Fl S Ar size
.Ar special | filesystem
.Sh DESCRIPTION
The
@@ -89,6 +91,8 @@ For file systems with exclusively large files,
this parameter should be set higher.
.It Fl f Ar avgfilesize
Specify the expected average file size.
+.It Fl j Cm enable | disable
+Turn on/off soft updates journaling.
.It Fl J Cm enable | disable
Turn on/off gjournal flag.
.It Fl L Ar volname
@@ -136,6 +140,9 @@ obtained from the
utility.
.It Fl s Ar avgfpdir
Specify the expected number of files per directory.
+.It Fl S Ar size
+Specify the softdep journal size in bytes.
+The minimum is 4M.
.El
.Pp
At least one of the above flags is required.
diff --git a/sbin/tunefs/tunefs.c b/sbin/tunefs/tunefs.c
index e4adb5267eb3..a10b35daec2b 100644
--- a/sbin/tunefs/tunefs.c
+++ b/sbin/tunefs/tunefs.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/dinode.h>
#include <ufs/ffs/fs.h>
+#include <ufs/ufs/dir.h>
#include <ctype.h>
#include <err.h>
@@ -61,6 +62,7 @@ __FBSDID("$FreeBSD$");
#include <paths.h>
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <string.h>
#include <unistd.h>
@@ -72,16 +74,20 @@ struct uufsd disk;
void usage(void);
void printfs(void);
+int journal_alloc(int64_t size);
+void journal_clear(void);
+void sbdirty(void);
int
main(int argc, char *argv[])
{
- char *avalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue;
+ char *avalue, *jvalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue;
const char *special, *on;
const char *name;
int active;
- int Aflag, aflag, eflag, evalue, fflag, fvalue, Jflag, Lflag, lflag;
- int mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag, svalue;
+ int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, Lflag;
+ int lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag;
+ int svalue, Sflag, Svalue;
int ch, found_arg, i;
const char *chg[2];
struct ufs_args args;
@@ -89,13 +95,13 @@ main(int argc, char *argv[])
if (argc < 3)
usage();
- Aflag = aflag = eflag = fflag = Jflag = Lflag = lflag = mflag = 0;
- Nflag = nflag = oflag = pflag = sflag = 0;
- avalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL;
- evalue = fvalue = mvalue = ovalue = svalue = 0;
+ Aflag = aflag = eflag = fflag = jflag = Jflag = Lflag = lflag = 0;
+ mflag = Nflag = nflag = oflag = pflag = sflag = 0;
+ avalue = jvalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL;
+ evalue = fvalue = mvalue = ovalue = svalue = Svalue = 0;
active = 0;
found_arg = 0; /* At least one arg is required. */
- while ((ch = getopt(argc, argv, "Aa:e:f:J:L:l:m:N:n:o:ps:")) != -1)
+ while ((ch = getopt(argc, argv, "Aa:e:f:j:J:L:l:m:N:n:o:ps:S:")) != -1)
switch (ch) {
case 'A':
@@ -135,6 +141,18 @@ main(int argc, char *argv[])
fflag = 1;
break;
+ case 'j':
+ found_arg = 1;
+ name = "softdep journaled file system";
+ jvalue = optarg;
+ if (strcmp(jvalue, "enable") &&
+ strcmp(jvalue, "disable")) {
+ errx(10, "bad %s (options are %s)",
+ name, "`enable' or `disable'");
+ }
+ jflag = 1;
+ break;
+
case 'J':
found_arg = 1;
name = "gjournaled file system";
@@ -240,6 +258,16 @@ main(int argc, char *argv[])
sflag = 1;
break;
+ case 'S':
+ found_arg = 1;
+ name = "Softdep Journal Size";
+ Svalue = atoi(optarg);
+ if (Svalue < SUJ_MIN)
+ errx(10, "%s must be >= %d (was %s)",
+ name, SUJ_MIN, optarg);
+ Sflag = 1;
+ break;
+
default:
usage();
}
@@ -310,6 +338,33 @@ main(int argc, char *argv[])
sblock.fs_avgfilesize = fvalue;
}
}
+ if (jflag) {
+ name = "soft updates journaling";
+ if (strcmp(jvalue, "enable") == 0) {
+ if ((sblock.fs_flags & (FS_DOSOFTDEP | FS_SUJ)) ==
+ (FS_DOSOFTDEP | FS_SUJ)) {
+ warnx("%s remains unchanged as enabled", name);
+ } else if (sblock.fs_clean == 0) {
+ warnx("%s cannot be enabled until fsck is run",
+ name);
+ } else if (journal_alloc(Svalue) != 0) {
+ warnx("%s can not be enabled", name);
+ } else {
+ sblock.fs_flags |= FS_DOSOFTDEP | FS_SUJ;
+ warnx("%s set", name);
+ }
+ } else if (strcmp(jvalue, "disable") == 0) {
+ if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) {
+ warnx("%s remains unchanged as disabled", name);
+ } else {
+ journal_clear();
+ sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ);
+ sblock.fs_sujfree = 0;
+ warnx("%s cleared, "
+ "remove .sujournal to reclaim space", name);
+ }
+ }
+ }
if (Jflag) {
name = "gjournal";
if (strcmp(Jvalue, "enable") == 0) {
@@ -456,6 +511,500 @@ err:
}
void
+sbdirty(void)
+{
+ disk.d_fs.fs_flags |= FS_UNCLEAN | FS_NEEDSFSCK;
+ disk.d_fs.fs_clean = 0;
+}
+
+int blocks;
+static char clrbuf[MAXBSIZE];
+
+static ufs2_daddr_t
+journal_balloc(void)
+{
+ ufs2_daddr_t blk;
+ struct cg *cgp;
+ int valid;
+ static int contig = 1;
+
+ cgp = &disk.d_cg;
+ for (;;) {
+ blk = cgballoc(&disk);
+ if (blk > 0)
+ break;
+ /*
+ * If we failed to allocate a block from this cg, move to
+ * the next.
+ */
+ if (cgwrite(&disk) < 0) {
+ warn("Failed to write updated cg");
+ return (-1);
+ }
+ while ((valid = cgread(&disk)) == 1) {
+ /*
+ * Try to minimize fragmentation by requiring a minimum
+ * number of blocks present.
+ */
+ if (cgp->cg_cs.cs_nbfree > blocks / 8)
+ break;
+ if (contig == 0 && cgp->cg_cs.cs_nbfree)
+ break;
+ }
+ if (valid)
+ continue;
+ /*
+ * Try once through looking only for large contiguous regions
+ * and again taking any space we can find.
+ */
+ if (contig) {
+ contig = 0;
+ disk.d_ccg = 0;
+ warnx("Journal file fragmented.");
+ continue;
+ }
+ warnx("Failed to find sufficient free blocks for the journal");
+ return -1;
+ }
+ if (bwrite(&disk, fsbtodb(&sblock, blk), clrbuf,
+ sblock.fs_bsize) <= 0) {
+ warn("Failed to initialize new block");
+ return -1;
+ }
+ return (blk);
+}
+
+/*
+ * Search a directory block for the SUJ_FILE.
+ */
+static ino_t
+dir_search(ufs2_daddr_t blk, int bytes)
+{
+ char block[MAXBSIZE];
+ struct direct *dp;
+ int off;
+
+ if (bread(&disk, fsbtodb(&sblock, blk), block, bytes) <= 0) {
+ warn("Failed to read dir block");
+ return (-1);
+ }
+ for (off = 0; off < bytes; off += dp->d_reclen) {
+ dp = (struct direct *)&block[off];
+ if (dp->d_reclen == 0)
+ break;
+ if (dp->d_ino == 0)
+ continue;
+ if (dp->d_namlen != strlen(SUJ_FILE))
+ continue;
+ if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
+ continue;
+ return (dp->d_ino);
+ }
+
+ return (0);
+}
+
+/*
+ * Search in the ROOTINO for the SUJ_FILE. If it exists we can not enable
+ * journaling.
+ */
+static ino_t
+journal_findfile(void)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	ino_t ino;
+	int mode;
+	void *ip;
+	int i;
+
+	if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
+		warn("Failed to get root inode");
+		return (-1);
+	}
+	dp2 = ip;
+	dp1 = ip;
+	if (sblock.fs_magic == FS_UFS1_MAGIC) {
+		if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
+			warnx("ROOTINO extends beyond direct blocks.");
+			return (-1);
+		}
+		for (i = 0; i < NDADDR; i++) {
+			if (dp1->di_db[i] == 0)
+				break;
+			if ((ino = dir_search(dp1->di_db[i],
+			    sblksize(&sblock, (off_t)dp1->di_size, i))) != 0)
+				return (ino);
+		}
+	} else {
+		if ((off_t)dp2->di_size >= lblktosize(&sblock, NDADDR)) {
+			warnx("ROOTINO extends beyond direct blocks.");
+			return (-1);
+		}
+		for (i = 0; i < NDADDR; i++) {
+			if (dp2->di_db[i] == 0)
+				break;
+			if ((ino = dir_search(dp2->di_db[i],
+			    sblksize(&sblock, (off_t)dp2->di_size, i))) != 0)
+				return (ino);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Insert the journal at inode 'ino' into directory blk 'blk' at the first
+ * free offset of 'off'. DIRBLKSIZ blocks after off are initialized as
+ * empty.
+ */
+static int
+dir_insert(ufs2_daddr_t blk, off_t off, ino_t ino)
+{
+ struct direct *dp;
+ char block[MAXBSIZE];
+
+ if (bread(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
+ warn("Failed to read dir block");
+ return (-1);
+ }
+ bzero(&block[off], sblock.fs_bsize - off);
+ dp = (struct direct *)&block[off];
+ dp->d_ino = ino;
+ dp->d_reclen = DIRBLKSIZ;
+ dp->d_type = DT_REG;
+ dp->d_namlen = strlen(SUJ_FILE);
+ bcopy(SUJ_FILE, &dp->d_name, strlen(SUJ_FILE));
+ off += DIRBLKSIZ;
+ for (; off < sblock.fs_bsize; off += DIRBLKSIZ) {
+ dp = (struct direct *)&block[off];
+ dp->d_ino = 0;
+ dp->d_reclen = DIRBLKSIZ;
+ dp->d_type = DT_UNKNOWN;
+ }
+ if (bwrite(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
+ warn("Failed to write dir block");
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * Extend a directory block in 'blk' by copying it to a full size block
+ * and inserting the new journal inode into .sujournal.
+ */
+static int
+dir_extend(ufs2_daddr_t blk, ufs2_daddr_t nblk, off_t size, ino_t ino)
+{
+ char block[MAXBSIZE];
+
+ if (bread(&disk, fsbtodb(&sblock, blk), block, size) <= 0) {
+ warn("Failed to read dir block");
+ return (-1);
+ }
+ if (bwrite(&disk, fsbtodb(&sblock, nblk), block, size) <= 0) {
+ warn("Failed to write dir block");
+ return (-1);
+ }
+
+ return dir_insert(nblk, size, ino);
+}
+
+/*
+ * Insert the journal file into the ROOTINO directory. We always extend the
+ * last frag into a full-size directory block.
+ */
+static int
+journal_insertfile(ino_t ino)
+{
+ struct ufs1_dinode *dp1;
+ struct ufs2_dinode *dp2;
+ void *ip;
+ ufs2_daddr_t nblk;
+ ufs2_daddr_t blk;
+ ufs_lbn_t lbn;
+ int size;
+ int mode;
+ int off;
+
+ if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
+ warn("Failed to get root inode");
+ sbdirty();
+ return (-1);
+ }
+ dp2 = ip;
+ dp1 = ip;
+ blk = 0;
+ size = 0;
+ nblk = journal_balloc();
+ if (nblk <= 0)
+ return (-1);
+ /*
+ * For simplicity's sake we always extend the ROOTINO into a new
+ * directory block rather than searching for space and inserting
+ * into an existing block. However, if the rootino has frags we
+ * have to free them and extend the block.
+ */
+ if (sblock.fs_magic == FS_UFS1_MAGIC) {
+ lbn = lblkno(&sblock, dp1->di_size);
+ off = blkoff(&sblock, dp1->di_size);
+ blk = dp1->di_db[lbn];
+ size = sblksize(&sblock, (off_t)dp1->di_size, lbn);
+ } else {
+ lbn = lblkno(&sblock, dp2->di_size);
+ off = blkoff(&sblock, dp2->di_size);
+ blk = dp2->di_db[lbn];
+ size = sblksize(&sblock, (off_t)dp2->di_size, lbn);
+ }
+ if (off != 0) {
+ if (dir_extend(blk, nblk, off, ino) == -1)
+ return (-1);
+ } else {
+ blk = 0;
+ if (dir_insert(nblk, 0, ino) == -1)
+ return (-1);
+ }
+ if (sblock.fs_magic == FS_UFS1_MAGIC) {
+ dp1->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
+ dp1->di_db[lbn] = nblk;
+ dp1->di_size = lblktosize(&sblock, lbn+1);
+ } else {
+ dp2->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
+ dp2->di_db[lbn] = nblk;
+ dp2->di_size = lblktosize(&sblock, lbn+1);
+ }
+ if (putino(&disk) < 0) {
+ warn("Failed to write root inode");
+ return (-1);
+ }
+ if (cgwrite(&disk) < 0) {
+ warn("Failed to write updated cg");
+ sbdirty();
+ return (-1);
+ }
+ if (blk) {
+ if (cgbfree(&disk, blk, size) < 0) {
+ warn("Failed to write cg");
+ return (-1);
+ }
+ }
+
+ return (0);
+}
+
+static int
+indir_fill(ufs2_daddr_t blk, int level, int *resid)
+{
+ char indirbuf[MAXBSIZE];
+ ufs1_daddr_t *bap1;
+ ufs2_daddr_t *bap2;
+ ufs2_daddr_t nblk;
+ int ncnt;
+ int cnt;
+ int i;
+
+ bzero(indirbuf, sizeof(indirbuf));
+ bap1 = (ufs1_daddr_t *)indirbuf;
+ bap2 = (void *)bap1;
+ cnt = 0;
+ for (i = 0; i < NINDIR(&sblock) && *resid != 0; i++) {
+ nblk = journal_balloc();
+ if (nblk <= 0)
+ return (-1);
+ cnt++;
+ if (sblock.fs_magic == FS_UFS1_MAGIC)
+ *bap1++ = nblk;
+ else
+ *bap2++ = nblk;
+ if (level != 0) {
+ ncnt = indir_fill(nblk, level - 1, resid);
+ if (ncnt <= 0)
+ return (-1);
+ cnt += ncnt;
+ } else
+ (*resid)--;
+ }
+ if (bwrite(&disk, fsbtodb(&sblock, blk), indirbuf,
+ sblock.fs_bsize) <= 0) {
+ warn("Failed to write indirect");
+ return (-1);
+ }
+ return (cnt);
+}
+
+/*
+ * Clear the flag bits so the journal can be removed.
+ */
+void
+journal_clear(void)
+{
+ struct ufs1_dinode *dp1;
+ struct ufs2_dinode *dp2;
+ ino_t ino;
+ int mode;
+ void *ip;
+
+ ino = journal_findfile();
+ if (ino == (ino_t)-1 || ino == 0) {
+ warnx("Journal file does not exist");
+ return;
+ }
+ printf("Clearing journal flags from inode %d\n", ino);
+ if (getino(&disk, &ip, ino, &mode) != 0) {
+ warn("Failed to get journal inode");
+ return;
+ }
+ dp2 = ip;
+ dp1 = ip;
+ if (sblock.fs_magic == FS_UFS1_MAGIC)
+ dp1->di_flags = 0;
+ else
+ dp2->di_flags = 0;
+ if (putino(&disk) < 0) {
+ warn("Failed to write journal inode");
+ return;
+ }
+}
+
+int
+journal_alloc(int64_t size)
+{
+ struct ufs1_dinode *dp1;
+ struct ufs2_dinode *dp2;
+ ufs2_daddr_t blk;
+ void *ip;
+ struct cg *cgp;
+ int resid;
+ ino_t ino;
+ int blks;
+ int mode;
+ int i;
+
+ cgp = &disk.d_cg;
+ ino = 0;
+
+ /*
+ * If the journal file exists we can't allocate it.
+ */
+ ino = journal_findfile();
+ if (ino == (ino_t)-1)
+ return (-1);
+ if (ino > 0) {
+ warnx("Journal file %s already exists, please remove.",
+ SUJ_FILE);
+ return (-1);
+ }
+	/*
+	 * If the user didn't supply a size, pick one based on the filesystem
+	 * size, constrained by hardcoded MIN and MAX values. We opt for
+	 * 1/1024th of the filesystem up to MAX but not exceeding one CG and
+	 * not less than the MIN.
+	 */
+ if (size == 0) {
+ size = (sblock.fs_size * sblock.fs_bsize) / 1024;
+ size = MIN(SUJ_MAX, size);
+ if (size / sblock.fs_fsize > sblock.fs_fpg)
+ size = sblock.fs_fpg * sblock.fs_fsize;
+ size = MAX(SUJ_MIN, size);
+ }
+ resid = blocks = size / sblock.fs_bsize;
+ if (sblock.fs_cstotal.cs_nbfree < blocks) {
+ warn("Insufficient free space for %jd byte journal", size);
+ return (-1);
+ }
+ /*
+ * Find a cg with enough blocks to satisfy the journal
+ * size. Presently the journal does not span cgs.
+ */
+ while (cgread(&disk) == 1) {
+ if (cgp->cg_cs.cs_nifree == 0)
+ continue;
+ ino = cgialloc(&disk);
+ if (ino <= 0)
+ break;
+ printf("Using inode %d in cg %d for %jd byte journal\n",
+ ino, cgp->cg_cgx, size);
+ if (getino(&disk, &ip, ino, &mode) != 0) {
+ warn("Failed to get allocated inode");
+ sbdirty();
+ goto out;
+ }
+ /*
+ * We leave fields unrelated to the number of allocated
+ * blocks and size uninitialized. This causes legacy
+ * fsck implementations to clear the inode.
+ */
+ dp2 = ip;
+ dp1 = ip;
+ if (sblock.fs_magic == FS_UFS1_MAGIC) {
+ bzero(dp1, sizeof(*dp1));
+ dp1->di_size = size;
+ dp1->di_mode = IFREG | IREAD;
+ dp1->di_nlink = 1;
+ dp1->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP;
+ } else {
+ bzero(dp2, sizeof(*dp2));
+ dp2->di_size = size;
+ dp2->di_mode = IFREG | IREAD;
+ dp2->di_nlink = 1;
+ dp2->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP;
+ }
+ for (i = 0; i < NDADDR && resid; i++, resid--) {
+ blk = journal_balloc();
+ if (blk <= 0)
+ goto out;
+ if (sblock.fs_magic == FS_UFS1_MAGIC) {
+ dp1->di_db[i] = blk;
+ dp1->di_blocks++;
+ } else {
+ dp2->di_db[i] = blk;
+ dp2->di_blocks++;
+ }
+ }
+ for (i = 0; i < NIADDR && resid; i++) {
+ blk = journal_balloc();
+ if (blk <= 0)
+ goto out;
+ blks = indir_fill(blk, i, &resid) + 1;
+ if (blks <= 0) {
+ sbdirty();
+ goto out;
+ }
+ if (sblock.fs_magic == FS_UFS1_MAGIC) {
+ dp1->di_ib[i] = blk;
+ dp1->di_blocks += blks;
+ } else {
+ dp2->di_ib[i] = blk;
+ dp2->di_blocks += blks;
+ }
+ }
+ if (sblock.fs_magic == FS_UFS1_MAGIC)
+ dp1->di_blocks *= sblock.fs_bsize / disk.d_bsize;
+ else
+ dp2->di_blocks *= sblock.fs_bsize / disk.d_bsize;
+ if (putino(&disk) < 0) {
+ warn("Failed to write inode");
+ sbdirty();
+ return (-1);
+ }
+ if (cgwrite(&disk) < 0) {
+ warn("Failed to write updated cg");
+ sbdirty();
+ return (-1);
+ }
+ if (journal_insertfile(ino) < 0) {
+ sbdirty();
+ return (-1);
+ }
+ sblock.fs_sujfree = 0;
+ return (0);
+ }
+ warnx("Insufficient free space for the journal.");
+out:
+ return (-1);
+}
+
+void
usage(void)
{
fprintf(stderr, "%s\n%s\n%s\n%s\n",
@@ -477,6 +1026,8 @@ printfs(void)
(sblock.fs_flags & FS_MULTILABEL)? "enabled" : "disabled");
warnx("soft updates: (-n) %s",
(sblock.fs_flags & FS_DOSOFTDEP)? "enabled" : "disabled");
+ warnx("soft update journaling: (-j) %s",
+ (sblock.fs_flags & FS_SUJ)? "enabled" : "disabled");
warnx("gjournal: (-J) %s",
(sblock.fs_flags & FS_GJOURNAL)? "enabled" : "disabled");
warnx("maximum blocks per file in a cylinder group: (-e) %d",
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 4e9cfc699575..156b676cf6be 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -216,6 +216,14 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
static int bd_request;
/*
+ * Request for the buf daemon to write more buffers than is indicated by
+ * lodirtybuffers. This may be necessary to push out excess dependencies or
+ * defragment the address space where a simple count of the number of dirty
+ * buffers is insufficient to characterize the demand for flushing them.
+ */
+static int bd_speedupreq;
+
+/*
* This lock synchronizes access to bd_request.
*/
static struct mtx bdlock;
@@ -467,12 +475,20 @@ bd_wakeup(int dirtybuflevel)
* bd_speedup - speedup the buffer cache flushing code
*/
-static __inline
void
bd_speedup(void)
{
+ int needwake;
- bd_wakeup(1);
+ mtx_lock(&bdlock);
+ needwake = 0;
+ if (bd_speedupreq == 0 || bd_request == 0)
+ needwake = 1;
+ bd_speedupreq = 1;
+ bd_request = 1;
+ if (needwake)
+ wakeup(&bd_request);
+ mtx_unlock(&bdlock);
}
/*
@@ -2120,6 +2136,7 @@ buf_do_flush(struct vnode *vp)
static void
buf_daemon()
{
+ int lodirtysave;
/*
* This process needs to be suspended prior to shutdown sync.
@@ -2137,7 +2154,11 @@ buf_daemon()
mtx_unlock(&bdlock);
kproc_suspend_check(bufdaemonproc);
-
+ lodirtysave = lodirtybuffers;
+ if (bd_speedupreq) {
+ lodirtybuffers = numdirtybuffers / 2;
+ bd_speedupreq = 0;
+ }
/*
* Do the flush. Limit the amount of in-transit I/O we
* allow to build up, otherwise we would completely saturate
@@ -2149,6 +2170,7 @@ buf_daemon()
break;
uio_yield();
}
+ lodirtybuffers = lodirtysave;
/*
* Only clear bd_request if we have reached our low water
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 4810e769b157..ae182e008391 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2815,6 +2815,7 @@ DB_SHOW_COMMAND(mount, db_show_mount)
MNT_FLAG(MNT_FORCE);
MNT_FLAG(MNT_SNAPSHOT);
MNT_FLAG(MNT_BYFSID);
+ MNT_FLAG(MNT_SOFTDEP);
#undef MNT_FLAG
if (flags != 0) {
if (buf[0] != '\0')
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 8f3b1b2ab3fe..137f90f6d72a 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -215,7 +215,7 @@ struct buf {
#define B_DIRTY 0x00200000 /* Needs writing later (in EXT2FS). */
#define B_RELBUF 0x00400000 /* Release VMIO buffer. */
#define B_00800000 0x00800000 /* Available flag. */
-#define B_01000000 0x01000000 /* Available flag. */
+#define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */
#define B_NEEDSGIANT 0x02000000 /* Buffer's vnode needs giant. */
#define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */
#define B_MANAGED 0x08000000 /* Managed by FS. */
@@ -493,6 +493,7 @@ int bufwait(struct buf *);
int bufwrite(struct buf *);
void bufdone(struct buf *);
void bufdone_finish(struct buf *);
+void bd_speedup(void);
int cluster_read(struct vnode *, u_quad_t, daddr_t, long,
struct ucred *, long, int, struct buf **);
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index b8264130a176..20dcf641c7bd 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -275,7 +275,8 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \
MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \
MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \
- MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | MNT_NFS4ACLS)
+ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \
+ MNT_NFS4ACLS)
/* Mask of flags that can be updated. */
#define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \
@@ -324,6 +325,7 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
#define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */
#define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */
#define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */
+#define MNTK_SUJ 0x00000100 /* Softdep journaling enabled */
#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */
#define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */
#define MNTK_SUSPEND 0x08000000 /* request write suspension */
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index 7bf117719726..b1f7ba0127f7 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -94,24 +94,24 @@ __FBSDID("$FreeBSD$");
#include <ufs/ffs/ffs_extern.h>
typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
- int size);
+ int size, int rsize);
-static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int);
+static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
static ufs2_daddr_t
- ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t);
+ ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
#ifdef INVARIANTS
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
-static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
-static void ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *,
- ufs1_daddr_t, int);
+static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int,
+ int);
static ino_t ffs_dirpref(struct inode *);
static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
int, int);
static void ffs_fserr(struct fs *, ino_t, char *);
static ufs2_daddr_t ffs_hashalloc
- (struct inode *, u_int, ufs2_daddr_t, int, allocfcn_t *);
-static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int);
+ (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
+static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
+ int);
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
@@ -188,7 +188,7 @@ retry:
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
- bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
+ bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
if (bno > 0) {
delta = btodb(size);
if (ip->i_flag & IN_SPACECOUNTED) {
@@ -387,16 +387,12 @@ retry:
panic("ffs_realloccg: bad optim");
/* NOTREACHED */
}
- bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
+ bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize,
- ip->i_number);
- if (nsize < request)
- ffs_blkfree(ump, fs, ip->i_devvp,
- bno + numfrags(fs, nsize),
- (long)(request - nsize), ip->i_number);
+ ip->i_number, NULL);
delta = btodb(nsize - osize);
if (ip->i_flag & IN_SPACECOUNTED) {
UFS_LOCK(ump);
@@ -487,6 +483,14 @@ ffs_reallocblks(ap)
if (doreallocblks == 0)
return (ENOSPC);
+ /*
+ * We can't wait in softdep prealloc as it may fsync and recurse
+ * here. Instead we simply fail to reallocate blocks if this
+ * rare condition arises.
+ */
+ if (DOINGSOFTDEP(ap->a_vp))
+ if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
+ return (ENOSPC);
if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1)
return (ffs_reallocblks_ufs1(ap));
return (ffs_reallocblks_ufs2(ap));
@@ -587,7 +591,7 @@ ffs_reallocblks_ufs1(ap)
* Search the block map looking for an allocation of the desired size.
*/
if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
- len, ffs_clusteralloc)) == 0) {
+ len, len, ffs_clusteralloc)) == 0) {
UFS_UNLOCK(ump);
goto fail;
}
@@ -673,7 +677,7 @@ ffs_reallocblks_ufs1(ap)
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
@@ -795,7 +799,7 @@ ffs_reallocblks_ufs2(ap)
* Search the block map looking for an allocation of the desired size.
*/
if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
- len, ffs_clusteralloc)) == 0) {
+ len, len, ffs_clusteralloc)) == 0) {
UFS_UNLOCK(ump);
goto fail;
}
@@ -881,7 +885,7 @@ ffs_reallocblks_ufs2(ap)
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
@@ -969,7 +973,7 @@ ffs_valloc(pvp, mode, cred, vpp)
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
- ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode,
+ ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
(allocfcn_t *)ffs_nodealloccg);
if (ino == 0)
goto noinodes;
@@ -1278,11 +1282,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap)
*/
/*VARARGS5*/
static ufs2_daddr_t
-ffs_hashalloc(ip, cg, pref, size, allocator)
+ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
struct inode *ip;
u_int cg;
ufs2_daddr_t pref;
- int size; /* size for data blocks, mode for inodes */
+ int size; /* Search size for data blocks, mode for inodes */
+ int rsize; /* Real allocated size. */
allocfcn_t *allocator;
{
struct fs *fs;
@@ -1298,7 +1303,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
/*
* 1: preferred cylinder group
*/
- result = (*allocator)(ip, cg, pref, size);
+ result = (*allocator)(ip, cg, pref, size, rsize);
if (result)
return (result);
/*
@@ -1308,7 +1313,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
- result = (*allocator)(ip, cg, 0, size);
+ result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
}
@@ -1319,7 +1324,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
- result = (*allocator)(ip, cg, 0, size);
+ result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
cg++;
@@ -1401,7 +1406,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize)
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
+ frags, numfrags(fs, osize));
bdwrite(bp);
return (bprev);
@@ -1419,11 +1425,12 @@ fail:
* and if it is, allocate it.
*/
static ufs2_daddr_t
-ffs_alloccg(ip, cg, bpref, size)
+ffs_alloccg(ip, cg, bpref, size, rsize)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int size;
+ int rsize;
{
struct fs *fs;
struct cg *cgp;
@@ -1451,7 +1458,7 @@ ffs_alloccg(ip, cg, bpref, size)
cgp->cg_old_time = cgp->cg_time = time_second;
if (size == fs->fs_bsize) {
UFS_LOCK(ump);
- blkno = ffs_alloccgblk(ip, bp, bpref);
+ blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
@@ -1475,21 +1482,14 @@ ffs_alloccg(ip, cg, bpref, size)
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
UFS_LOCK(ump);
- blkno = ffs_alloccgblk(ip, bp, bpref);
- bno = dtogd(fs, blkno);
- for (i = frags; i < fs->fs_frag; i++)
- setbit(blksfree, bno + i);
- i = fs->fs_frag - frags;
- cgp->cg_cs.cs_nffree += i;
- fs->fs_cstotal.cs_nffree += i;
- fs->fs_cs(fs, cg).cs_nffree += i;
- fs->fs_fmod = 1;
- cgp->cg_frsum[i]++;
+ blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (blkno);
}
+ KASSERT(size == rsize,
+ ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
if (bno < 0)
goto fail;
@@ -1507,7 +1507,7 @@ ffs_alloccg(ip, cg, bpref, size)
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
bdwrite(bp);
return (blkno);
@@ -1529,10 +1529,11 @@ fail:
* blocks may be fragmented by the routine that allocates them.
*/
static ufs2_daddr_t
-ffs_alloccgblk(ip, bp, bpref)
+ffs_alloccgblk(ip, bp, bpref, size)
struct inode *ip;
struct buf *bp;
ufs2_daddr_t bpref;
+ int size;
{
struct fs *fs;
struct cg *cgp;
@@ -1540,6 +1541,7 @@ ffs_alloccgblk(ip, bp, bpref)
ufs1_daddr_t bno;
ufs2_daddr_t blkno;
u_int8_t *blksfree;
+ int i;
fs = ip->i_fs;
ump = ip->i_ump;
@@ -1567,16 +1569,32 @@ ffs_alloccgblk(ip, bp, bpref)
gotit:
blkno = fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, (long)blkno);
- ffs_clusteracct(ump, fs, cgp, blkno, -1);
+ ffs_clusteracct(fs, cgp, blkno, -1);
cgp->cg_cs.cs_nbfree--;
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
fs->fs_fmod = 1;
blkno = cgbase(fs, cgp->cg_cgx) + bno;
+ /*
+ * If the caller didn't want the whole block free the frags here.
+ */
+ size = numfrags(fs, size);
+ if (size != fs->fs_frag) {
+ bno = dtogd(fs, blkno);
+ for (i = size; i < fs->fs_frag; i++)
+ setbit(blksfree, bno + i);
+ i = fs->fs_frag - size;
+ cgp->cg_cs.cs_nffree += i;
+ fs->fs_cstotal.cs_nffree += i;
+ fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
+ fs->fs_fmod = 1;
+ cgp->cg_frsum[i]++;
+ }
/* XXX Fixme. */
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
+ size, 0);
UFS_LOCK(ump);
return (blkno);
}
@@ -1589,11 +1607,12 @@ gotit:
* take the first one that we find following bpref.
*/
static ufs2_daddr_t
-ffs_clusteralloc(ip, cg, bpref, len)
+ffs_clusteralloc(ip, cg, bpref, len, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int len;
+ int unused;
{
struct fs *fs;
struct cg *cgp;
@@ -1689,7 +1708,7 @@ ffs_clusteralloc(ip, cg, bpref, len)
len = blkstofrags(fs, len);
UFS_LOCK(ump);
for (i = 0; i < len; i += fs->fs_frag)
- if (ffs_alloccgblk(ip, bp, bno + i) != bno + i)
+ if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
panic("ffs_clusteralloc: lost block");
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
@@ -1713,11 +1732,12 @@ fail:
* inode in the specified cylinder group.
*/
static ufs2_daddr_t
-ffs_nodealloccg(ip, cg, ipref, mode)
+ffs_nodealloccg(ip, cg, ipref, mode, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t ipref;
int mode;
+ int unused;
{
struct fs *fs;
struct cg *cgp;
@@ -1820,28 +1840,6 @@ gotit:
}
/*
- * check if a block is free
- */
-static int
-ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
-{
-
- switch ((int)fs->fs_frag) {
- case 8:
- return (cp[h] == 0);
- case 4:
- return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
- case 2:
- return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
- case 1:
- return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
- default:
- panic("ffs_isfreeblock");
- }
- return (0);
-}
-
-/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
@@ -1849,14 +1847,16 @@ ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
* block reassembly is checked.
*/
void
-ffs_blkfree(ump, fs, devvp, bno, size, inum)
+ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
+ struct workhead *dephd;
{
+ struct mount *mp;
struct cg *cgp;
struct buf *bp;
ufs1_daddr_t fragno, cgbno;
@@ -1923,7 +1923,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
panic("ffs_blkfree: freeing free block");
}
ffs_setblock(fs, blksfree, fragno);
- ffs_clusteracct(ump, fs, cgp, fragno, 1);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -1963,7 +1963,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
cgp->cg_cs.cs_nffree -= fs->fs_frag;
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
- ffs_clusteracct(ump, fs, cgp, fragno, 1);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -1972,6 +1972,10 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
+ mp = UFSTOVFS(ump);
+ if (mp->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+ softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
+ numfrags(fs, size), dephd);
bdwrite(bp);
}
@@ -2042,7 +2046,8 @@ ffs_vfree(pvp, ino, mode)
return (0);
}
ip = VTOI(pvp);
- return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode));
+ return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode,
+ NULL));
}
/*
@@ -2050,12 +2055,13 @@ ffs_vfree(pvp, ino, mode)
* The specified inode is placed back in the free map.
*/
int
-ffs_freefile(ump, fs, devvp, ino, mode)
+ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ino_t ino;
int mode;
+ struct workhead *wkhd;
{
struct cg *cgp;
struct buf *bp;
@@ -2112,6 +2118,9 @@ ffs_freefile(ump, fs, devvp, ino, mode)
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
+ if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+ softdep_setup_inofree(UFSTOVFS(ump), bp,
+ ino + cg * fs->fs_ipg, wkhd);
bdwrite(bp);
return (0);
}
@@ -2226,101 +2235,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz)
}
/*
- * Update the cluster map because of an allocation or free.
- *
- * Cnt == 1 means free; cnt == -1 means allocating.
- */
-void
-ffs_clusteracct(ump, fs, cgp, blkno, cnt)
- struct ufsmount *ump;
- struct fs *fs;
- struct cg *cgp;
- ufs1_daddr_t blkno;
- int cnt;
-{
- int32_t *sump;
- int32_t *lp;
- u_char *freemapp, *mapp;
- int i, start, end, forw, back, map, bit;
-
- mtx_assert(UFS_MTX(ump), MA_OWNED);
-
- if (fs->fs_contigsumsize <= 0)
- return;
- freemapp = cg_clustersfree(cgp);
- sump = cg_clustersum(cgp);
- /*
- * Allocate or clear the actual block.
- */
- if (cnt > 0)
- setbit(freemapp, blkno);
- else
- clrbit(freemapp, blkno);
- /*
- * Find the size of the cluster going forward.
- */
- start = blkno + 1;
- end = start + fs->fs_contigsumsize;
- if (end >= cgp->cg_nclusterblks)
- end = cgp->cg_nclusterblks;
- mapp = &freemapp[start / NBBY];
- map = *mapp++;
- bit = 1 << (start % NBBY);
- for (i = start; i < end; i++) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != (NBBY - 1)) {
- bit <<= 1;
- } else {
- map = *mapp++;
- bit = 1;
- }
- }
- forw = i - start;
- /*
- * Find the size of the cluster going backward.
- */
- start = blkno - 1;
- end = start - fs->fs_contigsumsize;
- if (end < 0)
- end = -1;
- mapp = &freemapp[start / NBBY];
- map = *mapp--;
- bit = 1 << (start % NBBY);
- for (i = start; i > end; i--) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != 0) {
- bit >>= 1;
- } else {
- map = *mapp--;
- bit = 1 << (NBBY - 1);
- }
- }
- back = start - i;
- /*
- * Account for old cluster and the possibly new forward and
- * back clusters.
- */
- i = back + forw + 1;
- if (i > fs->fs_contigsumsize)
- i = fs->fs_contigsumsize;
- sump[i] += cnt;
- if (back > 0)
- sump[back] -= cnt;
- if (forw > 0)
- sump[forw] -= cnt;
- /*
- * Update cluster summary information.
- */
- lp = &sump[fs->fs_contigsumsize];
- for (i = fs->fs_contigsumsize; i > 0; i--)
- if (*lp-- > 0)
- break;
- fs->fs_maxcluster[cgp->cg_cgx] = i;
-}
-
-/*
* Fserr prints the name of a filesystem with an error diagnostic.
*
* The form of the error message is:
@@ -2540,7 +2454,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
#endif /* DEBUG */
while (cmd.size > 0) {
if ((error = ffs_freefile(ump, fs, ump->um_devvp,
- cmd.value, filetype)))
+ cmd.value, filetype, NULL)))
break;
cmd.size -= 1;
cmd.value += 1;
@@ -2568,7 +2482,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
if (blksize > blkcnt)
blksize = blkcnt;
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
- blksize * fs->fs_fsize, ROOTINO);
+ blksize * fs->fs_fsize, ROOTINO, NULL);
blkno += blksize;
blkcnt -= blksize;
blksize = fs->fs_frag;
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index a12f96e60d0e..6d5f27c1f306 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -120,6 +120,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
if (lbn < 0)
return (EFBIG);
+ if (DOINGSOFTDEP(vp))
+ softdep_prealloc(vp, MNT_WAIT);
/*
* If the next write will extend the file into a new block,
* and the file is currently composed of a fragment
@@ -418,6 +420,8 @@ fail:
* slow, running out of disk space is not expected to be a common
* occurence. The error return from fsync is ignored as we already
* have an error to return to the user.
+ *
+ * XXX Still have to journal the free below
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -473,7 +477,7 @@ fail:
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
}
return (error);
}
@@ -515,6 +519,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
if (lbn < 0)
return (EFBIG);
+ if (DOINGSOFTDEP(vp))
+ softdep_prealloc(vp, MNT_WAIT);
+
/*
* Check for allocating external data.
*/
@@ -930,6 +937,8 @@ fail:
* slow, running out of disk space is not expected to be a common
* occurence. The error return from fsync is ignored as we already
* have an error to return to the user.
+ *
+ * XXX Still have to journal the free below
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -985,7 +994,7 @@ fail:
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
}
return (error);
}
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index 7e32ced2ebe2..7011623749ba 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -47,6 +47,7 @@ struct ucred;
struct vnode;
struct vop_fsync_args;
struct vop_reallocblks_args;
+struct workhead;
int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int,
struct ucred *, ufs2_daddr_t *);
@@ -56,20 +57,23 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size,
struct ucred *a_cred, int a_flags, struct buf **a_bpp);
int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
- ufs2_daddr_t, long, ino_t);
+ ufs2_daddr_t, long, ino_t, struct workhead *);
ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
void ffs_bdflush(struct bufobj *, struct buf *);
int ffs_copyonwrite(struct vnode *, struct buf *);
int ffs_flushfiles(struct mount *, int, struct thread *);
void ffs_fragacct(struct fs *, int, int32_t [], int);
int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t,
- int);
+ int, struct workhead *);
int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
+int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
int ffs_mountroot(void);
+void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
int ffs_reallocblks(struct vop_reallocblks_args *);
int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
ufs2_daddr_t, int, int, int, struct ucred *, struct buf **);
@@ -103,12 +107,14 @@ extern struct vop_vector ffs_fifoops2;
int softdep_check_suspend(struct mount *, struct vnode *,
int, int, int, int);
+int softdep_complete_trunc(struct vnode *, void *);
void softdep_get_depcounts(struct mount *, int *, int *);
void softdep_initialize(void);
void softdep_uninitialize(void);
int softdep_mount(struct vnode *, struct mount *, struct fs *,
struct ucred *);
-void softdep_move_dependencies(struct buf *, struct buf *);
+void softdep_unmount(struct mount *);
+int softdep_move_dependencies(struct buf *, struct buf *);
int softdep_flushworklist(struct mount *, int *, struct thread *);
int softdep_flushfiles(struct mount *, int, struct thread *);
void softdep_update_inodeblock(struct inode *, struct buf *, int);
@@ -117,7 +123,8 @@ void softdep_freefile(struct vnode *, ino_t, int);
int softdep_request_cleanup(struct fs *, struct vnode *);
void softdep_setup_freeblocks(struct inode *, off_t, int);
void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t);
-void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t);
+void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t,
+ int, int);
void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t,
ufs2_daddr_t, long, long, struct buf *);
void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t,
@@ -126,11 +133,20 @@ void softdep_setup_allocindir_meta(struct buf *, struct inode *,
struct buf *, int, ufs2_daddr_t);
void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t,
struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *);
+void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int,
+ struct workhead *);
+void softdep_setup_inofree(struct mount *, struct buf *, ino_t,
+ struct workhead *);
+void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
+void *softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
void softdep_fsync_mountdev(struct vnode *);
int softdep_sync_metadata(struct vnode *);
int softdep_process_worklist(struct mount *, int);
int softdep_fsync(struct vnode *);
int softdep_waitidle(struct mount *);
+int softdep_prealloc(struct vnode *, int);
+int softdep_journal_lookup(struct mount *, struct vnode **);
+
int ffs_rdonly(struct inode *);
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index b2f906730121..3b6983258b93 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -92,15 +92,6 @@ ffs_update(vp, waitfor)
fs = ip->i_fs;
if (fs->fs_ronly)
return (0);
- /*
- * Ensure that uid and gid are correct. This is a temporary
- * fix until fsck has been changed to do the update.
- */
- if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
- fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
- ip->i_din1->di_ouid = ip->i_uid; /* XXX */
- ip->i_din1->di_ogid = ip->i_gid; /* XXX */
- } /* XXX */
error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, NOCRED, &bp);
if (error) {
@@ -160,6 +151,7 @@ ffs_truncate(vp, length, flags, cred, td)
ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
ufs2_daddr_t count, blocksreleased = 0, datablocks;
+ void *cookie;
struct bufobj *bo;
struct fs *fs;
struct buf *bp;
@@ -173,11 +165,14 @@ ffs_truncate(vp, length, flags, cred, td)
fs = ip->i_fs;
ump = ip->i_ump;
bo = &vp->v_bufobj;
+ cookie = NULL;
ASSERT_VOP_LOCKED(vp, "ffs_truncate");
if (length < 0)
return (EINVAL);
+ if (length > fs->fs_maxfilesize)
+ return (EFBIG);
/*
* Historically clients did not have to specify which data
* they were truncating. So, if not specified, we assume
@@ -192,6 +187,7 @@ ffs_truncate(vp, length, flags, cred, td)
* (e.g., the file is being unlinked), then pick it off with
* soft updates below.
*/
+ allerror = 0;
needextclean = 0;
softdepslowdown = DOINGSOFTDEP(vp) && softdep_slowdown(vp);
extblocks = 0;
@@ -212,6 +208,8 @@ ffs_truncate(vp, length, flags, cred, td)
panic("ffs_truncate: partial trunc of extdata");
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
return (error);
+ if (DOINGSUJ(vp))
+ cookie = softdep_setup_trunc(vp, length, flags);
osize = ip->i_din2->di_extsize;
ip->i_din2->di_blocks -= extblocks;
#ifdef QUOTA
@@ -227,19 +225,19 @@ ffs_truncate(vp, length, flags, cred, td)
}
ip->i_flag |= IN_CHANGE;
if ((error = ffs_update(vp, 1)))
- return (error);
+ goto out;
for (i = 0; i < NXADDR; i++) {
if (oldblks[i] == 0)
continue;
ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i],
- sblksize(fs, osize, i), ip->i_number);
+ sblksize(fs, osize, i), ip->i_number, NULL);
}
}
}
- if ((flags & IO_NORMAL) == 0)
- return (0);
- if (length > fs->fs_maxfilesize)
- return (EFBIG);
+ if ((flags & IO_NORMAL) == 0) {
+ error = 0;
+ goto out;
+ }
if (vp->v_type == VLNK &&
(ip->i_size < vp->v_mount->mnt_maxsymlinklen ||
datablocks == 0)) {
@@ -253,24 +251,52 @@ ffs_truncate(vp, length, flags, cred, td)
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
- return (ffs_update(vp, 1));
+ error = ffs_update(vp, 1);
+ goto out;
}
if (ip->i_size == length) {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
- return (ffs_update(vp, 0));
+ error = ffs_update(vp, 0);
+ goto out;
}
if (fs->fs_ronly)
panic("ffs_truncate: read-only filesystem");
#ifdef QUOTA
error = getinoquota(ip);
if (error)
- return (error);
+ goto out;
#endif
if ((ip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(vp);
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+ osize = ip->i_size;
+ /*
+ * Lengthen the size of the file. We must ensure that the
+ * last byte of the file is allocated. Since the smallest
+ * value of osize is 0, length will be at least 1.
+ */
+ if (osize < length) {
+ vnode_pager_setsize(vp, length);
+ flags |= BA_CLRBUF;
+ error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
+ if (error) {
+ vnode_pager_setsize(vp, osize);
+ goto out;
+ }
+ ip->i_size = length;
+ DIP_SET(ip, i_size, length);
+ if (bp->b_bufsize == fs->fs_bsize)
+ bp->b_flags |= B_CLUSTEROK;
+ if (flags & IO_SYNC)
+ bwrite(bp);
+ else
+ bawrite(bp);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ error = ffs_update(vp, 1);
+ goto out;
+ }
if (DOINGSOFTDEP(vp)) {
if (length > 0 || softdepslowdown) {
/*
@@ -283,11 +309,18 @@ ffs_truncate(vp, length, flags, cred, td)
* so that it will have no data structures left.
*/
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
- return (error);
+ goto out;
UFS_LOCK(ump);
if (ip->i_flag & IN_SPACECOUNTED)
fs->fs_pendingblocks -= datablocks;
UFS_UNLOCK(ump);
+ /*
+ * We have to journal the truncation before we change
+ * any blocks so we don't leave the file partially
+ * truncated.
+ */
+ if (DOINGSUJ(vp) && cookie == NULL)
+ cookie = softdep_setup_trunc(vp, length, flags);
} else {
#ifdef QUOTA
(void) chkdq(ip, -datablocks, NOCRED, 0);
@@ -301,33 +334,9 @@ ffs_truncate(vp, length, flags, cred, td)
OFF_TO_IDX(lblktosize(fs, -extblocks)));
vnode_pager_setsize(vp, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
- return (ffs_update(vp, 0));
- }
- }
- osize = ip->i_size;
- /*
- * Lengthen the size of the file. We must ensure that the
- * last byte of the file is allocated. Since the smallest
- * value of osize is 0, length will be at least 1.
- */
- if (osize < length) {
- vnode_pager_setsize(vp, length);
- flags |= BA_CLRBUF;
- error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
- if (error) {
- vnode_pager_setsize(vp, osize);
- return (error);
+ error = ffs_update(vp, 0);
+ goto out;
}
- ip->i_size = length;
- DIP_SET(ip, i_size, length);
- if (bp->b_bufsize == fs->fs_bsize)
- bp->b_flags |= B_CLUSTEROK;
- if (flags & IO_SYNC)
- bwrite(bp);
- else
- bawrite(bp);
- ip->i_flag |= IN_CHANGE | IN_UPDATE;
- return (ffs_update(vp, 1));
}
/*
* Shorten the size of the file. If the file is not being
@@ -345,9 +354,8 @@ ffs_truncate(vp, length, flags, cred, td)
lbn = lblkno(fs, length);
flags |= BA_CLRBUF;
error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
- if (error) {
- return (error);
- }
+ if (error)
+ goto out;
/*
* When we are doing soft updates and the UFS_BALLOC
* above fills in a direct block hole with a full sized
@@ -359,7 +367,7 @@ ffs_truncate(vp, length, flags, cred, td)
if (DOINGSOFTDEP(vp) && lbn < NDADDR &&
fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
(error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
- return (error);
+ goto out;
ip->i_size = length;
DIP_SET(ip, i_size, length);
size = blksize(fs, ip, lbn);
@@ -405,7 +413,13 @@ ffs_truncate(vp, length, flags, cred, td)
DIP_SET(ip, i_db[i], 0);
}
ip->i_flag |= IN_CHANGE | IN_UPDATE;
- allerror = ffs_update(vp, 1);
+ /*
+ * When doing softupdate journaling we must preserve the size along
+ * with the old pointers until they are freed or we might not
+ * know how many fragments remain.
+ */
+ if (!DOINGSUJ(vp))
+ allerror = ffs_update(vp, 1);
/*
* Having written the new inode to disk, save its new configuration
@@ -445,7 +459,7 @@ ffs_truncate(vp, length, flags, cred, td)
if (lastiblock[level] < 0) {
DIP_SET(ip, i_ib[level], 0);
ffs_blkfree(ump, fs, ip->i_devvp, bn,
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
blocksreleased += nblocks;
}
}
@@ -464,7 +478,8 @@ ffs_truncate(vp, length, flags, cred, td)
continue;
DIP_SET(ip, i_db[i], 0);
bsize = blksize(fs, ip, i);
- ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number);
+ ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number,
+ NULL);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
@@ -496,7 +511,7 @@ ffs_truncate(vp, length, flags, cred, td)
*/
bn += numfrags(fs, newspace);
ffs_blkfree(ump, fs, ip->i_devvp, bn,
- oldspace - newspace, ip->i_number);
+ oldspace - newspace, ip->i_number, NULL);
blocksreleased += btodb(oldspace - newspace);
}
}
@@ -528,7 +543,14 @@ done:
#ifdef QUOTA
(void) chkdq(ip, -blocksreleased, NOCRED, 0);
#endif
- return (allerror);
+ error = allerror;
+out:
+ if (cookie) {
+ allerror = softdep_complete_trunc(vp, cookie);
+ if (allerror != 0 && error == 0)
+ error = allerror;
+ }
+ return (error);
}
/*
@@ -638,7 +660,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
blocksreleased += blkcount;
}
ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
blocksreleased += nblocks;
}
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index b36cb58808bd..11362cfbc755 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -142,7 +142,7 @@ MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
- ufs_lbn_t, int), int);
+ ufs_lbn_t, int), int, int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
@@ -155,7 +155,7 @@ static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
- ufs_lbn_t, int), int);
+ ufs_lbn_t, int), int, int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
@@ -582,7 +582,8 @@ loop:
len = fragroundup(fs, blkoff(fs, xp->i_size));
if (len != 0 && len < fs->fs_bsize) {
ffs_blkfree(ump, copy_fs, vp,
- DIP(xp, i_db[loc]), len, xp->i_number);
+ DIP(xp, i_db[loc]), len, xp->i_number,
+ NULL);
blkno = DIP(xp, i_db[loc]);
DIP_SET(xp, i_db[loc], 0);
}
@@ -590,15 +591,15 @@ loop:
snaplistsize += 1;
if (xp->i_ump->um_fstype == UFS1)
error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
- BLK_NOCOPY);
+ BLK_NOCOPY, 1);
else
error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
- BLK_NOCOPY);
+ BLK_NOCOPY, 1);
if (blkno)
DIP_SET(xp, i_db[loc], blkno);
if (!error)
error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
- xp->i_mode);
+ xp->i_mode, NULL);
VOP_UNLOCK(xvp, 0);
vdrop(xvp);
if (error) {
@@ -612,6 +613,26 @@ loop:
}
MNT_IUNLOCK(mp);
/*
+ * Erase the journal file from the snapshot.
+ */
+ if (fs->fs_flags & FS_SUJ) {
+ error = softdep_journal_lookup(mp, &xvp);
+ if (error) {
+ free(copy_fs->fs_csp, M_UFSMNT);
+ bawrite(sbp);
+ sbp = NULL;
+ goto out1;
+ }
+ xp = VTOI(xvp);
+ if (xp->i_ump->um_fstype == UFS1)
+ error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
+ BLK_NOCOPY, 0);
+ else
+ error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
+ BLK_NOCOPY, 0);
+ vput(xvp);
+ }
+ /*
* Acquire a lock on the snapdata structure, creating it if necessary.
*/
sn = ffs_snapdata_acquire(devvp);
@@ -691,16 +712,16 @@ out1:
break;
if (xp->i_ump->um_fstype == UFS1)
error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
- BLK_SNAP);
+ BLK_SNAP, 0);
else
error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
- BLK_SNAP);
+ BLK_SNAP, 0);
if (error == 0 && xp->i_effnlink == 0) {
error = ffs_freefile(ump,
copy_fs,
vp,
xp->i_number,
- xp->i_mode);
+ xp->i_mode, NULL);
}
if (error) {
fs->fs_snapinum[snaploc] = 0;
@@ -719,9 +740,11 @@ out1:
* the list of allocated blocks in i_snapblklist.
*/
if (ip->i_ump->um_fstype == UFS1)
- error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
+ error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
+ BLK_SNAP, 0);
else
- error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
+ error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
+ BLK_SNAP, 0);
if (error) {
fs->fs_snapinum[snaploc] = 0;
free(snapblklist, M_UFSMNT);
@@ -954,13 +977,14 @@ cgaccount(cg, vp, nbp, passno)
* is reproduced once each for UFS1 and UFS2.
*/
static int
-expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
struct vnode *snapvp;
struct inode *cancelip;
struct fs *fs;
int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
struct fs *, ufs_lbn_t, int);
int expungetype;
+ int clearmode;
{
int i, error, indiroff;
ufs_lbn_t lbn, rlbn;
@@ -1005,7 +1029,7 @@ expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
*/
dip = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
- if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
+ if (clearmode || cancelip->i_effnlink == 0)
dip->di_mode = 0;
dip->di_size = 0;
dip->di_blocks = 0;
@@ -1220,7 +1244,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
- ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+ ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
}
return (0);
}
@@ -1234,13 +1258,14 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
* is reproduced once each for UFS1 and UFS2.
*/
static int
-expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
struct vnode *snapvp;
struct inode *cancelip;
struct fs *fs;
int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
struct fs *, ufs_lbn_t, int);
int expungetype;
+ int clearmode;
{
int i, error, indiroff;
ufs_lbn_t lbn, rlbn;
@@ -1285,7 +1310,7 @@ expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
*/
dip = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
- if (expungetype == BLK_NOCOPY)
+ if (clearmode || cancelip->i_effnlink == 0)
dip->di_mode = 0;
dip->di_size = 0;
dip->di_blocks = 0;
@@ -1500,7 +1525,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
- ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+ ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
}
return (0);
}
@@ -1657,6 +1682,13 @@ ffs_snapremove(vp)
ip->i_flags &= ~SF_SNAPSHOT;
DIP_SET(ip, i_flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ /*
+ * The dirtied indirects must be written out before
+ * softdep_setup_freeblocks() is called. Otherwise indir_trunc()
+ * may find indirect pointers using the magic BLK_* values.
+ */
+ if (DOINGSOFTDEP(vp))
+ ffs_syncvnode(vp, MNT_WAIT);
#ifdef QUOTA
/*
* Reenable disk quotas for ex-snapshot file.
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 4d652c114dd1..30937051357a 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -1,5 +1,7 @@
/*-
- * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
+ * Copyright 1998, 2000 Marshall Kirk McKusick.
+ * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
*
* The soft updates code is derived from the appendix of a University
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
@@ -23,17 +25,16 @@
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
- * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
*/
@@ -50,6 +51,7 @@ __FBSDID("$FreeBSD$");
#ifndef DEBUG
#define DEBUG
#endif
+#define SUJ_DEBUG
#include <sys/param.h>
#include <sys/kernel.h>
@@ -62,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
+#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
@@ -130,10 +133,12 @@ softdep_setup_inomapdep(bp, ip, newinum)
}
void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
struct buf *bp;
struct mount *mp;
ufs2_daddr_t newblkno;
+ int frags;
+ int oldfrags;
{
panic("softdep_setup_blkmapdep called");
@@ -403,31 +408,13 @@ softdep_get_depcounts(struct mount *mp,
* These definitions need to be adapted to the system to which
* this file is being ported.
*/
-/*
- * malloc types defined for the softdep system.
- */
-static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
-static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
-static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
-static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
-static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
-static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
-static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
-static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
-static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
-static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
-static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
-static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
-static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
-static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
-static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
#define D_PAGEDEP 0
#define D_INODEDEP 1
-#define D_NEWBLK 2
-#define D_BMSAFEMAP 3
+#define D_BMSAFEMAP 2
+#define D_NEWBLK 3
#define D_ALLOCDIRECT 4
#define D_INDIRDEP 5
#define D_ALLOCINDIR 6
@@ -438,7 +425,67 @@ static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
#define D_MKDIR 11
#define D_DIRREM 12
#define D_NEWDIRBLK 13
-#define D_LAST D_NEWDIRBLK
+#define D_FREEWORK 14
+#define D_FREEDEP 15
+#define D_JADDREF 16
+#define D_JREMREF 17
+#define D_JMVREF 18
+#define D_JNEWBLK 19
+#define D_JFREEBLK 20
+#define D_JFREEFRAG 21
+#define D_JSEG 22
+#define D_JSEGDEP 23
+#define D_SBDEP 24
+#define D_JTRUNC 25
+#define D_LAST D_JTRUNC
+
+unsigned long dep_current[D_LAST + 1];
+unsigned long dep_total[D_LAST + 1];
+
+
+SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
+SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
+ "total dependencies allocated");
+SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
+ "current dependencies allocated");
+
+#define SOFTDEP_TYPE(type, str, long) \
+ static MALLOC_DEFINE(M_ ## type, #str, long); \
+ SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \
+ &dep_total[D_ ## type], 0, ""); \
+ SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \
+ &dep_current[D_ ## type], 0, "");
+
+SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
+SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
+SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
+ "Block or frag allocated from cyl group map");
+SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
+SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
+SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
+SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
+SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
+SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
+SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
+SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
+SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
+SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
+SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
+SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
+SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
+SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
+SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
+SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
+SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
+SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
+SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
+SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
+SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
+SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
+SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
+
+static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
+static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
/*
* translate from workitem type to memory type
@@ -447,8 +494,8 @@ static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
static struct malloc_type *memtype[] = {
M_PAGEDEP,
M_INODEDEP,
- M_NEWBLK,
M_BMSAFEMAP,
+ M_NEWBLK,
M_ALLOCDIRECT,
M_INDIRDEP,
M_ALLOCINDIR,
@@ -458,7 +505,19 @@ static struct malloc_type *memtype[] = {
M_DIRADD,
M_MKDIR,
M_DIRREM,
- M_NEWDIRBLK
+ M_NEWDIRBLK,
+ M_FREEWORK,
+ M_FREEDEP,
+ M_JADDREF,
+ M_JREMREF,
+ M_JMVREF,
+ M_JNEWBLK,
+ M_JFREEBLK,
+ M_JFREEFRAG,
+ M_JSEG,
+ M_JSEGDEP,
+ M_SBDEP,
+ M_JTRUNC
};
#define DtoM(type) (memtype[type])
@@ -467,17 +526,21 @@ static struct malloc_type *memtype[] = {
* Names of malloc types.
*/
#define TYPENAME(type) \
- ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
+ ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
* End system adaptation definitions.
*/
+#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino)
+#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino)
+
/*
* Forward declarations.
*/
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;
+struct bmsafemap_hashhead;
/*
* Internal function prototypes.
@@ -487,59 +550,172 @@ static void drain_output(struct vnode *);
static struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static void clear_remove(struct thread *);
static void clear_inodedeps(struct thread *);
+static void unlinked_inodedep(struct mount *, struct inodedep *);
+static void clear_unlinked_inodedep(struct inodedep *);
+static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
struct diraddhd *);
+static void free_pagedep(struct pagedep *);
+static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
static int flush_inodedep_deps(struct mount *, ino_t);
static int flush_deplist(struct allocdirectlst *, int, int *);
static int handle_written_filepage(struct pagedep *, struct buf *);
+static int handle_written_sbdep(struct sbdep *, struct buf *);
+static void initiate_write_sbdep(struct sbdep *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
+static int handle_written_indirdep(struct indirdep *, struct buf *,
+ struct buf**);
static int handle_written_inodeblock(struct inodedep *, struct buf *);
-static void handle_allocdirect_partdone(struct allocdirect *);
+static int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
+static void handle_written_jaddref(struct jaddref *);
+static void handle_written_jremref(struct jremref *);
+static void handle_written_jseg(struct jseg *, struct buf *);
+static void handle_written_jnewblk(struct jnewblk *);
+static void handle_written_jfreeblk(struct jfreeblk *);
+static void handle_written_jfreefrag(struct jfreefrag *);
+static void complete_jseg(struct jseg *);
+static void jseg_write(struct fs *, struct jblocks *, struct jseg *,
+ uint8_t *);
+static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
+static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
+static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
+static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
+static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
+static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
+static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
+static inline void inoref_write(struct inoref *, struct jseg *,
+ struct jrefrec *);
+static void handle_allocdirect_partdone(struct allocdirect *,
+ struct workhead *);
+static void cancel_newblk(struct newblk *, struct workhead *);
+static void indirdep_complete(struct indirdep *);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
+static void initiate_write_indirdep(struct indirdep*, struct buf *);
static void handle_written_mkdir(struct mkdir *, int);
+static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static void handle_workitem_freefile(struct freefile *);
static void handle_workitem_remove(struct dirrem *, struct vnode *);
static struct dirrem *newdirrem(struct buf *, struct inode *,
struct inode *, int, struct dirrem **);
-static void free_diradd(struct diradd *);
-static void free_allocindir(struct allocindir *, struct inodedep *);
+static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
+ struct freeblks *);
+static void free_indirdep(struct indirdep *);
+static void free_diradd(struct diradd *, struct workhead *);
+static void merge_diradd(struct inodedep *, struct diradd *);
+static void complete_diradd(struct diradd *);
+static struct diradd *diradd_lookup(struct pagedep *, int);
+static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
+ struct jremref *);
+static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
+ struct jremref *);
+static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
+ struct jremref *, struct jremref *);
+static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
+ struct jremref *);
+static void cancel_allocindir(struct allocindir *, struct inodedep *,
+ struct freeblks *);
+static void complete_mkdir(struct mkdir *);
static void free_newdirblk(struct newdirblk *);
-static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
- ufs2_daddr_t *);
-static void deallocate_dependencies(struct buf *, struct inodedep *);
-static void free_allocdirect(struct allocdirectlst *,
- struct allocdirect *, int);
+static void free_jremref(struct jremref *);
+static void free_jaddref(struct jaddref *);
+static void free_jsegdep(struct jsegdep *);
+static void free_jseg(struct jseg *);
+static void free_jnewblk(struct jnewblk *);
+static void free_jfreeblk(struct jfreeblk *);
+static void free_jfreefrag(struct jfreefrag *);
+static void free_freedep(struct freedep *);
+static void journal_jremref(struct dirrem *, struct jremref *,
+ struct inodedep *);
+static void cancel_jnewblk(struct jnewblk *, struct workhead *);
+static int cancel_jaddref(struct jaddref *, struct inodedep *,
+ struct workhead *);
+static void cancel_jfreefrag(struct jfreefrag *);
+static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
+static int deallocate_dependencies(struct buf *, struct inodedep *,
+ struct freeblks *);
+static void free_newblk(struct newblk *);
+static void cancel_allocdirect(struct allocdirectlst *,
+ struct allocdirect *, struct freeblks *, int);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
+static void freework_freeblock(struct freework *);
static void handle_workitem_freeblocks(struct freeblks *, int);
+static void handle_complete_freeblocks(struct freeblks *);
+static void handle_workitem_indirblk(struct freework *);
+static void handle_written_freework(struct freework *);
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static void setup_allocindir_phase2(struct buf *, struct inode *,
- struct allocindir *);
+ struct inodedep *, struct allocindir *, ufs_lbn_t);
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
- ufs2_daddr_t);
+ ufs2_daddr_t, ufs_lbn_t);
static void handle_workitem_freefrag(struct freefrag *);
-static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
+static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
+ ufs_lbn_t);
static void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
-static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
-static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
- struct newblk **);
-static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
+static struct freefrag *allocindir_merge(struct allocindir *,
+ struct allocindir *);
+static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
+ struct bmsafemap **);
+static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
+ int cg);
+static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
+ int, struct newblk **);
+static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
struct inodedep **);
static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
-static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
+static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
+ struct pagedep **);
static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
struct mount *mp, int, struct pagedep **);
static void pause_timer(void *);
static int request_cleanup(struct mount *, int);
static int process_worklist_item(struct mount *, int);
-static void add_to_worklist(struct worklist *);
+static void process_removes(struct vnode *);
+static void jwork_move(struct workhead *, struct workhead *);
+static void add_to_worklist(struct worklist *, int);
+static void remove_from_worklist(struct worklist *);
static void softdep_flush(void);
static int softdep_speedup(void);
+static void worklist_speedup(void);
+static int journal_mount(struct mount *, struct fs *, struct ucred *);
+static void journal_unmount(struct mount *);
+static int journal_space(struct ufsmount *, int);
+static void journal_suspend(struct ufsmount *);
+static void softdep_prelink(struct vnode *, struct vnode *);
+static void add_to_journal(struct worklist *);
+static void remove_from_journal(struct worklist *);
+static void softdep_process_journal(struct mount *, int);
+static struct jremref *newjremref(struct dirrem *, struct inode *,
+ struct inode *ip, off_t, nlink_t);
+static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
+ uint16_t);
+static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
+ uint16_t);
+static inline struct jsegdep *inoref_jseg(struct inoref *);
+static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
+static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
+ ufs2_daddr_t, int);
+static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
+ ufs2_daddr_t, long, ufs_lbn_t);
+static struct freework *newfreework(struct freeblks *, struct freework *,
+ ufs_lbn_t, ufs2_daddr_t, int, int);
+static void jwait(struct worklist *wk);
+static struct inodedep *inodedep_lookup_ip(struct inode *);
+static int bmsafemap_rollbacks(struct bmsafemap *);
+static struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
+static void handle_jwork(struct workhead *);
+static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
+ struct mkdir **);
+static struct jblocks *jblocks_create(void);
+static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
+static void jblocks_free(struct jblocks *, struct mount *, int);
+static void jblocks_destroy(struct jblocks *);
+static void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
/*
* Exported softdep operations.
@@ -572,40 +748,128 @@ MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
(item)->wk_state &= ~ONWORKLIST; \
LIST_REMOVE(item, wk_list); \
} while (0)
+#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
+#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE
+
#else /* DEBUG */
-static void worklist_insert(struct workhead *, struct worklist *);
-static void worklist_remove(struct worklist *);
+static void worklist_insert(struct workhead *, struct worklist *, int);
+static void worklist_remove(struct worklist *, int);
-#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
-#define WORKLIST_REMOVE(item) worklist_remove(item)
+#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
+#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
+#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
+#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
static void
-worklist_insert(head, item)
+worklist_insert(head, item, locked)
struct workhead *head;
struct worklist *item;
+ int locked;
{
- mtx_assert(&lk, MA_OWNED);
+ if (locked)
+ mtx_assert(&lk, MA_OWNED);
if (item->wk_state & ONWORKLIST)
- panic("worklist_insert: already on list");
+ panic("worklist_insert: %p %s(0x%X) already on list",
+ item, TYPENAME(item->wk_type), item->wk_state);
item->wk_state |= ONWORKLIST;
LIST_INSERT_HEAD(head, item, wk_list);
}
static void
-worklist_remove(item)
+worklist_remove(item, locked)
struct worklist *item;
+ int locked;
{
- mtx_assert(&lk, MA_OWNED);
+ if (locked)
+ mtx_assert(&lk, MA_OWNED);
if ((item->wk_state & ONWORKLIST) == 0)
- panic("worklist_remove: not on list");
+ panic("worklist_remove: %p %s(0x%X) not on list",
+ item, TYPENAME(item->wk_type), item->wk_state);
item->wk_state &= ~ONWORKLIST;
LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */
/*
+ * Merge two jsegdeps keeping only the oldest one as newer references
+ * can't be discarded until after older references.
+ */
+static inline struct jsegdep *
+jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
+{
+ struct jsegdep *swp;
+
+ if (two == NULL)
+ return (one);
+
+ if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
+ swp = one;
+ one = two;
+ two = swp;
+ }
+ WORKLIST_REMOVE(&two->jd_list);
+ free_jsegdep(two);
+
+ return (one);
+}
+
+/*
+ * If two freedeps are compatible free one to reduce list size.
+ */
+static inline struct freedep *
+freedep_merge(struct freedep *one, struct freedep *two)
+{
+ if (two == NULL)
+ return (one);
+
+ if (one->fd_freework == two->fd_freework) {
+ WORKLIST_REMOVE(&two->fd_list);
+ free_freedep(two);
+ }
+ return (one);
+}
+
+/*
+ * Move journal work from one list to another. Duplicate freedeps and
+ * jsegdeps are coalesced to keep the lists as small as possible.
+ */
+static void
+jwork_move(dst, src)
+ struct workhead *dst;
+ struct workhead *src;
+{
+ struct freedep *freedep;
+ struct jsegdep *jsegdep;
+ struct worklist *wkn;
+ struct worklist *wk;
+
+ KASSERT(dst != src,
+ ("jwork_move: dst == src"));
+ freedep = NULL;
+ jsegdep = NULL;
+ LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
+ if (wk->wk_type == D_JSEGDEP)
+ jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
+ if (wk->wk_type == D_FREEDEP)
+ freedep = freedep_merge(WK_FREEDEP(wk), freedep);
+ }
+
+ mtx_assert(&lk, MA_OWNED);
+ while ((wk = LIST_FIRST(src)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ WORKLIST_INSERT(dst, wk);
+ if (wk->wk_type == D_JSEGDEP) {
+ jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
+ continue;
+ }
+ if (wk->wk_type == D_FREEDEP)
+ freedep = freedep_merge(WK_FREEDEP(wk), freedep);
+ }
+}
+
+/*
* Routines for tracking and managing workitems.
*/
static void workitem_free(struct worklist *, int);
@@ -623,13 +887,16 @@ workitem_free(item, type)
#ifdef DEBUG
if (item->wk_state & ONWORKLIST)
- panic("workitem_free: still on list");
+ panic("workitem_free: %s(0x%X) still on list",
+ TYPENAME(item->wk_type), item->wk_state);
if (item->wk_type != type)
- panic("workitem_free: type mismatch");
+ panic("workitem_free: type mismatch %s != %s",
+ TYPENAME(item->wk_type), TYPENAME(type));
#endif
ump = VFSTOUFS(item->wk_mp);
if (--ump->softdep_deps == 0 && ump->softdep_req)
wakeup(&ump->softdep_deps);
+ dep_current[type]--;
free(item, DtoM(type));
}
@@ -643,6 +910,8 @@ workitem_alloc(item, type, mp)
item->wk_mp = mp;
item->wk_state = 0;
ACQUIRE_LOCK(&lk);
+ dep_current[type]++;
+ dep_total[type]++;
VFSTOUFS(mp)->softdep_deps++;
VFSTOUFS(mp)->softdep_accdeps++;
FREE_LOCK(&lk);
@@ -678,24 +947,66 @@ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
-
-SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
-/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
+static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */
+static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */
+static int stat_journal_min; /* Times hit journal min threshold */
+static int stat_journal_low; /* Times hit journal low threshold */
+static int stat_journal_wait; /* Times blocked in jwait(). */
+static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
+static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
+static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */
+static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */
+
+SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
+ &max_softdeps, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
+ &tickdelay, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
+ &maxindirdeps, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
+ &stat_worklist_push, 0,"");
+SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
+ &stat_blk_limit_push, 0,"");
+SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
+ &stat_ino_limit_push, 0,"");
+SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
+ &stat_blk_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
+ &stat_ino_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
+ &stat_sync_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
+ &stat_indir_blk_ptrs, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
+ &stat_inode_bitmap, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
+ &stat_direct_blk_ptrs, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
+ &stat_dir_entry, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
+ &stat_jaddref, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
+ &stat_jnewblk, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
+ &stat_journal_low, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
+ &stat_journal_min, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
+ &stat_journal_wait, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
+ &stat_jwait_filepage, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
+ &stat_jwait_freeblks, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
+ &stat_jwait_inode, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
+ &stat_jwait_newblk, 0, "");
SYSCTL_DECL(_vfs_ffs);
+LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
+static u_long bmsafemap_hash; /* size of hash table - 1 */
+
static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
&compute_summary_at_mount, 0, "Recompute summary at mount");
@@ -770,16 +1081,22 @@ softdep_flush(void)
}
}
-static int
-softdep_speedup(void)
+static void
+worklist_speedup(void)
{
-
mtx_assert(&lk, MA_OWNED);
if (req_pending == 0) {
req_pending = 1;
wakeup(&req_pending);
}
+}
+static int
+softdep_speedup(void)
+{
+
+ worklist_speedup();
+ bd_speedup();
return speedup_syncer();
}
@@ -791,15 +1108,17 @@ softdep_speedup(void)
* and does so in order from first to last.
*/
static void
-add_to_worklist(wk)
+add_to_worklist(wk, nodelay)
struct worklist *wk;
+ int nodelay;
{
struct ufsmount *ump;
mtx_assert(&lk, MA_OWNED);
ump = VFSTOUFS(wk->wk_mp);
if (wk->wk_state & ONWORKLIST)
- panic("add_to_worklist: already on list");
+ panic("add_to_worklist: %s(0x%X) already on list",
+ TYPENAME(wk->wk_type), wk->wk_state);
wk->wk_state |= ONWORKLIST;
if (LIST_EMPTY(&ump->softdep_workitem_pending))
LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
@@ -807,6 +1126,30 @@ add_to_worklist(wk)
LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
ump->softdep_worklist_tail = wk;
ump->softdep_on_worklist += 1;
+ if (nodelay)
+ worklist_speedup();
+}
+
+/*
+ * Remove the item to be processed. If we are removing the last
+ * item on the list, we need to recalculate the tail pointer.
+ */
+static void
+remove_from_worklist(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+ struct worklist *wkend;
+
+ ump = VFSTOUFS(wk->wk_mp);
+ WORKLIST_REMOVE(wk);
+ if (wk == ump->softdep_worklist_tail) {
+ LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
+ if (LIST_NEXT(wkend, wk_list) == NULL)
+ break;
+ ump->softdep_worklist_tail = wkend;
+ }
+ ump->softdep_on_worklist -= 1;
}
/*
@@ -838,8 +1181,9 @@ softdep_process_worklist(mp, full)
ACQUIRE_LOCK(&lk);
loopcount = 1;
starttime = time_second;
+ softdep_process_journal(mp, full?MNT_WAIT:0);
while (ump->softdep_on_worklist > 0) {
- if ((cnt = process_worklist_item(mp, 0)) == -1)
+ if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
break;
else
matchcnt += cnt;
@@ -871,16 +1215,61 @@ softdep_process_worklist(mp, full)
* second. Otherwise the other mountpoints may get
* excessively backlogged.
*/
- if (!full && starttime != time_second) {
- matchcnt = -1;
+ if (!full && starttime != time_second)
break;
- }
}
FREE_LOCK(&lk);
return (matchcnt);
}
/*
+ * Process all removes associated with a vnode if we are running out of
+ * journal space. Any other process which attempts to flush these will
+ * be unable as we have the vnodes locked.
+ */
+static void
+process_removes(vp)
+ struct vnode *vp;
+{
+ struct inodedep *inodedep;
+ struct dirrem *dirrem;
+ struct mount *mp;
+ ino_t inum;
+
+ mtx_assert(&lk, MA_OWNED);
+
+ mp = vp->v_mount;
+ inum = VTOI(vp)->i_number;
+ for (;;) {
+ if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
+ return;
+ LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
+ if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
+ (COMPLETE | ONWORKLIST))
+ break;
+ if (dirrem == NULL)
+ return;
+ /*
+ * If another thread is trying to lock this vnode it will
+ * fail but we must wait for it to do so before we can
+ * proceed.
+ */
+ if (dirrem->dm_state & INPROGRESS) {
+ dirrem->dm_state |= IOWAITING;
+ msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
+ continue;
+ }
+ remove_from_worklist(&dirrem->dm_list);
+ FREE_LOCK(&lk);
+ if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
+ panic("process_removes: suspended filesystem");
+ handle_workitem_remove(dirrem, vp);
+ vn_finished_secondary_write(mp);
+ ACQUIRE_LOCK(&lk);
+ }
+}
+
+/*
* Process one item on the worklist.
*/
static int
@@ -888,7 +1277,7 @@ process_worklist_item(mp, flags)
struct mount *mp;
int flags;
{
- struct worklist *wk, *wkend;
+ struct worklist *wk, *wkXXX;
struct ufsmount *ump;
struct vnode *vp;
int matchcnt = 0;
@@ -908,11 +1297,14 @@ process_worklist_item(mp, flags)
* inodes, we have to skip over any dirrem requests whose
* vnodes are resident and locked.
*/
- ump = VFSTOUFS(mp);
vp = NULL;
+ ump = VFSTOUFS(mp);
LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
- if (wk->wk_state & INPROGRESS)
+ if (wk->wk_state & INPROGRESS) {
+ wkXXX = wk;
continue;
+ }
+ wkXXX = wk; /* Record the last valid wk pointer. */
if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
break;
wk->wk_state |= INPROGRESS;
@@ -921,6 +1313,10 @@ process_worklist_item(mp, flags)
ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
ACQUIRE_LOCK(&lk);
+ if (wk->wk_state & IOWAITING) {
+ wk->wk_state &= ~IOWAITING;
+ wakeup(wk);
+ }
wk->wk_state &= ~INPROGRESS;
ump->softdep_on_worklist_inprogress--;
if (vp != NULL)
@@ -928,21 +1324,7 @@ process_worklist_item(mp, flags)
}
if (wk == 0)
return (-1);
- /*
- * Remove the item to be processed. If we are removing the last
- * item on the list, we need to recalculate the tail pointer.
- * As this happens rarely and usually when the list is short,
- * we just run down the list to find it rather than tracking it
- * in the above loop.
- */
- WORKLIST_REMOVE(wk);
- if (wk == ump->softdep_worklist_tail) {
- LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
- if (LIST_NEXT(wkend, wk_list) == NULL)
- break;
- ump->softdep_worklist_tail = wkend;
- }
- ump->softdep_on_worklist -= 1;
+ remove_from_worklist(wk);
FREE_LOCK(&lk);
if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
panic("process_worklist_item: suspended filesystem");
@@ -952,6 +1334,8 @@ process_worklist_item(mp, flags)
case D_DIRREM:
/* removal of a directory entry */
handle_workitem_remove(WK_DIRREM(wk), vp);
+ if (vp)
+ vput(vp);
break;
case D_FREEBLKS:
@@ -969,6 +1353,11 @@ process_worklist_item(mp, flags)
handle_workitem_freefile(WK_FREEFILE(wk));
break;
+ case D_FREEWORK:
+ /* Final block in an indirect was freed. */
+ handle_workitem_indirblk(WK_FREEWORK(wk));
+ break;
+
default:
panic("%s_process_worklist: Unknown type %s",
"softdep", TYPENAME(wk->wk_type));
@@ -982,19 +1371,22 @@ process_worklist_item(mp, flags)
/*
* Move dependencies from one buffer to another.
*/
-void
+int
softdep_move_dependencies(oldbp, newbp)
struct buf *oldbp;
struct buf *newbp;
{
struct worklist *wk, *wktail;
+ int dirty;
- if (!LIST_EMPTY(&newbp->b_dep))
- panic("softdep_move_dependencies: need merge code");
- wktail = 0;
+ dirty = 0;
+ wktail = NULL;
ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
LIST_REMOVE(wk, wk_list);
+ if (wk->wk_type == D_BMSAFEMAP &&
+ bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
+ dirty = 1;
if (wktail == 0)
LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
else
@@ -1002,6 +1394,8 @@ softdep_move_dependencies(oldbp, newbp)
wktail = wk;
}
FREE_LOCK(&lk);
+
+ return (dirty);
}
/*
@@ -1198,23 +1592,22 @@ pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
* This routine must be called with splbio interrupts blocked.
*/
static int
-pagedep_lookup(ip, lbn, flags, pagedeppp)
- struct inode *ip;
+pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
+ struct mount *mp;
+ ino_t ino;
ufs_lbn_t lbn;
int flags;
struct pagedep **pagedeppp;
{
struct pagedep *pagedep;
struct pagedep_hashhead *pagedephd;
- struct mount *mp;
int ret;
int i;
mtx_assert(&lk, MA_OWNED);
- mp = ITOV(ip)->v_mount;
- pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
+ pagedephd = PAGEDEP_HASH(mp, ino, lbn);
- ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
+ ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
if (*pagedeppp || (flags & DEPALLOC) == 0)
return (ret);
FREE_LOCK(&lk);
@@ -1222,12 +1615,12 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
ACQUIRE_LOCK(&lk);
- ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
+ ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
if (*pagedeppp) {
WORKITEM_FREE(pagedep, D_PAGEDEP);
return (ret);
}
- pagedep->pd_ino = ip->i_number;
+ pagedep->pd_ino = ino;
pagedep->pd_lbn = lbn;
LIST_INIT(&pagedep->pd_dirremhd);
LIST_INIT(&pagedep->pd_pendinghd);
@@ -1314,10 +1707,14 @@ inodedep_lookup(mp, inum, flags, inodedeppp)
inodedep->id_savedino1 = NULL;
inodedep->id_savedsize = -1;
inodedep->id_savedextsize = -1;
- inodedep->id_buf = NULL;
+ inodedep->id_savednlink = -1;
+ inodedep->id_bmsafemap = NULL;
+ inodedep->id_mkdiradd = NULL;
+ LIST_INIT(&inodedep->id_dirremhd);
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
+ TAILQ_INIT(&inodedep->id_inoreflst);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
TAILQ_INIT(&inodedep->id_extupdt);
@@ -1336,17 +1733,29 @@ u_long newblk_hash; /* size of hash table - 1 */
(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static int
-newblk_find(newblkhd, fs, newblkno, newblkpp)
+newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
struct newblk_hashhead *newblkhd;
- struct fs *fs;
+ struct mount *mp;
ufs2_daddr_t newblkno;
+ int flags;
struct newblk **newblkpp;
{
struct newblk *newblk;
- LIST_FOREACH(newblk, newblkhd, nb_hash)
- if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
- break;
+ LIST_FOREACH(newblk, newblkhd, nb_hash) {
+ if (newblkno != newblk->nb_newblkno)
+ continue;
+ if (mp != newblk->nb_list.wk_mp)
+ continue;
+ /*
+ * If we're creating a new dependency don't match those that
+ * have already been converted to allocdirects. This is for
+ * a frag extend.
+ */
+ if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
+ continue;
+ break;
+ }
if (newblk) {
*newblkpp = newblk;
return (1);
@@ -1361,8 +1770,8 @@ newblk_find(newblkhd, fs, newblkno, newblkpp)
* Found or allocated entry is returned in newblkpp.
*/
static int
-newblk_lookup(fs, newblkno, flags, newblkpp)
- struct fs *fs;
+newblk_lookup(mp, newblkno, flags, newblkpp)
+ struct mount *mp;
ufs2_daddr_t newblkno;
int flags;
struct newblk **newblkpp;
@@ -1370,21 +1779,25 @@ newblk_lookup(fs, newblkno, flags, newblkpp)
struct newblk *newblk;
struct newblk_hashhead *newblkhd;
- newblkhd = NEWBLK_HASH(fs, newblkno);
- if (newblk_find(newblkhd, fs, newblkno, newblkpp))
+ newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
+ if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
return (1);
if ((flags & DEPALLOC) == 0)
return (0);
FREE_LOCK(&lk);
- newblk = malloc(sizeof(struct newblk),
- M_NEWBLK, M_SOFTDEP_FLAGS);
+ newblk = malloc(sizeof(union allblk), M_NEWBLK,
+ M_SOFTDEP_FLAGS | M_ZERO);
+ workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
ACQUIRE_LOCK(&lk);
- if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
- free(newblk, M_NEWBLK);
+ if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
+ WORKITEM_FREE(newblk, D_NEWBLK);
return (1);
}
- newblk->nb_state = 0;
- newblk->nb_fs = fs;
+ newblk->nb_freefrag = NULL;
+ LIST_INIT(&newblk->nb_indirdeps);
+ LIST_INIT(&newblk->nb_newdirblk);
+ LIST_INIT(&newblk->nb_jwork);
+ newblk->nb_state = ATTACHED;
newblk->nb_newblkno = newblkno;
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
*newblkpp = newblk;
@@ -1401,10 +1814,10 @@ softdep_initialize()
LIST_INIT(&mkdirlisthd);
max_softdeps = desiredvnodes * 4;
- pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
- &pagedep_hash);
+ pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
- newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
+ newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash);
+ bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
/* initialise bioops hack */
bioops.io_start = softdep_disk_io_initiation;
@@ -1428,6 +1841,7 @@ softdep_uninitialize()
hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
+ hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
}
/*
@@ -1457,9 +1871,16 @@ softdep_mount(devvp, mp, fs, cred)
MNT_IUNLOCK(mp);
ump = VFSTOUFS(mp);
LIST_INIT(&ump->softdep_workitem_pending);
+ LIST_INIT(&ump->softdep_journal_pending);
+ TAILQ_INIT(&ump->softdep_unlinked);
ump->softdep_worklist_tail = NULL;
ump->softdep_on_worklist = 0;
ump->softdep_deps = 0;
+ if ((fs->fs_flags & FS_SUJ) &&
+ (error = journal_mount(mp, fs, cred)) != 0) {
+ printf("Failed to start journal: %d\n", error);
+ return (error);
+ }
/*
* When doing soft updates, the counters in the
* superblock may have gotten out of sync. Recomputation
@@ -1493,6 +1914,2019 @@ softdep_mount(devvp, mp, fs, cred)
return (0);
}
+void
+softdep_unmount(mp)
+ struct mount *mp;
+{
+
+ if (mp->mnt_kern_flag & MNTK_SUJ)
+ journal_unmount(mp);
+}
+
+struct jblocks {
+ struct jseglst jb_segs; /* TAILQ of current segments. */
+ struct jseg *jb_writeseg; /* Next write to complete. */
+ struct jextent *jb_extent; /* Extent array. */
+ uint64_t jb_nextseq; /* Next sequence number. */
+ uint64_t jb_oldestseq; /* Oldest active sequence number. */
+ int jb_avail; /* Available extents. */
+ int jb_used; /* Last used extent. */
+ int jb_head; /* Allocator head. */
+ int jb_off; /* Allocator extent offset. */
+ int jb_blocks; /* Total disk blocks covered. */
+ int jb_free; /* Total disk blocks free. */
+ int jb_min; /* Minimum free space. */
+ int jb_low; /* Low on space. */
+ int jb_age; /* Insertion time of oldest rec. */
+ int jb_suspended; /* Did journal suspend writes? */
+};
+
+struct jextent {
+ ufs2_daddr_t je_daddr; /* Disk block address. */
+ int je_blocks; /* Disk block count. */
+};
+
+static struct jblocks *
+jblocks_create(void)
+{
+ struct jblocks *jblocks;
+
+ jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&jblocks->jb_segs);
+ jblocks->jb_avail = 10;
+ jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
+ M_JBLOCKS, M_WAITOK | M_ZERO);
+
+ return (jblocks);
+}
+
+static ufs2_daddr_t
+jblocks_alloc(jblocks, bytes, actual)
+ struct jblocks *jblocks;
+ int bytes;
+ int *actual;
+{
+ ufs2_daddr_t daddr;
+ struct jextent *jext;
+ int freecnt;
+ int blocks;
+
+ blocks = bytes / DEV_BSIZE;
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks - jblocks->jb_off;
+ if (freecnt == 0) {
+ jblocks->jb_off = 0;
+ if (++jblocks->jb_head > jblocks->jb_used)
+ jblocks->jb_head = 0;
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks;
+ }
+ if (freecnt > blocks)
+ freecnt = blocks;
+ *actual = freecnt * DEV_BSIZE;
+ daddr = jext->je_daddr + jblocks->jb_off;
+ jblocks->jb_off += freecnt;
+ jblocks->jb_free -= freecnt;
+
+ return (daddr);
+}
+
+static void
+jblocks_free(jblocks, mp, bytes)
+ struct jblocks *jblocks;
+ struct mount *mp;
+ int bytes;
+{
+
+ jblocks->jb_free += bytes / DEV_BSIZE;
+ if (jblocks->jb_suspended)
+ worklist_speedup();
+ wakeup(jblocks);
+}
+
+static void
+jblocks_destroy(jblocks)
+ struct jblocks *jblocks;
+{
+
+ if (jblocks->jb_extent)
+ free(jblocks->jb_extent, M_JBLOCKS);
+ free(jblocks, M_JBLOCKS);
+}
+
+static void
+jblocks_add(jblocks, daddr, blocks)
+ struct jblocks *jblocks;
+ ufs2_daddr_t daddr;
+ int blocks;
+{
+ struct jextent *jext;
+
+ jblocks->jb_blocks += blocks;
+ jblocks->jb_free += blocks;
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ /* Adding the first block. */
+ if (jext->je_daddr == 0) {
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+ return;
+ }
+ /* Extending the last extent. */
+ if (jext->je_daddr + jext->je_blocks == daddr) {
+ jext->je_blocks += blocks;
+ return;
+ }
+ /* Adding a new extent. */
+ if (++jblocks->jb_used == jblocks->jb_avail) {
+ jblocks->jb_avail *= 2;
+ jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
+ M_JBLOCKS, M_WAITOK | M_ZERO);
+ memcpy(jext, jblocks->jb_extent,
+ sizeof(struct jextent) * jblocks->jb_used);
+ free(jblocks->jb_extent, M_JBLOCKS);
+ jblocks->jb_extent = jext;
+ }
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+ return;
+}
+
+int
+softdep_journal_lookup(mp, vpp)
+ struct mount *mp;
+ struct vnode **vpp;
+{
+ struct componentname cnp;
+ struct vnode *dvp;
+ ino_t sujournal;
+ int error;
+
+ error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
+ if (error)
+ return (error);
+ bzero(&cnp, sizeof(cnp));
+ cnp.cn_nameiop = LOOKUP;
+ cnp.cn_flags = ISLASTCN;
+ cnp.cn_thread = curthread;
+ cnp.cn_cred = curthread->td_ucred;
+ cnp.cn_pnbuf = SUJ_FILE;
+ cnp.cn_nameptr = SUJ_FILE;
+ cnp.cn_namelen = strlen(SUJ_FILE);
+ error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
+ vput(dvp);
+ if (error != 0)
+ return (error);
+ error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
+ return (error);
+}
+
+/*
+ * Open and verify the journal file.
+ */
+static int
+journal_mount(mp, fs, cred)
+ struct mount *mp;
+ struct fs *fs;
+ struct ucred *cred;
+{
+ struct jblocks *jblocks;
+ struct vnode *vp;
+ struct inode *ip;
+ ufs2_daddr_t blkno;
+ int bcount;
+ int error;
+ int i;
+
+ mp->mnt_kern_flag |= MNTK_SUJ;
+ error = softdep_journal_lookup(mp, &vp);
+ if (error != 0) {
+ printf("Failed to find journal. Use tunefs to create one\n");
+ return (error);
+ }
+ ip = VTOI(vp);
+ if (ip->i_size < SUJ_MIN) {
+ error = ENOSPC;
+ goto out;
+ }
+ bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */
+ jblocks = jblocks_create();
+ for (i = 0; i < bcount; i++) {
+ error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
+ if (error)
+ break;
+ jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
+ }
+ if (error) {
+ jblocks_destroy(jblocks);
+ goto out;
+ }
+ jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
+ jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
+ /*
+ * Only validate the journal contents if the filesystem is clean,
+ * otherwise we write the logs but they'll never be used. If the
+ * filesystem was still dirty when we mounted it the journal is
+ * invalid and a new journal can only be valid if it starts from a
+ * clean mount.
+ */
+ if (fs->fs_clean) {
+ DIP_SET(ip, i_modrev, fs->fs_mtime);
+ ip->i_flags |= IN_MODIFIED;
+ ffs_update(vp, 1);
+ }
+ VFSTOUFS(mp)->softdep_jblocks = jblocks;
+out:
+ vput(vp);
+ return (error);
+}
+
+static void
+journal_unmount(mp)
+ struct mount *mp;
+{
+ struct ufsmount *ump;
+
+ ump = VFSTOUFS(mp);
+ if (ump->softdep_jblocks)
+ jblocks_destroy(ump->softdep_jblocks);
+ ump->softdep_jblocks = NULL;
+}
+
+/*
+ * Called when a journal record is ready to be written. Space is allocated
+ * and the journal entry is created when the journal is flushed to stable
+ * store.
+ */
+static void
+add_to_journal(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+
+ mtx_assert(&lk, MA_OWNED);
+ ump = VFSTOUFS(wk->wk_mp);
+ if (wk->wk_state & ONWORKLIST)
+ panic("add_to_journal: %s(0x%X) already on list",
+ TYPENAME(wk->wk_type), wk->wk_state);
+ wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
+ if (LIST_EMPTY(&ump->softdep_journal_pending)) {
+ ump->softdep_jblocks->jb_age = ticks;
+ LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
+ } else
+ LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
+ ump->softdep_journal_tail = wk;
+ ump->softdep_on_journal += 1;
+}
+
+/*
+ * Remove an arbitrary item for the journal worklist maintain the tail
+ * pointer. This happens when a new operation obviates the need to
+ * journal an old operation.
+ */
+static void
+remove_from_journal(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+
+ mtx_assert(&lk, MA_OWNED);
+ ump = VFSTOUFS(wk->wk_mp);
+#ifdef DEBUG /* XXX Expensive, temporary. */
+ {
+ struct worklist *wkn;
+
+ LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
+ if (wkn == wk)
+ break;
+ if (wkn == NULL)
+ panic("remove_from_journal: %p is not in journal", wk);
+ }
+#endif
+ /*
+ * We emulate a TAILQ to save space in most structures which do not
+ * require TAILQ semantics. Here we must update the tail position
+ * when removing the tail which is not the final entry.
+ */
+ if (ump->softdep_journal_tail == wk)
+ ump->softdep_journal_tail =
+ (struct worklist *)wk->wk_list.le_prev;
+
+ WORKLIST_REMOVE(wk);
+ ump->softdep_on_journal -= 1;
+}
+
+/*
+ * Check for journal space as well as dependency limits so the prelink
+ * code can throttle both journaled and non-journaled filesystems.
+ * Threshold is 0 for low and 1 for min.
+ */
+static int
+journal_space(ump, thresh)
+ struct ufsmount *ump;
+ int thresh;
+{
+ struct jblocks *jblocks;
+ int avail;
+
+ /*
+ * We use a tighter restriction here to prevent request_cleanup()
+ * running in threads from running into locks we currently hold.
+ */
+ if (num_inodedep > (max_softdeps / 10) * 9)
+ return (0);
+
+ jblocks = ump->softdep_jblocks;
+ if (jblocks == NULL)
+ return (1);
+ if (thresh)
+ thresh = jblocks->jb_min;
+ else
+ thresh = jblocks->jb_low;
+ avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
+ avail = jblocks->jb_free - avail;
+
+ return (avail > thresh);
+}
+
+static void
+journal_suspend(ump)
+ struct ufsmount *ump;
+{
+ struct jblocks *jblocks;
+ struct mount *mp;
+
+ mp = UFSTOVFS(ump);
+ jblocks = ump->softdep_jblocks;
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
+ stat_journal_min++;
+ mp->mnt_kern_flag |= MNTK_SUSPEND;
+ mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
+ }
+ jblocks->jb_suspended = 1;
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Called before any allocation function to be certain that there is
+ * sufficient space in the journal prior to creating any new records.
+ * Since in the case of block allocation we may have multiple locked
+ * buffers at the time of the actual allocation we can not block
+ * when the journal records are created. Doing so would create a deadlock
+ * if any of these buffers needed to be flushed to reclaim space. Instead
+ * we require a sufficiently large amount of available space such that
+ * each thread in the system could have passed this allocation check and
+ * still have sufficient free space. With 20% of a minimum journal size
+ * of 1MB we have 6553 records available.
+ */
+int
+softdep_prealloc(vp, waitok)
+ struct vnode *vp;
+ int waitok;
+{
+ struct ufsmount *ump;
+
+ if (DOINGSUJ(vp) == 0)
+ return (0);
+ ump = VFSTOUFS(vp->v_mount);
+ ACQUIRE_LOCK(&lk);
+ if (journal_space(ump, 0)) {
+ FREE_LOCK(&lk);
+ return (0);
+ }
+ stat_journal_low++;
+ FREE_LOCK(&lk);
+ if (waitok == MNT_NOWAIT)
+ return (ENOSPC);
+ /*
+ * Attempt to sync this vnode once to flush any journal
+ * work attached to it.
+ */
+ ffs_syncvnode(vp, waitok);
+ ACQUIRE_LOCK(&lk);
+ process_removes(vp);
+ if (journal_space(ump, 0) == 0) {
+ softdep_speedup();
+ if (journal_space(ump, 1) == 0)
+ journal_suspend(ump);
+ }
+ FREE_LOCK(&lk);
+
+ return (0);
+}
+
+/*
+ * Before adjusting a link count on a vnode verify that we have sufficient
+ * journal space. If not, process operations that depend on the currently
+ * locked pair of vnodes to try to flush space as the syncer, buf daemon,
+ * and softdep flush threads can not acquire these locks to reclaim space.
+ */
+static void
+softdep_prelink(dvp, vp)
+ struct vnode *dvp;
+ struct vnode *vp;
+{
+ struct ufsmount *ump;
+
+ ump = VFSTOUFS(dvp->v_mount);
+ mtx_assert(&lk, MA_OWNED);
+ if (journal_space(ump, 0))
+ return;
+ stat_journal_low++;
+ FREE_LOCK(&lk);
+ if (vp)
+ ffs_syncvnode(vp, MNT_NOWAIT);
+ ffs_syncvnode(dvp, MNT_WAIT);
+ ACQUIRE_LOCK(&lk);
+ /* Process vp before dvp as it may create .. removes. */
+ if (vp)
+ process_removes(vp);
+ process_removes(dvp);
+ softdep_speedup();
+ process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
+ process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
+ if (journal_space(ump, 0) == 0) {
+ softdep_speedup();
+ if (journal_space(ump, 1) == 0)
+ journal_suspend(ump);
+ }
+}
+
+static void
+jseg_write(fs, jblocks, jseg, data)
+ struct fs *fs;
+ struct jblocks *jblocks;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jsegrec *rec;
+
+ rec = (struct jsegrec *)data;
+ rec->jsr_seq = jseg->js_seq;
+ rec->jsr_oldest = jblocks->jb_oldestseq;
+ rec->jsr_cnt = jseg->js_cnt;
+ rec->jsr_blocks = jseg->js_size / DEV_BSIZE;
+ rec->jsr_crc = 0;
+ rec->jsr_time = fs->fs_mtime;
+}
+
+static inline void
+inoref_write(inoref, jseg, rec)
+ struct inoref *inoref;
+ struct jseg *jseg;
+ struct jrefrec *rec;
+{
+
+ inoref->if_jsegdep->jd_seg = jseg;
+ rec->jr_ino = inoref->if_ino;
+ rec->jr_parent = inoref->if_parent;
+ rec->jr_nlink = inoref->if_nlink;
+ rec->jr_mode = inoref->if_mode;
+ rec->jr_diroff = inoref->if_diroff;
+}
+
+/*
+ * Write a link-addition (JOP_ADDREF) record; the common inoref fields
+ * carry the inode, parent, link count, mode, and directory offset.
+ */
+static void
+jaddref_write(jaddref, jseg, data)
+	struct jaddref *jaddref;
+	struct jseg *jseg;
+	uint8_t *data;
+{
+	struct jrefrec *rec;
+
+	rec = (struct jrefrec *)data;
+	rec->jr_op = JOP_ADDREF;
+	inoref_write(&jaddref->ja_ref, jseg, rec);
+}
+
+/*
+ * Write a link-removal (JOP_REMREF) record using the shared inoref
+ * fields.
+ */
+static void
+jremref_write(jremref, jseg, data)
+	struct jremref *jremref;
+	struct jseg *jseg;
+	uint8_t *data;
+{
+	struct jrefrec *rec;
+
+	rec = (struct jrefrec *)data;
+	rec->jr_op = JOP_REMREF;
+	inoref_write(&jremref->jr_ref, jseg, rec);
+}
+
+/*
+ * Write a directory-entry move (JOP_MVREF) record describing the old
+ * and new offsets of the entry within its parent directory.
+ */
+static void
+jmvref_write(jmvref, jseg, data)
+	struct jmvref *jmvref;
+	struct jseg *jseg;
+	uint8_t *data;
+{
+	struct jmvrec *mrec;
+
+	mrec = (struct jmvrec *)data;
+	mrec->jm_op = JOP_MVREF;
+	mrec->jm_newoff = jmvref->jm_newoff;
+	mrec->jm_oldoff = jmvref->jm_oldoff;
+	mrec->jm_parent = jmvref->jm_parent;
+	mrec->jm_ino = jmvref->jm_ino;
+}
+
+/*
+ * Write a block-allocation (JOP_NEWBLK) record and tie the jnewblk's
+ * jsegdep to this segment.
+ */
+static void
+jnewblk_write(jnewblk, jseg, data)
+	struct jnewblk *jnewblk;
+	struct jseg *jseg;
+	uint8_t *data;
+{
+	struct jblkrec *rec;
+
+	jnewblk->jn_jsegdep->jd_seg = jseg;
+	rec = (struct jblkrec *)data;
+	rec->jb_op = JOP_NEWBLK;
+	rec->jb_ino = jnewblk->jn_ino;
+	rec->jb_blkno = jnewblk->jn_blkno;
+	rec->jb_lbn = jnewblk->jn_lbn;
+	rec->jb_frags = jnewblk->jn_frags;
+	rec->jb_oldfrags = jnewblk->jn_oldfrags;
+}
+
+/*
+ * Write a block-free (JOP_FREEBLK) record and tie the jfreeblk's
+ * jsegdep to this segment.  Whole blocks are freed here, so the
+ * old-fragment count is always zero.
+ */
+static void
+jfreeblk_write(jfreeblk, jseg, data)
+	struct jfreeblk *jfreeblk;
+	struct jseg *jseg;
+	uint8_t *data;
+{
+	struct jblkrec *rec;
+
+	jfreeblk->jf_jsegdep->jd_seg = jseg;
+	rec = (struct jblkrec *)data;
+	rec->jb_op = JOP_FREEBLK;
+	rec->jb_ino = jfreeblk->jf_ino;
+	rec->jb_blkno = jfreeblk->jf_blkno;
+	rec->jb_lbn = jfreeblk->jf_lbn;
+	rec->jb_frags = jfreeblk->jf_frags;
+	rec->jb_oldfrags = 0;
+}
+
+/*
+ * Write a fragment-free record.  Fragment frees share the JOP_FREEBLK
+ * opcode with whole-block frees; only the fragment count differs.
+ */
+static void
+jfreefrag_write(jfreefrag, jseg, data)
+	struct jfreefrag *jfreefrag;
+	struct jseg *jseg;
+	uint8_t *data;
+{
+	struct jblkrec *rec;
+
+	jfreefrag->fr_jsegdep->jd_seg = jseg;
+	rec = (struct jblkrec *)data;
+	rec->jb_op = JOP_FREEBLK;
+	rec->jb_ino = jfreefrag->fr_ino;
+	rec->jb_blkno = jfreefrag->fr_blkno;
+	rec->jb_lbn = jfreefrag->fr_lbn;
+	rec->jb_frags = jfreefrag->fr_frags;
+	rec->jb_oldfrags = 0;
+}
+
+/*
+ * Write a truncation (JOP_TRUNC) record carrying the new data and
+ * extent sizes of the inode being truncated.
+ */
+static void
+jtrunc_write(jtrunc, jseg, data)
+	struct jtrunc *jtrunc;
+	struct jseg *jseg;
+	uint8_t *data;
+{
+	struct jtrncrec *trec;
+
+	trec = (struct jtrncrec *)data;
+	trec->jt_op = JOP_TRUNC;
+	trec->jt_extsize = jtrunc->jt_extsize;
+	trec->jt_size = jtrunc->jt_size;
+	trec->jt_ino = jtrunc->jt_ino;
+}
+
+/*
+ * Flush some journal records to disk.
+ *
+ * Called with the softdep lock (lk) held; the lock is dropped around
+ * buffer allocation and the write itself and re-acquired afterwards.
+ * With MNT_WAIT the caller is guaranteed forward progress even when
+ * journal space is exhausted; with MNT_NOWAIT we write what we can.
+ */
+static void
+softdep_process_journal(mp, flags)
+	struct mount *mp;
+	int flags;
+{
+	struct jblocks *jblocks;
+	struct ufsmount *ump;
+	struct worklist *wk;
+	struct jseg *jseg;
+	struct buf *bp;
+	uint8_t *data;
+	struct fs *fs;
+	int segwritten;
+	int jrecmin;	/* Minimum records per block. */
+	int jrecmax;	/* Maximum records per block. */
+	int size;
+	int cnt;
+	int off;
+
+	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
+		return;
+	ump = VFSTOUFS(mp);
+	fs = ump->um_fs;
+	jblocks = ump->softdep_jblocks;
+	/*
+	 * We write anywhere between a disk block and fs block.  The upper
+	 * bound is picked to prevent buffer cache fragmentation and limit
+	 * processing time per I/O.
+	 */
+	jrecmin = (DEV_BSIZE / JREC_SIZE) - 1; /* -1 for seg header */
+	jrecmax = (fs->fs_bsize / DEV_BSIZE) * jrecmin;
+	segwritten = 0;
+	while ((cnt = ump->softdep_on_journal) != 0) {
+		/*
+		 * Create a new segment to hold as many as 'cnt' journal
+		 * entries and add them to the segment.  Notice cnt is
+		 * off by one to account for the space required by the
+		 * jsegrec.  If we don't have a full block to log skip it
+		 * unless we haven't written anything.
+		 */
+		cnt++;
+		if (cnt < jrecmax && segwritten)
+			break;
+		/*
+		 * Verify some free journal space.  softdep_prealloc() should
+	 	 * guarantee that we don't run out so this is indicative of
+		 * a problem with the flow control.  Try to recover
+		 * gracefully in any event.
+		 */
+		while (jblocks->jb_free == 0) {
+			if (flags != MNT_WAIT)
+				break;
+			printf("softdep: Out of journal space!\n");
+			softdep_speedup();
+			msleep(jblocks, &lk, PRIBIO, "jblocks", 1);
+		}
+		FREE_LOCK(&lk);
+		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
+		workitem_alloc(&jseg->js_list, D_JSEG, mp);
+		LIST_INIT(&jseg->js_entries);
+		jseg->js_state = ATTACHED;
+		jseg->js_jblocks = jblocks;
+		/* Buffer is sized for the largest segment we may write. */
+		bp = geteblk(fs->fs_bsize, 0);
+		ACQUIRE_LOCK(&lk);
+		/*
+		 * If there was a race while we were allocating the block
+		 * and jseg the entry we care about was likely written.
+		 * We bail out in both the WAIT and NOWAIT case and assume
+		 * the caller will loop if the entry it cares about is
+		 * not written.
+		 */
+		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
+			bp->b_flags |= B_INVAL | B_NOCACHE;
+			WORKITEM_FREE(jseg, D_JSEG);
+			FREE_LOCK(&lk);
+			brelse(bp);
+			ACQUIRE_LOCK(&lk);
+			break;
+		}
+		/*
+		 * Calculate the disk block size required for the available
+		 * records rounded to the min size.
+		 */
+		cnt = ump->softdep_on_journal;
+		if (cnt < jrecmax)
+			size = howmany(cnt, jrecmin) * DEV_BSIZE;
+		else
+			size = fs->fs_bsize;
+		/*
+		 * Allocate a disk block for this journal data and account
+		 * for truncation of the requested size if enough contiguous
+		 * space was not available.
+		 */
+		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
+		bp->b_lblkno = bp->b_blkno;
+		bp->b_offset = bp->b_blkno * DEV_BSIZE;
+		bp->b_bcount = size;
+		bp->b_bufobj = &ump->um_devvp->v_bufobj;
+		bp->b_flags &= ~B_INVAL;
+		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
+		/*
+		 * Initialize our jseg with cnt records.  Assign the next
+		 * sequence number to it and link it in-order.
+		 */
+		cnt = MIN(ump->softdep_on_journal,
+		    (size / DEV_BSIZE) * jrecmin);
+		jseg->js_buf = bp;
+		jseg->js_cnt = cnt;
+		jseg->js_refs = cnt + 1;	/* Self ref. */
+		jseg->js_size = size;
+		jseg->js_seq = jblocks->jb_nextseq++;
+		if (TAILQ_EMPTY(&jblocks->jb_segs))
+			jblocks->jb_oldestseq = jseg->js_seq;
+		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
+		if (jblocks->jb_writeseg == NULL)
+			jblocks->jb_writeseg = jseg;
+		/*
+		 * Start filling in records from the pending list.
+		 */
+		data = bp->b_data;
+		off = 0;
+		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
+		    != NULL) {
+			/* Place a segment header on every device block. */
+			if ((off % DEV_BSIZE) == 0) {
+				jseg_write(fs, jblocks, jseg, data);
+				off += JREC_SIZE;
+				data = bp->b_data + off;
+			}
+			remove_from_journal(wk);
+			wk->wk_state |= IOSTARTED;
+			WORKLIST_INSERT(&jseg->js_entries, wk);
+			switch (wk->wk_type) {
+			case D_JADDREF:
+				jaddref_write(WK_JADDREF(wk), jseg, data);
+				break;
+			case D_JREMREF:
+				jremref_write(WK_JREMREF(wk), jseg, data);
+				break;
+			case D_JMVREF:
+				jmvref_write(WK_JMVREF(wk), jseg, data);
+				break;
+			case D_JNEWBLK:
+				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
+				break;
+			case D_JFREEBLK:
+				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
+				break;
+			case D_JFREEFRAG:
+				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
+				break;
+			case D_JTRUNC:
+				jtrunc_write(WK_JTRUNC(wk), jseg, data);
+				break;
+			default:
+				panic("process_journal: Unknown type %s",
+				    TYPENAME(wk->wk_type));
+				/* NOTREACHED */
+			}
+			if (--cnt == 0)
+				break;
+			off += JREC_SIZE;
+			data = bp->b_data + off;
+		}
+		/*
+		 * Write this one buffer and continue.
+		 */
+		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
+		FREE_LOCK(&lk);
+		BO_LOCK(bp->b_bufobj);
+		bgetvp(ump->um_devvp, bp);
+		BO_UNLOCK(bp->b_bufobj);
+		if (flags == MNT_NOWAIT)
+			bawrite(bp);
+		else
+			bwrite(bp);
+		ACQUIRE_LOCK(&lk);
+	}
+	/*
+	 * If we've suspended the filesystem because we ran out of journal
+	 * space either try to sync it here to make some progress or
+	 * unsuspend it if we already have.
+	 */
+	if (flags == 0 && jblocks && jblocks->jb_suspended) {
+		if (journal_space(ump, jblocks->jb_min)) {
+			FREE_LOCK(&lk);
+			jblocks->jb_suspended = 0;
+			mp->mnt_susp_owner = curthread;
+			vfs_write_resume(mp);
+			ACQUIRE_LOCK(&lk);
+			return;
+		}
+		FREE_LOCK(&lk);
+		VFS_SYNC(mp, MNT_NOWAIT);
+		ffs_sbupdate(ump, MNT_WAIT, 0);
+		ACQUIRE_LOCK(&lk);
+	}
+}
+
+/*
+ * Complete a jseg, allowing all dependencies awaiting journal writes
+ * to proceed.  Each journal dependency also attaches a jsegdep to dependent
+ * structures so that the journal segment can be freed to reclaim space.
+ *
+ * 'i' counts the entries processed so the KASSERT can detect a segment
+ * holding more entries than js_cnt claims.  Previously the counter was
+ * never incremented, so the assertion compared a constant zero and could
+ * never fire.
+ */
+static void
+complete_jseg(jseg)
+	struct jseg *jseg;
+{
+	struct worklist *wk;
+	struct jmvref *jmvref;
+	int waiting;
+	int i;
+
+	i = 0;
+	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
+		WORKLIST_REMOVE(wk);
+		/* Remember whether a thread is sleeping on this entry. */
+		waiting = wk->wk_state & IOWAITING;
+		wk->wk_state &= ~(IOSTARTED | IOWAITING);
+		wk->wk_state |= COMPLETE;
+		KASSERT(i < jseg->js_cnt,
+		    ("handle_written_jseg: overflow %d >= %d",
+		     i, jseg->js_cnt));
+		i++;	/* Count this entry against js_cnt. */
+		switch (wk->wk_type) {
+		case D_JADDREF:
+			handle_written_jaddref(WK_JADDREF(wk));
+			break;
+		case D_JREMREF:
+			handle_written_jremref(WK_JREMREF(wk));
+			break;
+		case D_JMVREF:
+			/* No jsegdep here. */
+			free_jseg(jseg);
+			jmvref = WK_JMVREF(wk);
+			LIST_REMOVE(jmvref, jm_deps);
+			free_pagedep(jmvref->jm_pagedep);
+			WORKITEM_FREE(jmvref, D_JMVREF);
+			break;
+		case D_JNEWBLK:
+			handle_written_jnewblk(WK_JNEWBLK(wk));
+			break;
+		case D_JFREEBLK:
+			handle_written_jfreeblk(WK_JFREEBLK(wk));
+			break;
+		case D_JFREEFRAG:
+			handle_written_jfreefrag(WK_JFREEFRAG(wk));
+			break;
+		case D_JTRUNC:
+			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
+			WORKITEM_FREE(wk, D_JTRUNC);
+			break;
+		default:
+			panic("handle_written_jseg: Unknown type %s",
+			    TYPENAME(wk->wk_type));
+			/* NOTREACHED */
+		}
+		if (waiting)
+			wakeup(wk);
+	}
+	/* Release the self reference so the structure may be freed. */
+	free_jseg(jseg);
+}
+
+/*
+ * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
+ * completions in order only.
+ *
+ * Called from buffer-write completion with 'bp' holding the journal
+ * data that just made it to disk.
+ */
+static void
+handle_written_jseg(jseg, bp)
+	struct jseg *jseg;
+	struct buf *bp;
+{
+	struct jblocks *jblocks;
+	struct jseg *jsegn;
+
+	if (jseg->js_refs == 0)
+		panic("handle_written_jseg: No self-reference on %p", jseg);
+	jseg->js_state |= DEPCOMPLETE;
+	/*
+	 * We'll never need this buffer again, set flags so it will be
+	 * discarded.
+	 */
+	bp->b_flags |= B_INVAL | B_NOCACHE;
+	jblocks = jseg->js_jblocks;
+	/*
+	 * Don't allow out of order completions.  If this isn't the first
+	 * block wait for it to write before we're done.
+	 */
+	if (jseg != jblocks->jb_writeseg)
+		return;
+	/* Iterate through available jsegs processing their entries. */
+	do {
+		jsegn = TAILQ_NEXT(jseg, js_next);
+		complete_jseg(jseg);
+		jseg = jsegn;
+	} while (jseg && jseg->js_state & DEPCOMPLETE);
+	jblocks->jb_writeseg = jseg;
+}
+
+/*
+ * Detach and return the jsegdep attached to an inoref, leaving the
+ * inoref with no segment dependency.
+ */
+static inline struct jsegdep *
+inoref_jseg(inoref)
+	struct inoref *inoref;
+{
+	struct jsegdep *jd;
+
+	jd = inoref->if_jsegdep;
+	inoref->if_jsegdep = NULL;
+	return (jd);
+}
+
+/*
+ * Called once a jremref has made it to stable store.  The jremref is marked
+ * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
+ * for the jremref to complete will be awoken by free_jremref.
+ */
+static void
+handle_written_jremref(jremref)
+	struct jremref *jremref;
+{
+	struct inodedep *inodedep;
+	struct jsegdep *jsegdep;
+	struct dirrem *dirrem;
+
+	/* Grab the jsegdep. */
+	jsegdep = inoref_jseg(&jremref->jr_ref);
+	/*
+	 * Remove us from the inoref list.
+	 */
+	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
+	    0, &inodedep) == 0)
+		panic("handle_written_jremref: Lost inodedep");
+	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+	/*
+	 * Complete the dirrem.
+	 */
+	dirrem = jremref->jr_dirrem;
+	jremref->jr_dirrem = NULL;
+	LIST_REMOVE(jremref, jr_deps);
+	/* Propagate MKDIR_PARENT so the remove handler sees a .. removal. */
+	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
+	WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
+	/* Schedule the dirrem once all of its jremrefs have been written. */
+	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
+	    (dirrem->dm_state & COMPLETE) != 0)
+		add_to_worklist(&dirrem->dm_list, 0);
+	free_jremref(jremref);
+}
+
+/*
+ * Called once a jaddref has made it to stable store.  The dependency is
+ * marked complete and any dependent structures are added to the inode
+ * bufwait list to be completed as soon as it is written.  If a bitmap write
+ * depends on this entry we move the inode into the inodedephd of the
+ * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
+ */
+static void
+handle_written_jaddref(jaddref)
+	struct jaddref *jaddref;
+{
+	struct jsegdep *jsegdep;
+	struct inodedep *inodedep;
+	struct diradd *diradd;
+	struct mkdir *mkdir;
+
+	/* Grab the jsegdep. */
+	jsegdep = inoref_jseg(&jaddref->ja_ref);
+	mkdir = NULL;
+	diradd = NULL;
+	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
+	    0, &inodedep) == 0)
+		panic("handle_written_jaddref: Lost inodedep.");
+	if (jaddref->ja_diradd == NULL)
+		panic("handle_written_jaddref: No dependency");
+	/* ja_diradd and ja_mkdir share storage; distinguish by wk_type. */
+	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
+		diradd = jaddref->ja_diradd;
+		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
+	} else if (jaddref->ja_state & MKDIR_PARENT) {
+		mkdir = jaddref->ja_mkdir;
+		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
+	} else if (jaddref->ja_state & MKDIR_BODY)
+		mkdir = jaddref->ja_mkdir;
+	else
+		panic("handle_written_jaddref: Unknown dependency %p",
+		    jaddref->ja_diradd);
+	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
+	/*
+	 * Remove us from the inode list.
+	 */
+	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
+	/*
+	 * The mkdir may be waiting on the jaddref to clear before freeing.
+	 */
+	if (mkdir) {
+		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
+		    ("handle_written_jaddref: Incorrect type for mkdir %s",
+		    TYPENAME(mkdir->md_list.wk_type)));
+		mkdir->md_jaddref = NULL;
+		diradd = mkdir->md_diradd;
+		mkdir->md_state |= DEPCOMPLETE;
+		complete_mkdir(mkdir);
+	}
+	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
+	/* A bitmap write still depends on this inode; link into bmsafemap. */
+	if (jaddref->ja_state & NEWBLOCK) {
+		inodedep->id_state |= ONDEPLIST;
+		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
+		    inodedep, id_deps);
+	}
+	free_jaddref(jaddref);
+}
+
+/*
+ * Called once a jnewblk journal is written.  The allocdirect or allocindir
+ * is placed in the bmsafemap to await notification of a written bitmap.
+ */
+static void
+handle_written_jnewblk(jnewblk)
+	struct jnewblk *jnewblk;
+{
+	struct bmsafemap *bmsafemap;
+	struct jsegdep *jsegdep;
+	struct newblk *newblk;
+
+	/* Grab the jsegdep. */
+	jsegdep = jnewblk->jn_jsegdep;
+	jnewblk->jn_jsegdep = NULL;
+	/*
+	 * Add the written block to the bmsafemap so it can be notified when
+	 * the bitmap is on disk.
+	 */
+	newblk = jnewblk->jn_newblk;
+	jnewblk->jn_newblk = NULL;
+	if (newblk == NULL)
+		panic("handle_written_jnewblk: No dependency for the segdep.");
+
+	newblk->nb_jnewblk = NULL;
+	bmsafemap = newblk->nb_bmsafemap;
+	/* The jsegdep is freed once the bitmap write completes. */
+	WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
+	newblk->nb_state |= ONDEPLIST;
+	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+	free_jnewblk(jnewblk);
+}
+
+/*
+ * Cancel a jfreefrag that won't be needed, probably due to colliding with
+ * an in-flight allocation that has not yet been committed.  Divorce us
+ * from the freefrag and mark it DEPCOMPLETE so that it may be added
+ * to the worklist.
+ */
+static void
+cancel_jfreefrag(jfreefrag)
+	struct jfreefrag *jfreefrag;
+{
+	struct freefrag *freefrag;
+
+	/* Release the segment dependency; no journal write is needed. */
+	if (jfreefrag->fr_jsegdep) {
+		free_jsegdep(jfreefrag->fr_jsegdep);
+		jfreefrag->fr_jsegdep = NULL;
+	}
+	freefrag = jfreefrag->fr_freefrag;
+	jfreefrag->fr_freefrag = NULL;
+	freefrag->ff_jfreefrag = NULL;
+	free_jfreefrag(jfreefrag);
+	freefrag->ff_state |= DEPCOMPLETE;
+}
+
+/*
+ * Free a jfreefrag when the parent freefrag is rendered obsolete.
+ * The caller must have detached the freefrag first.
+ */
+static void
+free_jfreefrag(jfreefrag)
+	struct jfreefrag *jfreefrag;
+{
+
+	/* Unlink from either the jseg entry list or the pending journal. */
+	if (jfreefrag->fr_state & IOSTARTED)
+		WORKLIST_REMOVE(&jfreefrag->fr_list);
+	else if (jfreefrag->fr_state & ONWORKLIST)
+		remove_from_journal(&jfreefrag->fr_list);
+	if (jfreefrag->fr_freefrag != NULL)
+		panic("free_jfreefrag: Still attached to a freefrag.");
+	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
+}
+
+/*
+ * Called when the journal write for a jfreefrag completes.  The parent
+ * freefrag is added to the worklist if this completes its dependencies.
+ */
+static void
+handle_written_jfreefrag(jfreefrag)
+	struct jfreefrag *jfreefrag;
+{
+	struct jsegdep *jsegdep;
+	struct freefrag *freefrag;
+
+	/* Grab the jsegdep. */
+	jsegdep = jfreefrag->fr_jsegdep;
+	jfreefrag->fr_jsegdep = NULL;
+	freefrag = jfreefrag->fr_freefrag;
+	if (freefrag == NULL)
+		panic("handle_written_jfreefrag: No freefrag.");
+	freefrag->ff_state |= DEPCOMPLETE;
+	freefrag->ff_jfreefrag = NULL;
+	/* The jsegdep rides with the freefrag until the space is released. */
+	WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
+	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
+		add_to_worklist(&freefrag->ff_list, 0);
+	jfreefrag->fr_freefrag = NULL;
+	free_jfreefrag(jfreefrag);
+}
+
+/*
+ * Called when the journal write for a jfreeblk completes.  The jfreeblk
+ * is removed from the freeblks list of pending journal writes and the
+ * jsegdep is moved to the freeblks jwork to be completed when all blocks
+ * have been reclaimed.
+ */
+static void
+handle_written_jfreeblk(jfreeblk)
+	struct jfreeblk *jfreeblk;
+{
+	struct freeblks *freeblks;
+	struct jsegdep *jsegdep;
+
+	/* Grab the jsegdep. */
+	jsegdep = jfreeblk->jf_jsegdep;
+	jfreeblk->jf_jsegdep = NULL;
+	freeblks = jfreeblk->jf_freeblks;
+	LIST_REMOVE(jfreeblk, jf_deps);
+	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
+	/*
+	 * If the freeblks is all journaled, we can add it to the worklist.
+	 */
+	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
+	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
+		/* Remove from the b_dep that is waiting on this write. */
+		if (freeblks->fb_state & ONWORKLIST)
+			WORKLIST_REMOVE(&freeblks->fb_list);
+		add_to_worklist(&freeblks->fb_list, 1);
+	}
+
+	free_jfreeblk(jfreeblk);
+}
+
+/*
+ * Allocate a jsegdep tracking a future journal segment write on behalf
+ * of the given worklist item's mount.  The segment is assigned later,
+ * when the dependent record is written.
+ */
+static struct jsegdep *
+newjsegdep(struct worklist *wk)
+{
+	struct jsegdep *jd;
+
+	jd = malloc(sizeof(*jd), M_JSEGDEP, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jd->jd_list, D_JSEGDEP, wk->wk_mp);
+	jd->jd_seg = NULL;
+	return (jd);
+}
+
+/*
+ * Allocate a jmvref recording the move of inode 'ino' from 'oldoff' to
+ * 'newoff' within directory 'dp'.  Created ATTACHED and DEPCOMPLETE;
+ * the caller links it into the pagedep and journal.
+ */
+static struct jmvref *
+newjmvref(dp, ino, oldoff, newoff)
+	struct inode *dp;
+	ino_t ino;
+	off_t oldoff;
+	off_t newoff;
+{
+	struct jmvref *jmvref;
+
+	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
+	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
+	jmvref->jm_parent = dp->i_number;
+	jmvref->jm_ino = ino;
+	jmvref->jm_oldoff = oldoff;
+	jmvref->jm_newoff = newoff;
+
+	return (jmvref);
+}
+
+/*
+ * Allocate a new jremref that tracks the removal of ip from dp with the
+ * directory entry offset of diroff.  Mark the entry as ATTACHED and
+ * DEPCOMPLETE as we have all the information required for the journal write
+ * and the directory has already been removed from the buffer.  The caller
+ * is responsible for linking the jremref into the pagedep and adding it
+ * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
+ * a DOTDOT addition so handle_workitem_remove() can properly assign
+ * the jsegdep when we're done.
+ *
+ * 'nlink' is the link count prior to the removal.
+ */
+static struct jremref *
+newjremref(dirrem, dp, ip, diroff, nlink)
+	struct dirrem *dirrem;
+	struct inode *dp;
+	struct inode *ip;
+	off_t diroff;
+	nlink_t nlink;
+{
+	struct jremref *jremref;
+
+	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
+	jremref->jr_state = ATTACHED;
+	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
+	   nlink, ip->i_mode);
+	jremref->jr_dirrem = dirrem;
+
+	return (jremref);
+}
+
+/*
+ * Initialize the fields common to jaddref and jremref, allocating the
+ * jsegdep that will later bind the operation to a journal segment.
+ */
+static inline void
+newinoref(inoref, ino, parent, diroff, nlink, mode)
+	struct inoref *inoref;
+	ino_t ino;
+	ino_t parent;
+	off_t diroff;
+	nlink_t nlink;
+	uint16_t mode;
+{
+
+	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
+	inoref->if_diroff = diroff;
+	inoref->if_ino = ino;
+	inoref->if_parent = parent;
+	inoref->if_nlink = nlink;
+	inoref->if_mode = mode;
+}
+
+/*
+ * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
+ * directory offset may not be known until later.  The caller is responsible
+ * for adding the entry to the journal when this information is available.
+ * nlink should be the link count prior to the addition and mode is only
+ * required to have the correct FMT.
+ */
+static struct jaddref *
+newjaddref(dp, ino, diroff, nlink, mode)
+	struct inode *dp;
+	ino_t ino;
+	off_t diroff;
+	int16_t nlink;
+	uint16_t mode;
+{
+	struct jaddref *jaddref;
+
+	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
+	jaddref->ja_state = ATTACHED;
+	jaddref->ja_mkdir = NULL;
+	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
+
+	return (jaddref);
+}
+
+/*
+ * Create a new free dependency for a freework.  The caller is responsible
+ * for adjusting the reference count when it has the lock held.  The freedep
+ * tracks an outstanding bitmap write whose completion ultimately allows
+ * the freework to continue.
+ */
+static struct freedep *
+newfreedep(struct freework *freework)
+{
+	struct freedep *fd;
+
+	fd = malloc(sizeof(*fd), M_FREEDEP, M_SOFTDEP_FLAGS);
+	workitem_alloc(&fd->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
+	fd->fd_freework = freework;
+	return (fd);
+}
+
+/*
+ * Free a freedep structure once the buffer it is linked to is written.  If
+ * this is the last reference to the freework schedule it for completion.
+ */
+static void
+free_freedep(freedep)
+	struct freedep *freedep;
+{
+
+	/* Last bitmap write finished: queue the freework to proceed. */
+	if (--freedep->fd_freework->fw_ref == 0)
+		add_to_worklist(&freedep->fd_freework->fw_list, 1);
+	WORKITEM_FREE(freedep, D_FREEDEP);
+}
+
+/*
+ * Allocate a new freework structure that may be a level in an indirect
+ * when parent is not NULL or a top level block when it is.  The top level
+ * freework structures are allocated without lk held and before the freeblks
+ * is visible outside of softdep_setup_freeblocks().
+ *
+ * When 'journal' is set a jfreeblk is also created to journal the free
+ * of this top level block.
+ */
+static struct freework *
+newfreework(freeblks, parent, lbn, nb, frags, journal)
+	struct freeblks *freeblks;
+	struct freework *parent;
+	ufs_lbn_t lbn;
+	ufs2_daddr_t nb;
+	int frags;
+	int journal;
+{
+	struct freework *freework;
+
+	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
+	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
+	freework->fw_freeblks = freeblks;
+	freework->fw_parent = parent;
+	freework->fw_lbn = lbn;
+	freework->fw_blkno = nb;
+	freework->fw_frags = frags;
+	freework->fw_ref = 0;
+	freework->fw_off = 0;
+	LIST_INIT(&freework->fw_jwork);
+
+	/* Top level blocks are tracked directly on the freeblks. */
+	if (parent == NULL) {
+		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
+		    &freework->fw_list);
+		freeblks->fb_ref++;
+	}
+	if (journal)
+		newjfreeblk(freeblks, lbn, nb, frags);
+
+	return (freework);
+}
+
+/*
+ * Allocate a new jfreeblk to journal top level block pointer when truncating
+ * a file.  The caller must add this to the worklist when lk is held.
+ */
+static struct jfreeblk *
+newjfreeblk(freeblks, lbn, blkno, frags)
+	struct freeblks *freeblks;
+	ufs_lbn_t lbn;
+	ufs2_daddr_t blkno;
+	int frags;
+{
+	struct jfreeblk *jfreeblk;
+
+	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
+	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
+	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
+	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
+	jfreeblk->jf_ino = freeblks->fb_previousinum;
+	jfreeblk->jf_lbn = lbn;
+	jfreeblk->jf_blkno = blkno;
+	jfreeblk->jf_frags = frags;
+	jfreeblk->jf_freeblks = freeblks;
+	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
+
+	return (jfreeblk);
+}
+
+static void move_newblock_dep(struct jaddref *, struct inodedep *);
+/*
+ * If we're canceling a new bitmap we have to search for another ref
+ * to move into the bmsafemap dep.  This might be better expressed
+ * with another structure.
+ */
+static void
+move_newblock_dep(jaddref, inodedep)
+	struct jaddref *jaddref;
+	struct inodedep *inodedep;
+{
+	struct inoref *inoref;
+	struct jaddref *jaddrefn;
+
+	jaddrefn = NULL;
+	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
+	    inoref = TAILQ_NEXT(inoref, if_deps)) {
+		/*
+		 * NOTE(review): the NEWBLOCK test is against 'jaddref',
+		 * which does not change across the loop, so this selects
+		 * the next journaled addref whenever the canceled jaddref
+		 * carries NEWBLOCK -- confirm this is the intent.
+		 */
+		if ((jaddref->ja_state & NEWBLOCK) &&
+		    inoref->if_list.wk_type == D_JADDREF) {
+			jaddrefn = (struct jaddref *)inoref;
+			break;
+		}
+	}
+	if (jaddrefn == NULL)
+		return;
+	/* Transfer the ATTACHED/UNDONE/NEWBLOCK state to the successor. */
+	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
+	jaddrefn->ja_state |= jaddref->ja_state &
+	    (ATTACHED | UNDONE | NEWBLOCK);
+	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
+	jaddref->ja_state |= ATTACHED;
+	LIST_REMOVE(jaddref, ja_bmdeps);
+	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
+	    ja_bmdeps);
+}
+
+/*
+ * Cancel a jaddref either before it has been written or while it is being
+ * written.  This happens when a link is removed before the add reaches
+ * the disk.  The jaddref dependency is kept linked into the bmsafemap
+ * and inode to prevent the link count or bitmap from reaching the disk
+ * until handle_workitem_remove() re-adjusts the counts and bitmaps as
+ * required.
+ *
+ * Returns 1 if the canceled addref requires journaling of the remove and
+ * 0 otherwise.
+ */
+static int
+cancel_jaddref(jaddref, inodedep, wkhd)
+	struct jaddref *jaddref;
+	struct inodedep *inodedep;
+	struct workhead *wkhd;
+{
+	struct inoref *inoref;
+	struct jsegdep *jsegdep;
+	int needsj;
+
+	KASSERT((jaddref->ja_state & COMPLETE) == 0,
+	    ("cancel_jaddref: Canceling complete jaddref"));
+	/* Once the write has started the remove must be journaled too. */
+	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
+		needsj = 1;
+	else
+		needsj = 0;
+	if (inodedep == NULL)
+		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
+		    0, &inodedep) == 0)
+			panic("cancel_jaddref: Lost inodedep");
+	/*
+	 * We must adjust the nlink of any reference operation that follows
+	 * us so that it is consistent with the in-memory reference.  This
+	 * ensures that inode nlink rollbacks always have the correct link.
+	 */
+	if (needsj == 0)
+		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
+		    inoref = TAILQ_NEXT(inoref, if_deps))
+			inoref->if_nlink--;
+	jsegdep = inoref_jseg(&jaddref->ja_ref);
+	if (jaddref->ja_state & NEWBLOCK)
+		move_newblock_dep(jaddref, inodedep);
+	if (jaddref->ja_state & IOWAITING) {
+		jaddref->ja_state &= ~IOWAITING;
+		wakeup(&jaddref->ja_list);
+	}
+	jaddref->ja_mkdir = NULL;
+	if (jaddref->ja_state & IOSTARTED) {
+		jaddref->ja_state &= ~IOSTARTED;
+		WORKLIST_REMOVE(&jaddref->ja_list);
+		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
+	} else {
+		/* Never written: the jsegdep is not needed. */
+		free_jsegdep(jsegdep);
+		remove_from_journal(&jaddref->ja_list);
+	}
+	/*
+	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
+	 * can arrange for them to be freed with the bitmap.  Otherwise we
+	 * no longer need this addref attached to the inoreflst and it
+	 * will incorrectly adjust nlink if we leave it.
+	 */
+	if ((jaddref->ja_state & NEWBLOCK) == 0) {
+		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
+		    if_deps);
+		jaddref->ja_state |= COMPLETE;
+		free_jaddref(jaddref);
+		return (needsj);
+	}
+	jaddref->ja_state |= GOINGAWAY;
+	/*
+	 * Leave the head of the list for jsegdeps for fast merging.
+	 */
+	if (LIST_FIRST(wkhd) != NULL) {
+		jaddref->ja_state |= ONWORKLIST;
+		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
+	} else
+		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
+
+	return (needsj);
+}
+
+/*
+ * Attempt to free a jaddref structure when some work completes.  This
+ * should only succeed once the entry is written and all dependencies have
+ * been notified.
+ */
+static void
+free_jaddref(jaddref)
+	struct jaddref *jaddref;
+{
+
+	/* Not fully complete yet; a later completion will free it. */
+	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
+		return;
+	if (jaddref->ja_ref.if_jsegdep)
+		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
+		    jaddref, jaddref->ja_state);
+	if (jaddref->ja_state & NEWBLOCK)
+		LIST_REMOVE(jaddref, ja_bmdeps);
+	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
+		panic("free_jaddref: Bad state %p(0x%X)",
+		    jaddref, jaddref->ja_state);
+	if (jaddref->ja_mkdir != NULL)
+		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
+	WORKITEM_FREE(jaddref, D_JADDREF);
+}
+
+/*
+ * Free a jremref structure once it has been written or discarded.
+ */
+static void
+free_jremref(jremref)
+	struct jremref *jremref;
+{
+
+	/* A discarded jremref may still own its jsegdep; release it. */
+	if (jremref->jr_ref.if_jsegdep)
+		free_jsegdep(jremref->jr_ref.if_jsegdep);
+	if (jremref->jr_state & IOSTARTED)
+		panic("free_jremref: IO still pending");
+	WORKITEM_FREE(jremref, D_JREMREF);
+}
+
+/*
+ * Free a jnewblk structure.  Only succeeds once the record is written
+ * and all dependencies have been notified (ALLCOMPLETE).
+ */
+static void
+free_jnewblk(jnewblk)
+	struct jnewblk *jnewblk;
+{
+
+	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
+		return;
+	LIST_REMOVE(jnewblk, jn_deps);
+	if (jnewblk->jn_newblk != NULL)
+		panic("free_jnewblk: Dependency still attached.");
+	WORKITEM_FREE(jnewblk, D_JNEWBLK);
+}
+
+/*
+ * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
+ * is kept linked into the bmsafemap until the free completes, thus
+ * preventing the modified state from ever reaching disk.  The free
+ * routine must pass this structure via ffs_blkfree() to
+ * softdep_setup_freeblks() so there is no race in releasing the space.
+ */
+static void
+cancel_jnewblk(jnewblk, wkhd)
+	struct jnewblk *jnewblk;
+	struct workhead *wkhd;
+{
+	struct jsegdep *jsegdep;
+
+	/* The allocation will never hit disk; drop its segment dep. */
+	jsegdep = jnewblk->jn_jsegdep;
+	jnewblk->jn_jsegdep  = NULL;
+	free_jsegdep(jsegdep);
+	jnewblk->jn_newblk = NULL;
+	jnewblk->jn_state |= GOINGAWAY;
+	if (jnewblk->jn_state & IOSTARTED) {
+		jnewblk->jn_state &= ~IOSTARTED;
+		WORKLIST_REMOVE(&jnewblk->jn_list);
+	} else
+		remove_from_journal(&jnewblk->jn_list);
+	/*
+	 * Leave the head of the list for jsegdeps for fast merging.
+	 */
+	if (LIST_FIRST(wkhd) != NULL) {
+		jnewblk->jn_state |= ONWORKLIST;
+		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
+	} else
+		WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
+	if (jnewblk->jn_state & IOWAITING) {
+		jnewblk->jn_state &= ~IOWAITING;
+		wakeup(&jnewblk->jn_list);
+	}
+}
+
+/*
+ * Release a jfreeblk once its journal write has completed and the
+ * jsegdep has been handed off.
+ */
+static void
+free_jfreeblk(jfreeblk)
+	struct jfreeblk *jfreeblk;
+{
+
+	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
+}
+
+/*
+ * Release one reference to a jseg and free it if the count reaches 0.  This
+ * should eventually reclaim journal space as well.
+ */
+static void
+free_jseg(jseg)
+	struct jseg *jseg;
+{
+	struct jblocks *jblocks;
+
+	KASSERT(jseg->js_refs > 0,
+	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
+	if (--jseg->js_refs != 0)
+		return;
+	/*
+	 * Free only those jsegs which have none allocated before them to
+	 * preserve the journal space ordering.
+	 */
+	jblocks = jseg->js_jblocks;
+	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
+		jblocks->jb_oldestseq = jseg->js_seq;
+		if (jseg->js_refs != 0)
+			break;
+		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
+		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
+		KASSERT(LIST_EMPTY(&jseg->js_entries),
+		    ("free_jseg: Freed jseg has valid entries."));
+		WORKITEM_FREE(jseg, D_JSEG);
+	}
+}
+
+/*
+ * Release a jsegdep and decrement the jseg count.
+ */
+static void
+free_jsegdep(jsegdep)
+	struct jsegdep *jsegdep;
+{
+
+	/* jd_seg is NULL until the record has been written. */
+	if (jsegdep->jd_seg)
+		free_jseg(jsegdep->jd_seg);
+	WORKITEM_FREE(jsegdep, D_JSEGDEP);
+}
+
+/*
+ * Wait for a journal item to make it to disk.  Initiate journal processing
+ * if required.  Called with the softdep lock held; may sleep.
+ */
+static void
+jwait(wk)
+	struct worklist *wk;
+{
+
+	stat_journal_wait++;
+	/*
+	 * If IO has not started we process the journal.  We can't mark the
+	 * worklist item as IOWAITING because we drop the lock while
+	 * processing the journal and the worklist entry may be freed after
+	 * this point.  The caller may call back in and re-issue the request.
+	 */
+	if ((wk->wk_state & IOSTARTED) == 0) {
+		softdep_process_journal(wk->wk_mp, MNT_WAIT);
+		return;
+	}
+	wk->wk_state |= IOWAITING;
+	msleep(wk, &lk, PRIBIO, "jwait", 0);
+}
+
+/*
+ * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
+ * appropriate.  This is a convenience function to reduce duplicate code
+ * for the setup and revert functions below.
+ */
+static struct inodedep *
+inodedep_lookup_ip(ip)
+	struct inode *ip;
+{
+	struct inodedep *inodedep;
+
+	KASSERT(ip->i_nlink >= ip->i_effnlink,
+	    ("inodedep_lookup_ip: bad delta"));
+	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
+	    DEPALLOC, &inodedep);
+	/* Record how far the on-disk link count is ahead of reality. */
+	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+
+	return (inodedep);
+}
+
+/*
+ * Create a journal entry that describes a truncate that we're about to
+ * perform. The inode allocations and frees between here and the completion
+ * of the operation are done asynchronously and without journaling. At
+ * the end of the operation the vnode is sync'd and the journal space
+ * is released. Recovery will discover the partially completed truncate
+ * and complete it.
+ */
+void *
+softdep_setup_trunc(vp, length, flags)
+ struct vnode *vp;
+ off_t length;
+ int flags;
+{
+ struct jsegdep *jsegdep;
+ struct jtrunc *jtrunc;
+ struct ufsmount *ump;
+ struct inode *ip;
+
+ softdep_prealloc(vp, MNT_WAIT);
+ ip = VTOI(vp);
+ ump = VFSTOUFS(vp->v_mount);
+ jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
+ jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
+ jtrunc->jt_ino = ip->i_number;
+ jtrunc->jt_extsize = 0;
+ jtrunc->jt_size = length;
+ /* Record the ext-area size only for UFS2 when IO_EXT is not set. */
+ if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
+ jtrunc->jt_extsize = ip->i_din2->di_extsize;
+ /* Without IO_NORMAL the data size is left at its current value. */
+ if ((flags & IO_NORMAL) == 0)
+ jtrunc->jt_size = DIP(ip, i_size);
+ ACQUIRE_LOCK(&lk);
+ add_to_journal(&jtrunc->jt_list);
+ /* Wait until the jtrunc has been assigned to a journal segment. */
+ while (jsegdep->jd_seg == NULL) {
+ stat_jwait_freeblks++;
+ jwait(&jtrunc->jt_list);
+ }
+ FREE_LOCK(&lk);
+
+ /* Returned cookie is released by softdep_complete_trunc(). */
+ return (jsegdep);
+}
+
+/*
+ * After synchronous truncation is complete we free sync the vnode and
+ * release the jsegdep so the journal space can be freed.
+ */
+int
+softdep_complete_trunc(vp, cookie)
+ struct vnode *vp;
+ void *cookie;
+{
+ int error;
+
+ error = ffs_syncvnode(vp, MNT_WAIT);
+ ACQUIRE_LOCK(&lk);
+ /* cookie is the jsegdep returned by softdep_setup_trunc(). */
+ free_jsegdep((struct jsegdep *)cookie);
+ FREE_LOCK(&lk);
+
+ return (error);
+}
+
+/*
+ * Called prior to creating a new inode and linking it to a directory. The
+ * jaddref structure must already be allocated by softdep_setup_inomapdep
+ * and it is discovered here so we can initialize the mode and update
+ * nlinkdelta.
+ */
+void
+softdep_setup_create(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ KASSERT(ip->i_nlink == 1,
+ ("softdep_setup_create: Invalid link count."));
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ /*
+ * The jaddref was created by softdep_setup_inomapdep; it is
+ * the most recent reference, so fill in the mode now.
+ */
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_create: No addref structure present."));
+ jaddref->ja_mode = ip->i_mode;
+ }
+ softdep_prelink(dvp, NULL);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Create a jaddref structure to track the addition of a DOTDOT link when
+ * we are reparenting an inode as part of a rename. This jaddref will be
+ * found by softdep_setup_directory_change. Adjusts nlinkdelta for
+ * non-journaling softdep.
+ */
+void
+softdep_setup_dotdot_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+ struct vnode *vp;
+
+ dvp = ITOV(dp);
+ vp = ITOV(ip);
+ jaddref = NULL;
+ /*
+ * We don't set MKDIR_PARENT as this is not tied to a mkdir and
+ * is used as a normal link would be.
+ */
+ /* Allocate the journal record before taking the softdep lock. */
+ if (DOINGSUJ(dvp))
+ jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
+ dp->i_effnlink - 1, dp->i_mode);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(dp);
+ if (jaddref)
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Create a jaddref structure to track a new link to an inode. The directory
+ * offset is not known until softdep_setup_directory_add or
+ * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling
+ * softdep.
+ */
+void
+softdep_setup_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ jaddref = NULL;
+ /* Allocate the journal record before taking the softdep lock. */
+ if (DOINGSUJ(dvp))
+ jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
+ ip->i_mode);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (jaddref)
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to create the jaddref structures to track . and .. references as
+ * well as lookup and further initialize the incomplete jaddref created
+ * by softdep_setup_inomapdep when the inode was allocated. Adjusts
+ * nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_setup_mkdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *dotdotaddref;
+ struct jaddref *dotaddref;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ dotaddref = dotdotaddref = NULL;
+ /* Allocate "." and ".." journal records before taking the lock. */
+ if (DOINGSUJ(dvp)) {
+ dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
+ ip->i_mode);
+ dotaddref->ja_state |= MKDIR_BODY;
+ dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
+ dp->i_effnlink - 1, dp->i_mode);
+ dotdotaddref->ja_state |= MKDIR_PARENT;
+ }
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ /*
+ * The incomplete jaddref from softdep_setup_inomapdep is the
+ * last reference on the new directory; finish it and insert
+ * the "." reference ahead of it.
+ */
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL,
+ ("softdep_setup_mkdir: No addref structure present."));
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_mkdir: bad parent %d",
+ jaddref->ja_parent));
+ jaddref->ja_mode = ip->i_mode;
+ TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
+ if_deps);
+ }
+ /* The ".." reference is tracked on the parent's inodedep. */
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp))
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
+ &dotdotaddref->ja_ref, if_deps);
+ softdep_prelink(ITOV(dp), NULL);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to track nlinkdelta of the inode and parent directories prior to
+ * unlinking a directory.
+ */
+void
+softdep_setup_rmdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ /* Refresh nlinkdelta on both the directory and its parent. */
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to track nlinkdelta of the inode and parent directories prior to
+ * unlink.
+ */
+void
+softdep_setup_unlink(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ /* Refresh nlinkdelta on both the file and its parent directory. */
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed non-directory
+ * creation. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_create(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ /* Cancel the most recent (unwritten) reference addition. */
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_create: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed dotdot link
+ * creation. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_dotdot_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ /* The dotdot addref lives on the parent (dp) inodedep. */
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_dotdot_link: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed link
+ * addition. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ /* Cancel the addref created by softdep_setup_link(). */
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_link: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed mkdir
+ * attempt. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_mkdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+
+ ACQUIRE_LOCK(&lk);
+ /* First cancel the ".." addref recorded on the parent. */
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_mkdir: dotdot addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ /* Then cancel the name addref and the "." addref on the child. */
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_mkdir: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_mkdir: dot addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to correct nlinkdelta after a failed rmdir.
+ */
+void
+softdep_revert_rmdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+
+ ACQUIRE_LOCK(&lk);
+ /* Re-sync nlinkdelta for both inodes after the failed rmdir. */
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ FREE_LOCK(&lk);
+}
+
/*
* Protecting the freemaps (or bitmaps).
*
@@ -1536,6 +3970,22 @@ softdep_setup_inomapdep(bp, ip, newinum)
{
struct inodedep *inodedep;
struct bmsafemap *bmsafemap;
+ struct jaddref *jaddref;
+ struct mount *mp;
+ struct fs *fs;
+
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_ump->um_fs;
+ jaddref = NULL;
+
+ /*
+ * Allocate the journal reference add structure so that the bitmap
+ * can be dependent on it.
+ */
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = newjaddref(ip, newinum, 0, 0, 0);
+ jaddref->ja_state |= NEWBLOCK;
+ }
/*
* Create a dependency for the newly allocated inode.
@@ -1544,14 +3994,20 @@ softdep_setup_inomapdep(bp, ip, newinum)
* the cylinder group map from which it was allocated.
*/
ACQUIRE_LOCK(&lk);
- if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
- &inodedep)))
- panic("softdep_setup_inomapdep: dependency for new inode "
- "already exists");
- inodedep->id_buf = bp;
+ if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
+ panic("softdep_setup_inomapdep: dependency %p for new "
+ "inode already exists", inodedep);
+ bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
+ if (jaddref) {
+ LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ } else {
+ inodedep->id_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
+ }
+ inodedep->id_bmsafemap = bmsafemap;
inodedep->id_state &= ~DEPCOMPLETE;
- bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
- LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
FREE_LOCK(&lk);
}
@@ -1560,29 +4016,98 @@ softdep_setup_inomapdep(bp, ip, newinum)
* allocate block or fragment.
*/
void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
struct buf *bp; /* buffer for cylgroup block with block map */
struct mount *mp; /* filesystem doing allocation */
ufs2_daddr_t newblkno; /* number of newly allocated block */
+ int frags; /* Number of fragments. */
+ int oldfrags; /* Previous number of fragments for extend. */
{
struct newblk *newblk;
struct bmsafemap *bmsafemap;
+ struct jnewblk *jnewblk;
struct fs *fs;
fs = VFSTOUFS(mp)->um_fs;
+ jnewblk = NULL;
/*
* Create a dependency for the newly allocated block.
* Add it to the dependency list for the buffer holding
* the cylinder group map from which it was allocated.
*/
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
+ jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
+ jnewblk->jn_state = ATTACHED;
+ jnewblk->jn_blkno = newblkno;
+ jnewblk->jn_frags = frags;
+ jnewblk->jn_oldfrags = oldfrags;
+#ifdef SUJ_DEBUG
+ {
+ struct cg *cgp;
+ uint8_t *blksfree;
+ long bno;
+ int i;
+
+ cgp = (struct cg *)bp->b_data;
+ blksfree = cg_blksfree(cgp);
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (isset(blksfree, bno + i))
+ panic("softdep_setup_blkmapdep: "
+ "free fragment %d from %d-%d "
+ "state 0x%X dep %p", i,
+ jnewblk->jn_oldfrags,
+ jnewblk->jn_frags,
+ jnewblk->jn_state,
+ jnewblk->jn_newblk);
+ }
+ }
+#endif
+ }
ACQUIRE_LOCK(&lk);
- if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
+ if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
panic("softdep_setup_blkmapdep: found block");
- newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
- LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
+ dtog(fs, newblkno));
+ if (jnewblk) {
+ jnewblk->jn_newblk = newblk;
+ LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
+ } else {
+ newblk->nb_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ }
+ newblk->nb_bmsafemap = bmsafemap;
+ newblk->nb_jnewblk = jnewblk;
FREE_LOCK(&lk);
}
+#define BMSAFEMAP_HASH(fs, cg) \
+ (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
+
+/*
+ * Search a bmsafemap hash chain for an entry matching (mp, cg).
+ * On success store the entry in *bmsafemapp and return 1; otherwise
+ * store NULL and return 0.
+ */
+static int
+bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
+ struct bmsafemap_hashhead *bmsafemaphd;
+ struct mount *mp;
+ int cg;
+ struct bmsafemap **bmsafemapp;
+{
+ struct bmsafemap *bmsafemap;
+
+ LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
+ if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
+ break;
+ if (bmsafemap) {
+ *bmsafemapp = bmsafemap;
+ return (1);
+ }
+ *bmsafemapp = NULL;
+
+ return (0);
+}
+
/*
* Find the bmsafemap associated with a cylinder group buffer.
* If none exists, create one. The buffer must be locked when
@@ -1590,27 +4115,43 @@ softdep_setup_blkmapdep(bp, mp, newblkno)
* splbio interrupts blocked.
*/
static struct bmsafemap *
-bmsafemap_lookup(mp, bp)
+bmsafemap_lookup(mp, bp, cg)
struct mount *mp;
struct buf *bp;
+ int cg;
{
- struct bmsafemap *bmsafemap;
+ struct bmsafemap_hashhead *bmsafemaphd;
+ struct bmsafemap *bmsafemap, *collision;
struct worklist *wk;
+ struct fs *fs;
mtx_assert(&lk, MA_OWNED);
- LIST_FOREACH(wk, &bp->b_dep, wk_list)
- if (wk->wk_type == D_BMSAFEMAP)
- return (WK_BMSAFEMAP(wk));
+ if (bp)
+ LIST_FOREACH(wk, &bp->b_dep, wk_list)
+ if (wk->wk_type == D_BMSAFEMAP)
+ return (WK_BMSAFEMAP(wk));
+ fs = VFSTOUFS(mp)->um_fs;
+ bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
+ if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
+ return (bmsafemap);
FREE_LOCK(&lk);
bmsafemap = malloc(sizeof(struct bmsafemap),
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
bmsafemap->sm_buf = bp;
- LIST_INIT(&bmsafemap->sm_allocdirecthd);
- LIST_INIT(&bmsafemap->sm_allocindirhd);
LIST_INIT(&bmsafemap->sm_inodedephd);
+ LIST_INIT(&bmsafemap->sm_inodedepwr);
LIST_INIT(&bmsafemap->sm_newblkhd);
+ LIST_INIT(&bmsafemap->sm_newblkwr);
+ LIST_INIT(&bmsafemap->sm_jaddrefhd);
+ LIST_INIT(&bmsafemap->sm_jnewblkhd);
ACQUIRE_LOCK(&lk);
+ if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
+ WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ return (collision);
+ }
+ bmsafemap->sm_cg = cg;
+ LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
return (bmsafemap);
}
@@ -1645,9 +4186,9 @@ bmsafemap_lookup(mp, bp)
* unreferenced fragments.
*/
void
-softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
+softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip; /* inode to which block is being added */
- ufs_lbn_t lbn; /* block pointer within inode */
+ ufs_lbn_t off; /* block pointer within inode */
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
long newsize; /* size of new block */
@@ -1656,34 +4197,33 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
- struct bmsafemap *bmsafemap;
+ struct freefrag *freefrag;
struct inodedep *inodedep;
struct pagedep *pagedep;
+ struct jnewblk *jnewblk;
struct newblk *newblk;
struct mount *mp;
+ ufs_lbn_t lbn;
+ lbn = bp->b_lblkno;
mp = UFSTOVFS(ip->i_ump);
- adp = malloc(sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
- adp->ad_lbn = lbn;
- adp->ad_newblkno = newblkno;
- adp->ad_oldblkno = oldblkno;
- adp->ad_newsize = newsize;
- adp->ad_oldsize = oldsize;
- adp->ad_state = ATTACHED;
- LIST_INIT(&adp->ad_newdirblk);
- if (newblkno == oldblkno)
- adp->ad_freefrag = NULL;
+ if (oldblkno && oldblkno != newblkno)
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
else
- adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
+ freefrag = NULL;
ACQUIRE_LOCK(&lk);
- if (lbn >= NDADDR) {
+ if (off >= NDADDR) {
+ if (lbn > 0)
+ panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
+ lbn, off);
/* allocating an indirect block */
if (oldblkno != 0)
panic("softdep_setup_allocdirect: non-zero indir");
} else {
+ if (off != lbn)
+ panic("softdep_setup_allocdirect: lbn %jd != off %jd",
+ lbn, off);
/*
* Allocating a direct block.
*
@@ -1692,26 +4232,39 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
- pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
+ pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
+ &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
}
- if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
+ if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocdirect: lost block");
- if (newblk->nb_state == DEPCOMPLETE) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- adp->ad_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("softdep_setup_allocdirect: newblk already initialized"));
+ /*
+ * Convert the newblk to an allocdirect.
+ */
+ newblk->nb_list.wk_type = D_ALLOCDIRECT;
+ adp = (struct allocdirect *)newblk;
+ newblk->nb_freefrag = freefrag;
+ adp->ad_offset = off;
+ adp->ad_oldblkno = oldblkno;
+ adp->ad_newsize = newsize;
+ adp->ad_oldsize = oldsize;
+ /*
+ * Finish initializing the journal.
+ */
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
- WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
+
+ WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
@@ -1726,24 +4279,25 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
*/
adphead = &inodedep->id_newinoupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
- if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
+ if (oldadp == NULL || oldadp->ad_offset <= off) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
- if (oldadp != NULL && oldadp->ad_lbn == lbn)
+ if (oldadp != NULL && oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
- if (oldadp->ad_lbn >= lbn)
+ if (oldadp->ad_offset >= off)
break;
}
if (oldadp == NULL)
panic("softdep_setup_allocdirect: lost entry");
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
- if (oldadp->ad_lbn == lbn)
+ if (oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
+
FREE_LOCK(&lk);
}
@@ -1761,10 +4315,11 @@ allocdirect_merge(adphead, newadp, oldadp)
struct freefrag *freefrag;
struct newdirblk *newdirblk;
+ freefrag = NULL;
mtx_assert(&lk, MA_OWNED);
if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
newadp->ad_oldsize != oldadp->ad_newsize ||
- newadp->ad_lbn >= NDADDR)
+ newadp->ad_offset >= NDADDR)
panic("%s %jd != new %jd || old size %ld != new %ld",
"allocdirect_merge: old blkno",
(intmax_t)newadp->ad_oldblkno,
@@ -1779,7 +4334,7 @@ allocdirect_merge(adphead, newadp, oldadp)
* This action is done by swapping the freefrag dependencies.
* The new dependency gains the old one's freefrag, and the
* old one gets the new one and then immediately puts it on
- * the worklist when it is freed by free_allocdirect. It is
+ * the worklist when it is freed by free_newblk. It is
* not possible to do this swap when the old dependency had a
* non-zero size but no previous fragment to free. This condition
* arises when the new block is an extension of the old block.
@@ -1788,8 +4343,8 @@ allocdirect_merge(adphead, newadp, oldadp)
* the old dependency, so cannot legitimately be freed until the
* conditions for the new dependency are fulfilled.
*/
+ freefrag = newadp->ad_freefrag;
if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
- freefrag = newadp->ad_freefrag;
newadp->ad_freefrag = oldadp->ad_freefrag;
oldadp->ad_freefrag = freefrag;
}
@@ -1804,32 +4359,118 @@ allocdirect_merge(adphead, newadp, oldadp)
panic("allocdirect_merge: extra newdirblk");
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
}
- free_allocdirect(adphead, oldadp, 0);
+ TAILQ_REMOVE(adphead, oldadp, ad_next);
+ /*
+ * We need to move any journal dependencies over to the freefrag
+ * that releases this block if it exists. Otherwise we are
+ * extending an existing block and we'll wait until that is
+ * complete to release the journal space and extend the
+ * new journal to cover this old space as well.
+ */
+ if (freefrag == NULL) {
+ struct jnewblk *jnewblk;
+ struct jnewblk *njnewblk;
+
+ if (oldadp->ad_newblkno != newadp->ad_newblkno)
+ panic("allocdirect_merge: %jd != %jd",
+ oldadp->ad_newblkno, newadp->ad_newblkno);
+ jnewblk = oldadp->ad_block.nb_jnewblk;
+ cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
+ /*
+ * We have an unwritten jnewblk, we need to merge the
+ * frag bits with our own. The newer adp's journal can not
+ * be written prior to the old one so no need to check for
+ * it here.
+ */
+ if (jnewblk) {
+ njnewblk = newadp->ad_block.nb_jnewblk;
+ if (njnewblk == NULL)
+ panic("allocdirect_merge: No jnewblk");
+ if (jnewblk->jn_state & UNDONE) {
+ njnewblk->jn_state |= UNDONE | NEWBLOCK;
+ njnewblk->jn_state &= ~ATTACHED;
+ jnewblk->jn_state &= ~UNDONE;
+ }
+ njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
+ WORKLIST_REMOVE(&jnewblk->jn_list);
+ jnewblk->jn_state |= ATTACHED | COMPLETE;
+ free_jnewblk(jnewblk);
+ }
+ } else {
+ /*
+ * We can skip journaling for this freefrag and just complete
+ * any pending journal work for the allocdirect that is being
+ * removed after the freefrag completes.
+ */
+ if (freefrag->ff_jfreefrag)
+ cancel_jfreefrag(freefrag->ff_jfreefrag);
+ cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
+ }
+ free_newblk(&oldadp->ad_block);
}
-
+
/*
- * Allocate a new freefrag structure if needed.
+ * Allocate a jfreefrag structure to journal a single block free.
+ */
+static struct jfreefrag *
+newjfreefrag(freefrag, ip, blkno, size, lbn)
+ struct freefrag *freefrag;
+ struct inode *ip;
+ ufs2_daddr_t blkno;
+ long size;
+ ufs_lbn_t lbn;
+{
+ struct jfreefrag *jfreefrag;
+ struct fs *fs;
+
+ fs = ip->i_fs;
+ jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
+ M_SOFTDEP_FLAGS);
+ workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
+ jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
+ jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
+ jfreefrag->fr_ino = ip->i_number;
+ jfreefrag->fr_lbn = lbn;
+ jfreefrag->fr_blkno = blkno;
+ /* Record the free in fragment units, not bytes. */
+ jfreefrag->fr_frags = numfrags(fs, size);
+ jfreefrag->fr_freefrag = freefrag;
+
+ return (jfreefrag);
+}
+
+/*
+ * Allocate a new freefrag structure.
*/
static struct freefrag *
-newfreefrag(ip, blkno, size)
+newfreefrag(ip, blkno, size, lbn)
struct inode *ip;
ufs2_daddr_t blkno;
long size;
+ ufs_lbn_t lbn;
{
struct freefrag *freefrag;
struct fs *fs;
- if (blkno == 0)
- return (NULL);
fs = ip->i_fs;
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
freefrag = malloc(sizeof(struct freefrag),
- M_FREEFRAG, M_SOFTDEP_FLAGS);
+ M_FREEFRAG, M_SOFTDEP_FLAGS);
workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
+ freefrag->ff_state = ATTACHED;
+ LIST_INIT(&freefrag->ff_jwork);
freefrag->ff_inum = ip->i_number;
freefrag->ff_blkno = blkno;
freefrag->ff_fragsize = size;
+
+ if (fs->fs_flags & FS_SUJ) {
+ freefrag->ff_jfreefrag =
+ newjfreefrag(freefrag, ip, blkno, size, lbn);
+ } else {
+ freefrag->ff_state |= DEPCOMPLETE;
+ freefrag->ff_jfreefrag = NULL;
+ }
+
return (freefrag);
}
@@ -1842,9 +4483,17 @@ handle_workitem_freefrag(freefrag)
struct freefrag *freefrag;
{
struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
+ struct workhead wkhd;
+ /*
+ * It would be illegal to add new completion items to the
+ * freefrag after it was schedule to be done so it must be
+ * safe to modify the list head here.
+ */
+ LIST_INIT(&wkhd);
+ LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
- freefrag->ff_fragsize, freefrag->ff_inum);
+ freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
ACQUIRE_LOCK(&lk);
WORKITEM_FREE(freefrag, D_FREEFRAG);
FREE_LOCK(&lk);
@@ -1856,9 +4505,9 @@ handle_workitem_freefrag(freefrag)
* See the description of softdep_setup_allocdirect above for details.
*/
void
-softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
+softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip;
- ufs_lbn_t lbn;
+ ufs_lbn_t off;
ufs2_daddr_t newblkno;
ufs2_daddr_t oldblkno;
long newsize;
@@ -1867,50 +4516,55 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
- struct bmsafemap *bmsafemap;
+ struct freefrag *freefrag;
struct inodedep *inodedep;
+ struct jnewblk *jnewblk;
struct newblk *newblk;
struct mount *mp;
+ ufs_lbn_t lbn;
+
+ if (off >= NXADDR)
+ panic("softdep_setup_allocext: lbn %lld > NXADDR",
+ (long long)off);
+ lbn = bp->b_lblkno;
mp = UFSTOVFS(ip->i_ump);
- adp = malloc(sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
- adp->ad_lbn = lbn;
- adp->ad_newblkno = newblkno;
- adp->ad_oldblkno = oldblkno;
- adp->ad_newsize = newsize;
- adp->ad_oldsize = oldsize;
- adp->ad_state = ATTACHED | EXTDATA;
- LIST_INIT(&adp->ad_newdirblk);
- if (newblkno == oldblkno)
- adp->ad_freefrag = NULL;
+ if (oldblkno && oldblkno != newblkno)
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
else
- adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
+ freefrag = NULL;
ACQUIRE_LOCK(&lk);
- if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
+ if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocext: lost block");
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("softdep_setup_allocext: newblk already initialized"));
+ /*
+ * Convert the newblk to an allocdirect.
+ */
+ newblk->nb_list.wk_type = D_ALLOCDIRECT;
+ adp = (struct allocdirect *)newblk;
+ newblk->nb_freefrag = freefrag;
+ adp->ad_offset = off;
+ adp->ad_oldblkno = oldblkno;
+ adp->ad_newsize = newsize;
+ adp->ad_oldsize = oldsize;
+ adp->ad_state |= EXTDATA;
+ /*
+ * Finish initializing the journal.
+ */
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
- if (newblk->nb_state == DEPCOMPLETE) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- adp->ad_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
-
- WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
- if (lbn >= NXADDR)
- panic("softdep_setup_allocext: lbn %lld > NXADDR",
- (long long)lbn);
+ WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
@@ -1925,23 +4579,23 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
*/
adphead = &inodedep->id_newextupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
- if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
+ if (oldadp == NULL || oldadp->ad_offset <= off) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
- if (oldadp != NULL && oldadp->ad_lbn == lbn)
+ if (oldadp != NULL && oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
- if (oldadp->ad_lbn >= lbn)
+ if (oldadp->ad_offset >= off)
break;
}
if (oldadp == NULL)
panic("softdep_setup_allocext: lost entry");
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
- if (oldadp->ad_lbn == lbn)
+ if (oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
}
@@ -1975,22 +4629,39 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
* Allocate a new allocindir structure.
*/
static struct allocindir *
-newallocindir(ip, ptrno, newblkno, oldblkno)
+newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
struct inode *ip; /* inode for file being extended */
int ptrno; /* offset of pointer in indirect block */
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
+ ufs_lbn_t lbn;
{
+ struct newblk *newblk;
struct allocindir *aip;
+ struct freefrag *freefrag;
+ struct jnewblk *jnewblk;
- aip = malloc(sizeof(struct allocindir),
- M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
- aip->ai_state = ATTACHED;
+ if (oldblkno)
+ freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
+ else
+ freefrag = NULL;
+ ACQUIRE_LOCK(&lk);
+ if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
+ panic("new_allocindir: lost block");
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("newallocindir: newblk already initialized"));
+ newblk->nb_list.wk_type = D_ALLOCINDIR;
+ newblk->nb_freefrag = freefrag;
+ aip = (struct allocindir *)newblk;
aip->ai_offset = ptrno;
- aip->ai_newblkno = newblkno;
aip->ai_oldblkno = oldblkno;
- aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
return (aip);
}
@@ -2008,22 +4679,28 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
struct buf *nbp; /* buffer holding allocated page */
{
+ struct inodedep *inodedep;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct mount *mp;
+ if (lbn != nbp->b_lblkno)
+ panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
+ lbn, bp->b_lblkno);
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
- aip = newallocindir(ip, ptrno, newblkno, oldblkno);
- ACQUIRE_LOCK(&lk);
+ mp = UFSTOVFS(ip->i_ump);
+ aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
+ (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
/*
* If we are allocating a directory page, then we must
* allocate an associated pagedep to track additions and
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
- pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
+ pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
- WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
- setup_allocindir_phase2(bp, ip, aip);
+ WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
+ setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
FREE_LOCK(&lk);
}
@@ -2039,38 +4716,68 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
int ptrno; /* offset of pointer in indirect block */
ufs2_daddr_t newblkno; /* disk block number being added */
{
+ struct inodedep *inodedep;
struct allocindir *aip;
+ ufs_lbn_t lbn;
+ lbn = nbp->b_lblkno;
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
- aip = newallocindir(ip, ptrno, newblkno, 0);
- ACQUIRE_LOCK(&lk);
- WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
- setup_allocindir_phase2(bp, ip, aip);
+ aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
+ inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
+ WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
+ setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
FREE_LOCK(&lk);
}
+static void
+indirdep_complete(indirdep)
+ struct indirdep *indirdep;
+{
+ struct allocindir *aip;
+
+ LIST_REMOVE(indirdep, ir_next);
+ indirdep->ir_state &= ~ONDEPLIST;
+
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
+ LIST_REMOVE(aip, ai_next);
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * If this indirdep is not attached to a buf it was simply waiting
+ * on completion to clear completehd. free_indirdep() asserts
+ * that nothing is dangling.
+ */
+ if ((indirdep->ir_state & ONWORKLIST) == 0)
+ free_indirdep(indirdep);
+}
+
/*
* Called to finish the allocation of the "aip" allocated
* by one of the two routines above.
*/
static void
-setup_allocindir_phase2(bp, ip, aip)
+setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
struct buf *bp; /* in-memory copy of the indirect block */
struct inode *ip; /* inode for file being extended */
+ struct inodedep *inodedep; /* Inodedep for ip */
struct allocindir *aip; /* allocindir allocated by the above routines */
+ ufs_lbn_t lbn; /* Logical block number for this block. */
{
struct worklist *wk;
+ struct fs *fs;
+ struct newblk *newblk;
struct indirdep *indirdep, *newindirdep;
- struct bmsafemap *bmsafemap;
struct allocindir *oldaip;
struct freefrag *freefrag;
- struct newblk *newblk;
+ struct mount *mp;
ufs2_daddr_t blkno;
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_fs;
mtx_assert(&lk, MA_OWNED);
if (bp->b_lblkno >= 0)
panic("setup_allocindir_phase2: not indir blk");
- for (indirdep = NULL, newindirdep = NULL; ; ) {
+ for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
if (wk->wk_type != D_INDIRDEP)
continue;
@@ -2079,49 +4786,41 @@ setup_allocindir_phase2(bp, ip, aip)
}
if (indirdep == NULL && newindirdep) {
indirdep = newindirdep;
- WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
newindirdep = NULL;
+ WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
+ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
+ &newblk)) {
+ indirdep->ir_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&newblk->nb_indirdeps,
+ indirdep, ir_next);
+ } else
+ indirdep->ir_state |= DEPCOMPLETE;
}
if (indirdep) {
- if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
- &newblk) == 0)
- panic("setup_allocindir: lost block");
- if (newblk->nb_state == DEPCOMPLETE) {
- aip->ai_state |= DEPCOMPLETE;
- aip->ai_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- aip->ai_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
- aip, ai_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
aip->ai_indirdep = indirdep;
/*
* Check to see if there is an existing dependency
* for this block. If there is, merge the old
- * dependency into the new one.
+ * dependency into the new one. This happens
+ * as a result of reallocblk only.
*/
if (aip->ai_oldblkno == 0)
oldaip = NULL;
else
- LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
+ LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
+ ai_next)
if (oldaip->ai_offset == aip->ai_offset)
break;
- freefrag = NULL;
- if (oldaip != NULL) {
- if (oldaip->ai_newblkno != aip->ai_oldblkno)
- panic("setup_allocindir_phase2: blkno");
- aip->ai_oldblkno = oldaip->ai_oldblkno;
- freefrag = aip->ai_freefrag;
- aip->ai_freefrag = oldaip->ai_freefrag;
- oldaip->ai_freefrag = NULL;
- free_allocindir(oldaip, NULL);
- }
+ if (oldaip != NULL)
+ freefrag = allocindir_merge(aip, oldaip);
LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
+ KASSERT(aip->ai_offset >= 0 &&
+ aip->ai_offset < NINDIR(ip->i_ump->um_fs),
+ ("setup_allocindir_phase2: Bad offset %d",
+ aip->ai_offset));
+ KASSERT(indirdep->ir_savebp != NULL,
+ ("setup_allocindir_phase2 NULL ir_savebp"));
if (ip->i_ump->um_fstype == UFS1)
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
[aip->ai_offset] = aip->ai_oldblkno;
@@ -2148,13 +4847,16 @@ setup_allocindir_phase2(bp, ip, aip)
}
newindirdep = malloc(sizeof(struct indirdep),
M_INDIRDEP, M_SOFTDEP_FLAGS);
- workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
- UFSTOVFS(ip->i_ump));
+ workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
newindirdep->ir_state = ATTACHED;
if (ip->i_ump->um_fstype == UFS1)
newindirdep->ir_state |= UFS1FMT;
+ newindirdep->ir_saveddata = NULL;
LIST_INIT(&newindirdep->ir_deplisthd);
LIST_INIT(&newindirdep->ir_donehd);
+ LIST_INIT(&newindirdep->ir_writehd);
+ LIST_INIT(&newindirdep->ir_completehd);
+ LIST_INIT(&newindirdep->ir_jwork);
if (bp->b_blkno == bp->b_lblkno) {
ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
NULL, NULL);
@@ -2169,6 +4871,51 @@ setup_allocindir_phase2(bp, ip, aip)
}
/*
+ * Merge two allocindirs which refer to the same block. Move newblock
+ * dependencies and setup the freefrags appropriately.
+ */
+static struct freefrag *
+allocindir_merge(aip, oldaip)
+ struct allocindir *aip;
+ struct allocindir *oldaip;
+{
+ struct newdirblk *newdirblk;
+ struct freefrag *freefrag;
+ struct worklist *wk;
+
+ if (oldaip->ai_newblkno != aip->ai_oldblkno)
+ panic("allocindir_merge: blkno");
+ aip->ai_oldblkno = oldaip->ai_oldblkno;
+ freefrag = aip->ai_freefrag;
+ aip->ai_freefrag = oldaip->ai_freefrag;
+ oldaip->ai_freefrag = NULL;
+ KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
+ /*
+ * If we are tracking a new directory-block allocation,
+ * move it from the old allocindir to the new allocindir.
+ */
+ if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
+ newdirblk = WK_NEWDIRBLK(wk);
+ WORKLIST_REMOVE(&newdirblk->db_list);
+ if (!LIST_EMPTY(&oldaip->ai_newdirblk))
+ panic("allocindir_merge: extra newdirblk");
+ WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
+ }
+ /*
+ * We can skip journaling for this freefrag and just complete
+ * any pending journal work for the allocindir that is being
+ * removed after the freefrag completes.
+ */
+ if (freefrag->ff_jfreefrag)
+ cancel_jfreefrag(freefrag->ff_jfreefrag);
+ LIST_REMOVE(oldaip, ai_next);
+ cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
+ free_newblk(&oldaip->ai_block);
+
+ return (freefrag);
+}
+
+/*
* Block de-allocation dependencies.
*
* When blocks are de-allocated, the on-disk pointers must be nullified before
@@ -2203,9 +4950,12 @@ softdep_setup_freeblocks(ip, length, flags)
off_t length; /* The new length for the file */
int flags; /* IO_EXT and/or IO_NORMAL */
{
+ struct ufs1_dinode *dp1;
+ struct ufs2_dinode *dp2;
struct freeblks *freeblks;
struct inodedep *inodedep;
struct allocdirect *adp;
+ struct jfreeblk *jfreeblk;
struct bufobj *bo;
struct vnode *vp;
struct buf *bp;
@@ -2213,6 +4963,13 @@ softdep_setup_freeblocks(ip, length, flags)
ufs2_daddr_t extblocks, datablocks;
struct mount *mp;
int i, delay, error;
+ ufs2_daddr_t blkno;
+ ufs_lbn_t tmpval;
+ ufs_lbn_t lbn;
+ long oldextsize;
+ long oldsize;
+ int frags;
+ int needj;
fs = ip->i_fs;
mp = UFSTOVFS(ip->i_ump);
@@ -2221,32 +4978,53 @@ softdep_setup_freeblocks(ip, length, flags)
freeblks = malloc(sizeof(struct freeblks),
M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
+ LIST_INIT(&freeblks->fb_jfreeblkhd);
+ LIST_INIT(&freeblks->fb_jwork);
freeblks->fb_state = ATTACHED;
freeblks->fb_uid = ip->i_uid;
freeblks->fb_previousinum = ip->i_number;
freeblks->fb_devvp = ip->i_devvp;
+ freeblks->fb_chkcnt = 0;
ACQUIRE_LOCK(&lk);
+ /*
+ * If we're truncating a removed file that will never be written
+ * we don't need to journal the block frees. The canceled journals
+ * for the allocations will suffice.
+ */
+ inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+ if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
+ (fs->fs_flags & FS_SUJ) == 0)
+ needj = 0;
+ else
+ needj = 1;
num_freeblkdep++;
FREE_LOCK(&lk);
extblocks = 0;
if (fs->fs_magic == FS_UFS2_MAGIC)
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
datablocks = DIP(ip, i_blocks) - extblocks;
- if ((flags & IO_NORMAL) == 0) {
- freeblks->fb_oldsize = 0;
- freeblks->fb_chkcnt = 0;
- } else {
- freeblks->fb_oldsize = ip->i_size;
+ if ((flags & IO_NORMAL) != 0) {
+ oldsize = ip->i_size;
ip->i_size = 0;
DIP_SET(ip, i_size, 0);
freeblks->fb_chkcnt = datablocks;
for (i = 0; i < NDADDR; i++) {
- freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
+ blkno = DIP(ip, i_db[i]);
DIP_SET(ip, i_db[i], 0);
+ if (blkno == 0)
+ continue;
+ frags = sblksize(fs, oldsize, i);
+ frags = numfrags(fs, frags);
+ newfreework(freeblks, NULL, i, blkno, frags, needj);
}
- for (i = 0; i < NIADDR; i++) {
- freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
+ for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
+ i++, tmpval *= NINDIR(fs)) {
+ blkno = DIP(ip, i_ib[i]);
DIP_SET(ip, i_ib[i], 0);
+ if (blkno)
+ newfreework(freeblks, NULL, -lbn - i, blkno,
+ fs->fs_frag, needj);
+ lbn += tmpval;
}
/*
* If the file was removed, then the space being freed was
@@ -2259,17 +5037,23 @@ softdep_setup_freeblocks(ip, length, flags)
UFS_UNLOCK(ip->i_ump);
}
}
- if ((flags & IO_EXT) == 0) {
- freeblks->fb_oldextsize = 0;
- } else {
- freeblks->fb_oldextsize = ip->i_din2->di_extsize;
+ if ((flags & IO_EXT) != 0) {
+ oldextsize = ip->i_din2->di_extsize;
ip->i_din2->di_extsize = 0;
freeblks->fb_chkcnt += extblocks;
for (i = 0; i < NXADDR; i++) {
- freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
+ blkno = ip->i_din2->di_extb[i];
ip->i_din2->di_extb[i] = 0;
+ if (blkno == 0)
+ continue;
+ frags = sblksize(fs, oldextsize, i);
+ frags = numfrags(fs, frags);
+ newfreework(freeblks, NULL, -1 - i, blkno, frags,
+ needj);
}
}
+ if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
+ needj = 0;
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
/*
* Push the zero'ed inode to to its disk buffer so that we are free
@@ -2282,12 +5066,17 @@ softdep_setup_freeblocks(ip, length, flags)
brelse(bp);
softdep_error("softdep_setup_freeblocks", error);
}
- if (ip->i_ump->um_fstype == UFS1)
- *((struct ufs1_dinode *)bp->b_data +
- ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
- else
- *((struct ufs2_dinode *)bp->b_data +
- ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
+ if (ip->i_ump->um_fstype == UFS1) {
+ dp1 = ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number));
+ ip->i_din1->di_freelink = dp1->di_freelink;
+ *dp1 = *ip->i_din1;
+ } else {
+ dp2 = ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number));
+ ip->i_din2->di_freelink = dp2->di_freelink;
+ *dp2 = *ip->i_din2;
+ }
/*
* Find and eliminate any inode dependencies.
*/
@@ -2304,7 +5093,9 @@ softdep_setup_freeblocks(ip, length, flags)
*/
delay = (inodedep->id_state & DEPCOMPLETE);
if (delay)
- WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
+ WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
+ else if (needj)
+ freeblks->fb_state |= DEPCOMPLETE | COMPLETE;
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
@@ -2318,14 +5109,19 @@ softdep_setup_freeblocks(ip, length, flags)
merge_inode_lists(&inodedep->id_newinoupdt,
&inodedep->id_inoupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
- free_allocdirect(&inodedep->id_inoupdt, adp, delay);
+ cancel_allocdirect(&inodedep->id_inoupdt, adp,
+ freeblks, delay);
}
if (flags & IO_EXT) {
merge_inode_lists(&inodedep->id_newextupdt,
&inodedep->id_extupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
- free_allocdirect(&inodedep->id_extupdt, adp, delay);
+ cancel_allocdirect(&inodedep->id_extupdt, adp,
+ freeblks, delay);
}
+ LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
+ add_to_journal(&jfreeblk->jf_list);
+
FREE_LOCK(&lk);
bdwrite(bp);
/*
@@ -2349,9 +5145,9 @@ restart:
BO_UNLOCK(bo);
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
- deallocate_dependencies(bp, inodedep);
+ if (deallocate_dependencies(bp, inodedep, freeblks))
+ bp->b_flags |= B_INVAL | B_NOCACHE;
FREE_LOCK(&lk);
- bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
BO_LOCK(bo);
goto restart;
@@ -2361,7 +5157,7 @@ restart:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
(void) free_inodedep(inodedep);
- if(delay) {
+ if (delay) {
freeblks->fb_state |= DEPCOMPLETE;
/*
* If the inode with zeroed block pointers is now on disk
@@ -2371,16 +5167,16 @@ restart:
* the request here than in the !delay case.
*/
if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
- add_to_worklist(&freeblks->fb_list);
+ add_to_worklist(&freeblks->fb_list, 1);
}
FREE_LOCK(&lk);
/*
- * If the inode has never been written to disk (delay == 0),
- * then we can process the freeblks now that we have deleted
- * the dependencies.
+ * If the inode has never been written to disk (delay == 0) and
+ * we're not waiting on any journal writes, then we can process the
+ * freeblks now that we have deleted the dependencies.
*/
- if (!delay)
+ if (!delay && !needj)
handle_workitem_freeblocks(freeblks, 0);
}
@@ -2389,19 +5185,23 @@ restart:
* be reallocated to a new vnode. The buffer must be locked, thus,
* no I/O completion operations can occur while we are manipulating
* its associated dependencies. The mutex is held so that other I/O's
- * associated with related dependencies do not occur.
+ * associated with related dependencies do not occur. Returns 1 if
+ * all dependencies were cleared, 0 otherwise.
*/
-static void
-deallocate_dependencies(bp, inodedep)
+static int
+deallocate_dependencies(bp, inodedep, freeblks)
struct buf *bp;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
{
struct worklist *wk;
struct indirdep *indirdep;
+ struct newdirblk *newdirblk;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct jremref *jremref;
+ struct jmvref *jmvref;
struct dirrem *dirrem;
- struct diradd *dap;
int i;
mtx_assert(&lk, MA_OWNED);
@@ -2410,47 +5210,24 @@ deallocate_dependencies(bp, inodedep)
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
- /*
- * None of the indirect pointers will ever be visible,
- * so they can simply be tossed. GOINGAWAY ensures
- * that allocated pointers will be saved in the buffer
- * cache until they are freed. Note that they will
- * only be able to be found by their physical address
- * since the inode mapping the logical address will
- * be gone. The save buffer used for the safe copy
- * was allocated in setup_allocindir_phase2 using
- * the physical address so it could be used for this
- * purpose. Hence we swap the safe copy with the real
- * copy, allowing the safe copy to be freed and holding
- * on to the real copy for later use in indir_trunc.
- */
- if (indirdep->ir_state & GOINGAWAY)
- panic("deallocate_dependencies: already gone");
- indirdep->ir_state |= GOINGAWAY;
- VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
- while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
- free_allocindir(aip, inodedep);
if (bp->b_lblkno >= 0 ||
bp->b_blkno != indirdep->ir_savebp->b_lblkno)
panic("deallocate_dependencies: not indir");
- bcopy(bp->b_data, indirdep->ir_savebp->b_data,
- bp->b_bcount);
- WORKLIST_REMOVE(wk);
- WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
+ cancel_indirdep(indirdep, bp, inodedep, freeblks);
continue;
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
/*
- * None of the directory additions will ever be
- * visible, so they can simply be tossed.
+ * There should be no directory add dependencies present
+ * as the directory could not be truncated until all
+ * children were removed.
*/
+ KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
+ ("deallocate_dependencies: pendinghd != NULL"));
for (i = 0; i < DAHASHSZ; i++)
- while ((dap =
- LIST_FIRST(&pagedep->pd_diraddhd[i])))
- free_diradd(dap);
- while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
- free_diradd(dap);
+ KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
+ ("deallocate_dependencies: diraddhd != NULL"));
/*
* Copy any directory remove dependencies to the list
* to be processed after the zero'ed inode is written.
@@ -2458,28 +5235,40 @@ deallocate_dependencies(bp, inodedep)
* can be dumped directly onto the work list.
*/
LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
+ /*
+ * If there are any dirrems we wait for
+ * the journal write to complete and
+ * then restart the buf scan as the lock
+ * has been dropped.
+ */
+ while ((jremref =
+ LIST_FIRST(&dirrem->dm_jremrefhd))
+ != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jremref->jr_list);
+ return (0);
+ }
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
if (inodedep == NULL ||
(inodedep->id_state & ALLCOMPLETE) ==
- ALLCOMPLETE)
- add_to_worklist(&dirrem->dm_list);
- else
+ ALLCOMPLETE) {
+ dirrem->dm_state |= COMPLETE;
+ add_to_worklist(&dirrem->dm_list, 0);
+ } else
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
if ((pagedep->pd_state & NEWBLOCK) != 0) {
- LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
- if (wk->wk_type == D_NEWDIRBLK &&
- WK_NEWDIRBLK(wk)->db_pagedep ==
- pagedep)
- break;
- if (wk != NULL) {
- WORKLIST_REMOVE(wk);
- free_newdirblk(WK_NEWDIRBLK(wk));
- } else
- panic("deallocate_dependencies: "
- "lost pagedep");
+ newdirblk = pagedep->pd_newdirblk;
+ WORKLIST_REMOVE(&newdirblk->db_list);
+ free_newdirblk(newdirblk);
+ }
+ while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
+ != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jmvref->jm_list);
+ return (0);
}
WORKLIST_REMOVE(&pagedep->pd_list);
LIST_REMOVE(pagedep, pd_hash);
@@ -2487,7 +5276,8 @@ deallocate_dependencies(bp, inodedep)
continue;
case D_ALLOCINDIR:
- free_allocindir(WK_ALLOCINDIR(wk), inodedep);
+ aip = WK_ALLOCINDIR(wk);
+ cancel_allocindir(aip, inodedep, freeblks);
continue;
case D_ALLOCDIRECT:
@@ -2502,46 +5292,155 @@ deallocate_dependencies(bp, inodedep)
/* NOTREACHED */
}
}
+
+ return (1);
}
/*
- * Free an allocdirect. Generate a new freefrag work request if appropriate.
- * This routine must be called with splbio interrupts blocked.
+ * An allocdirect is being canceled due to a truncate. We must make sure
+ * the journal entry is released in concert with the blkfree that releases
+ * the storage. Completed journal entries must not be released until the
+ * space is no longer pointed to by the inode or in the bitmap.
*/
static void
-free_allocdirect(adphead, adp, delay)
+cancel_allocdirect(adphead, adp, freeblks, delay)
struct allocdirectlst *adphead;
struct allocdirect *adp;
+ struct freeblks *freeblks;
int delay;
{
+ struct freework *freework;
+ struct newblk *newblk;
+ struct worklist *wk;
+ ufs_lbn_t lbn;
+
+ TAILQ_REMOVE(adphead, adp, ad_next);
+ newblk = (struct newblk *)adp;
+ /*
+ * If the journal hasn't been written the jnewblk must be passed
+ * to the call to ffs_freeblk that reclaims the space. We accomplish
+ * this by linking the journal dependency into the freework to be
+ * freed when freework_freeblock() is called. If the journal has
+ * been written we can simply reclaim the journal space when the
+ * freeblks work is complete.
+ */
+ if (newblk->nb_jnewblk == NULL) {
+ cancel_newblk(newblk, &freeblks->fb_jwork);
+ goto found;
+ }
+ lbn = newblk->nb_jnewblk->jn_lbn;
+ /*
+ * Find the correct freework structure so it releases the canceled
+ * journal when the bitmap is cleared. This preserves rollback
+ * until the allocation is reverted.
+ */
+ LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
+ freework = WK_FREEWORK(wk);
+ if (freework->fw_lbn != lbn)
+ continue;
+ cancel_newblk(newblk, &freework->fw_jwork);
+ goto found;
+ }
+ panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
+found:
+ if (delay)
+ WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
+ &newblk->nb_list);
+ else
+ free_newblk(newblk);
+ return;
+}
+
+
+static void
+cancel_newblk(newblk, wkhd)
+ struct newblk *newblk;
+ struct workhead *wkhd;
+{
+ struct indirdep *indirdep;
+ struct allocindir *aip;
+
+ while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
+ indirdep->ir_state &= ~ONDEPLIST;
+ LIST_REMOVE(indirdep, ir_next);
+ /*
+ * If an indirdep is not on the buf worklist we need to
+ * free it here as deallocate_dependencies() will never
+ * find it. These pointers were never visible on disk and
+ * can be discarded immediately.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
+ LIST_REMOVE(aip, ai_next);
+ cancel_newblk(&aip->ai_block, wkhd);
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * If this indirdep is not attached to a buf it was simply
+ * waiting on completion to clear completehd. free_indirdep()
+ * asserts that nothing is dangling.
+ */
+ if ((indirdep->ir_state & ONWORKLIST) == 0)
+ free_indirdep(indirdep);
+ }
+ if (newblk->nb_state & ONDEPLIST) {
+ newblk->nb_state &= ~ONDEPLIST;
+ LIST_REMOVE(newblk, nb_deps);
+ }
+ if (newblk->nb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&newblk->nb_list);
+ /*
+ * If the journal entry hasn't been written we hold onto the dep
+ * until it is safe to free along with the other journal work.
+ */
+ if (newblk->nb_jnewblk != NULL) {
+ cancel_jnewblk(newblk->nb_jnewblk, wkhd);
+ newblk->nb_jnewblk = NULL;
+ }
+ if (!LIST_EMPTY(&newblk->nb_jwork))
+ jwork_move(wkhd, &newblk->nb_jwork);
+}
+
+/*
+ * Free a newblk. Generate a new freefrag work request if appropriate.
+ * This must be called after the inode pointer and any direct block pointers
+ * are valid or fully removed via truncate or frag extension.
+ */
+static void
+free_newblk(newblk)
+ struct newblk *newblk;
+{
+ struct indirdep *indirdep;
struct newdirblk *newdirblk;
+ struct freefrag *freefrag;
struct worklist *wk;
mtx_assert(&lk, MA_OWNED);
- if ((adp->ad_state & DEPCOMPLETE) == 0)
- LIST_REMOVE(adp, ad_deps);
- TAILQ_REMOVE(adphead, adp, ad_next);
- if ((adp->ad_state & COMPLETE) == 0)
- WORKLIST_REMOVE(&adp->ad_list);
- if (adp->ad_freefrag != NULL) {
- if (delay)
- WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
- &adp->ad_freefrag->ff_list);
- else
- add_to_worklist(&adp->ad_freefrag->ff_list);
+ if (newblk->nb_state & ONDEPLIST)
+ LIST_REMOVE(newblk, nb_deps);
+ if (newblk->nb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&newblk->nb_list);
+ LIST_REMOVE(newblk, nb_hash);
+ if ((freefrag = newblk->nb_freefrag) != NULL) {
+ freefrag->ff_state |= COMPLETE;
+ if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(&freefrag->ff_list, 0);
}
- if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
+ if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
- if (!LIST_EMPTY(&adp->ad_newdirblk))
- panic("free_allocdirect: extra newdirblk");
- if (delay)
- WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
- &newdirblk->db_list);
- else
- free_newdirblk(newdirblk);
- }
- WORKITEM_FREE(adp, D_ALLOCDIRECT);
+ if (!LIST_EMPTY(&newblk->nb_newdirblk))
+ panic("free_newblk: extra newdirblk");
+ free_newdirblk(newdirblk);
+ }
+ while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
+ indirdep->ir_state |= DEPCOMPLETE;
+ indirdep_complete(indirdep);
+ }
+ KASSERT(newblk->nb_jnewblk == NULL,
+ ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
+ handle_jwork(&newblk->nb_jwork);
+ newblk->nb_list.wk_type = D_NEWBLK;
+ WORKITEM_FREE(newblk, D_NEWBLK);
}
/*
@@ -2554,6 +5453,7 @@ free_newdirblk(newdirblk)
{
struct pagedep *pagedep;
struct diradd *dap;
+ struct worklist *wk;
int i;
mtx_assert(&lk, MA_OWNED);
@@ -2571,17 +5471,25 @@ free_newdirblk(newdirblk)
pagedep->pd_state &= ~NEWBLOCK;
if ((pagedep->pd_state & ONWORKLIST) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
- free_diradd(dap);
+ free_diradd(dap, NULL);
/*
* If no dependencies remain, the pagedep will be freed.
*/
for (i = 0; i < DAHASHSZ; i++)
if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
break;
- if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
+ if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
+ LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
+ KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
+ ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
+ /* Should only ever be one item in the list. */
+ while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
+ }
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}
@@ -2608,6 +5516,7 @@ softdep_freefile(pvp, ino, mode)
freefile->fx_mode = mode;
freefile->fx_oldinum = ino;
freefile->fx_devvp = ip->i_devvp;
+ LIST_INIT(&freefile->fx_jwork);
if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
UFS_LOCK(ip->i_ump);
ip->i_fs->fs_pendinginodes += 1;
@@ -2618,11 +5527,34 @@ softdep_freefile(pvp, ino, mode)
* If the inodedep does not exist, then the zero'ed inode has
* been written to disk. If the allocated inode has never been
* written to disk, then the on-disk inode is zero'ed. In either
- * case we can free the file immediately.
+ * case we can free the file immediately. If the journal was
+ * canceled before being written the inode will never make it to
+ * disk and we must send the canceled journal entries to
+ * ffs_freefile() to be cleared in conjunction with the bitmap.
+ * Any blocks waiting on the inode to write can be safely freed
+ * here as it will never be written.
*/
ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
- check_inode_unwritten(inodedep)) {
+ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+ /*
+ * Remove this inode from the unlinked list and set
+ * GOINGAWAY as appropriate to indicate that this inode
+ * will never be written.
+ */
+ if (inodedep && inodedep->id_state & UNLINKED) {
+ /*
+ * Save the journal work to be freed with the bitmap
+ * before we clear UNLINKED. Otherwise it can be lost
+ * if the inode block is written.
+ */
+ handle_bufwait(inodedep, &freefile->fx_jwork);
+ clear_unlinked_inodedep(inodedep);
+ /* Re-acquire inodedep as we've dropped lk. */
+ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+ if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
+ inodedep->id_state |= GOINGAWAY;
+ }
+ if (inodedep == NULL || check_inode_unwritten(inodedep)) {
FREE_LOCK(&lk);
handle_workitem_freefile(freefile);
return;
@@ -2654,7 +5586,8 @@ check_inode_unwritten(inodedep)
{
mtx_assert(&lk, MA_OWNED);
- if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
+
+ if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
!LIST_EMPTY(&inodedep->id_pendinghd) ||
!LIST_EMPTY(&inodedep->id_bufwait) ||
!LIST_EMPTY(&inodedep->id_inowait) ||
@@ -2662,9 +5595,9 @@ check_inode_unwritten(inodedep)
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
+ inodedep->id_mkdiradd != NULL ||
inodedep->id_nlinkdelta != 0)
return (0);
-
/*
* Another process might be in initiate_write_inodeblock_ufs[12]
* trying to allocate memory without holding "Softdep Lock".
@@ -2673,9 +5606,11 @@ check_inode_unwritten(inodedep)
inodedep->id_savedino1 == NULL)
return (0);
+ if (inodedep->id_state & ONDEPLIST)
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_state &= ~ONDEPLIST;
inodedep->id_state |= ALLCOMPLETE;
- LIST_REMOVE(inodedep, id_deps);
- inodedep->id_buf = NULL;
+ inodedep->id_bmsafemap = NULL;
if (inodedep->id_state & ONWORKLIST)
WORKLIST_REMOVE(&inodedep->id_list);
if (inodedep->id_savedino1 != NULL) {
@@ -2696,17 +5631,23 @@ free_inodedep(inodedep)
{
mtx_assert(&lk, MA_OWNED);
- if ((inodedep->id_state & ONWORKLIST) != 0 ||
+ if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
+ !LIST_EMPTY(&inodedep->id_dirremhd) ||
!LIST_EMPTY(&inodedep->id_pendinghd) ||
!LIST_EMPTY(&inodedep->id_bufwait) ||
!LIST_EMPTY(&inodedep->id_inowait) ||
+ !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
- inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
+ inodedep->id_mkdiradd != NULL ||
+ inodedep->id_nlinkdelta != 0 ||
+ inodedep->id_savedino1 != NULL)
return (0);
+ if (inodedep->id_state & ONDEPLIST)
+ LIST_REMOVE(inodedep, id_deps);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
@@ -2714,6 +5655,126 @@ free_inodedep(inodedep)
}
/*
+ * Free the block referenced by a freework structure. The parent freeblks
+ * structure is released and completed when the final cg bitmap reaches
+ * the disk. This routine may be freeing a jnewblk which never made it to
+ * disk in which case we do not have to wait as the operation is undone
+ * in memory immediately.
+ */
+static void
+freework_freeblock(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct ufsmount *ump;
+ struct workhead wkhd;
+ struct fs *fs;
+ int complete;
+ int pending;
+ int bsize;
+ int needj;
+
+ freeblks = freework->fw_freeblks;
+ ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+ fs = ump->um_fs;
+ needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
+ complete = 0;
+ LIST_INIT(&wkhd);
+ /*
+ * If we are canceling an existing jnewblk pass it to the free
+ * routine, otherwise pass the freeblk which will ultimately
+ * release the freeblks. If we're not journaling, we can just
+ * free the freeblks immediately.
+ */
+ if (!LIST_EMPTY(&freework->fw_jwork)) {
+ LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
+ complete = 1;
+ } else if (needj)
+ WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
+ bsize = lfragtosize(fs, freework->fw_frags);
+ pending = btodb(bsize);
+ ACQUIRE_LOCK(&lk);
+ freeblks->fb_chkcnt -= pending;
+ FREE_LOCK(&lk);
+ /*
+ * extattr blocks don't show up in pending blocks. XXX why?
+ */
+ if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks -= pending;
+ UFS_UNLOCK(ump);
+ }
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
+ bsize, freeblks->fb_previousinum, &wkhd);
+ if (complete == 0 && needj)
+ return;
+ /*
+ * The jnewblk will be discarded and the bits in the map never
+ * made it to disk. We can immediately free the freeblk.
+ */
+ ACQUIRE_LOCK(&lk);
+ handle_written_freework(freework);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Start, continue, or finish the process of freeing an indirect block tree.
+ * The free operation may be paused at any point with fw_off containing the
+ * offset to restart from. This enables us to implement some flow control
+ * for large truncates which may fan out and generate a huge number of
+ * dependencies.
+ */
+static void
+handle_workitem_indirblk(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct ufsmount *ump;
+ struct fs *fs;
+
+
+ freeblks = freework->fw_freeblks;
+ ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+ fs = ump->um_fs;
+ if (freework->fw_off == NINDIR(fs))
+ freework_freeblock(freework);
+ else
+ indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
+ freework->fw_lbn);
+}
+
+/*
+ * Called when a freework structure attached to a cg buf is written. The
+ * ref on either the parent or the freeblks structure is released and
+ * either may be added to the worklist if it is the final ref.
+ */
+static void
+handle_written_freework(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct freework *parent;
+
+ freeblks = freework->fw_freeblks;
+ parent = freework->fw_parent;
+ if (parent) {
+ if (--parent->fw_ref != 0)
+ parent = NULL;
+ freeblks = NULL;
+ } else if (--freeblks->fb_ref != 0)
+ freeblks = NULL;
+ WORKITEM_FREE(freework, D_FREEWORK);
+ /*
+ * Don't delay these block frees or it takes an intolerable amount
+ * of time to process truncates and free their journal entries.
+ */
+ if (freeblks)
+ add_to_worklist(&freeblks->fb_list, 1);
+ if (parent)
+ add_to_worklist(&parent->fw_list, 1);
+}
+
+/*
* This workitem routine performs the block de-allocation.
* The workitem is added to the pending list after the updated
* inode block has been written to disk. As mentioned above,
@@ -2726,99 +5787,79 @@ handle_workitem_freeblocks(freeblks, flags)
struct freeblks *freeblks;
int flags;
{
+ struct freework *freework;
+ struct worklist *wk;
+
+ KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
+ ("handle_workitem_freeblocks: Journal entries not written."));
+ if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
+ handle_complete_freeblocks(freeblks);
+ return;
+ }
+ freeblks->fb_ref++;
+ while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
+ KASSERT(wk->wk_type == D_FREEWORK,
+ ("handle_workitem_freeblocks: Unknown type %s",
+ TYPENAME(wk->wk_type)));
+ WORKLIST_REMOVE_UNLOCKED(wk);
+ freework = WK_FREEWORK(wk);
+ if (freework->fw_lbn <= -NDADDR)
+ handle_workitem_indirblk(freework);
+ else
+ freework_freeblock(freework);
+ }
+ ACQUIRE_LOCK(&lk);
+ if (--freeblks->fb_ref != 0)
+ freeblks = NULL;
+ FREE_LOCK(&lk);
+ if (freeblks)
+ handle_complete_freeblocks(freeblks);
+}
+
+/*
+ * Once all of the freework workitems are complete we can retire the
+ * freeblocks dependency and any journal work awaiting completion. This
+ * can not be called until all other dependencies are stable on disk.
+ */
+static void
+handle_complete_freeblocks(freeblks)
+ struct freeblks *freeblks;
+{
struct inode *ip;
struct vnode *vp;
struct fs *fs;
struct ufsmount *ump;
- int i, nblocks, level, bsize;
- ufs2_daddr_t bn, blocksreleased = 0;
- int error, allerror = 0;
- ufs_lbn_t baselbns[NIADDR], tmpval;
- int fs_pendingblocks;
+ int flags;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
fs = ump->um_fs;
- fs_pendingblocks = 0;
- tmpval = 1;
- baselbns[0] = NDADDR;
- for (i = 1; i < NIADDR; i++) {
- tmpval *= NINDIR(fs);
- baselbns[i] = baselbns[i - 1] + tmpval;
- }
- nblocks = btodb(fs->fs_bsize);
- blocksreleased = 0;
- /*
- * Release all extended attribute blocks or frags.
- */
- if (freeblks->fb_oldextsize > 0) {
- for (i = (NXADDR - 1); i >= 0; i--) {
- if ((bn = freeblks->fb_eblks[i]) == 0)
- continue;
- bsize = sblksize(fs, freeblks->fb_oldextsize, i);
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
- freeblks->fb_previousinum);
- blocksreleased += btodb(bsize);
- }
- }
- /*
- * Release all data blocks or frags.
- */
- if (freeblks->fb_oldsize > 0) {
- /*
- * Indirect blocks first.
- */
- for (level = (NIADDR - 1); level >= 0; level--) {
- if ((bn = freeblks->fb_iblks[level]) == 0)
- continue;
- if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
- level, baselbns[level], &blocksreleased)) != 0)
- allerror = error;
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
- fs->fs_bsize, freeblks->fb_previousinum);
- fs_pendingblocks += nblocks;
- blocksreleased += nblocks;
- }
- /*
- * All direct blocks or frags.
- */
- for (i = (NDADDR - 1); i >= 0; i--) {
- if ((bn = freeblks->fb_dblks[i]) == 0)
- continue;
- bsize = sblksize(fs, freeblks->fb_oldsize, i);
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
- freeblks->fb_previousinum);
- fs_pendingblocks += btodb(bsize);
- blocksreleased += btodb(bsize);
- }
- }
- UFS_LOCK(ump);
- fs->fs_pendingblocks -= fs_pendingblocks;
- UFS_UNLOCK(ump);
+ flags = LK_NOWAIT;
+
/*
* If we still have not finished background cleanup, then check
* to see if the block count needs to be adjusted.
*/
- if (freeblks->fb_chkcnt != blocksreleased &&
- (fs->fs_flags & FS_UNCLEAN) != 0 &&
+ if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
- (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)
- == 0) {
+ (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
ip = VTOI(vp);
- DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
- freeblks->fb_chkcnt - blocksreleased);
+ DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
ip->i_flag |= IN_CHANGE;
vput(vp);
}
#ifdef INVARIANTS
- if (freeblks->fb_chkcnt != blocksreleased &&
+ if (freeblks->fb_chkcnt != 0 &&
((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
printf("handle_workitem_freeblocks: block count\n");
- if (allerror)
- softdep_error("handle_workitem_freeblks", allerror);
#endif /* INVARIANTS */
ACQUIRE_LOCK(&lk);
+ /*
+ * All of the freeblock deps must be complete prior to this call
+ * so it's now safe to complete earlier outstanding journal entries.
+ */
+ handle_jwork(&freeblks->fb_jwork);
WORKITEM_FREE(freeblks, D_FREEBLKS);
num_freeblkdep--;
FREE_LOCK(&lk);
@@ -2830,29 +5871,42 @@ handle_workitem_freeblocks(freeblks, flags)
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*/
-static int
-indir_trunc(freeblks, dbn, level, lbn, countp)
- struct freeblks *freeblks;
+static void
+indir_trunc(freework, dbn, lbn)
+ struct freework *freework;
ufs2_daddr_t dbn;
- int level;
ufs_lbn_t lbn;
- ufs2_daddr_t *countp;
{
+ struct freework *nfreework;
+ struct workhead wkhd;
+ struct jnewblk *jnewblk;
+ struct freeblks *freeblks;
struct buf *bp;
struct fs *fs;
+ struct worklist *wkn;
struct worklist *wk;
struct indirdep *indirdep;
struct ufsmount *ump;
ufs1_daddr_t *bap1 = 0;
- ufs2_daddr_t nb, *bap2 = 0;
+ ufs2_daddr_t nb, nnb, *bap2 = 0;
ufs_lbn_t lbnadd;
int i, nblocks, ufs1fmt;
- int error, allerror = 0;
int fs_pendingblocks;
+ int freedeps;
+ int needj;
+ int level;
+ int cnt;
+ LIST_INIT(&wkhd);
+ level = lbn_level(lbn);
+ if (level == -1)
+ panic("indir_trunc: Invalid lbn %jd\n", lbn);
+ freeblks = freework->fw_freeblks;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
fs = ump->um_fs;
fs_pendingblocks = 0;
+ freedeps = 0;
+ needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
lbnadd = 1;
for (i = level; i > 0; i--)
lbnadd *= NINDIR(fs);
@@ -2877,13 +5931,14 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
ACQUIRE_LOCK(&lk);
if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
if (wk->wk_type != D_INDIRDEP ||
- (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
- (indirdep->ir_state & GOINGAWAY) == 0)
- panic("indir_trunc: lost indirdep");
- WORKLIST_REMOVE(wk);
- WORKITEM_FREE(indirdep, D_INDIRDEP);
+ (wk->wk_state & GOINGAWAY) == 0)
+ panic("indir_trunc: lost indirdep %p", wk);
+ indirdep = WK_INDIRDEP(wk);
+ LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
+ free_indirdep(indirdep);
if (!LIST_EMPTY(&bp->b_dep))
- panic("indir_trunc: dangling dep");
+ panic("indir_trunc: dangling dep %p",
+ LIST_FIRST(&bp->b_dep));
ump->um_numindirdeps -= 1;
FREE_LOCK(&lk);
} else {
@@ -2892,11 +5947,10 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
brelse(bp);
#endif
FREE_LOCK(&lk);
- error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
- NOCRED, &bp);
- if (error) {
+ if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
+ NOCRED, &bp) != 0) {
brelse(bp);
- return (error);
+ return;
}
}
/*
@@ -2909,57 +5963,264 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
ufs1fmt = 0;
bap2 = (ufs2_daddr_t *)bp->b_data;
}
- nblocks = btodb(fs->fs_bsize);
- for (i = NINDIR(fs) - 1; i >= 0; i--) {
- if (ufs1fmt)
- nb = bap1[i];
+ /*
+ * Reclaim indirect blocks which never made it to disk.
+ */
+ cnt = 0;
+ LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
+ struct workhead freewk;
+ if (wk->wk_type != D_JNEWBLK)
+ continue;
+ WORKLIST_REMOVE_UNLOCKED(wk);
+ LIST_INIT(&freewk);
+ WORKLIST_INSERT_UNLOCKED(&freewk, wk);
+ jnewblk = WK_JNEWBLK(wk);
+ if (jnewblk->jn_lbn > 0)
+ i = (jnewblk->jn_lbn - -lbn) / lbnadd;
else
+ i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd;
+ KASSERT(i >= 0 && i < NINDIR(fs),
+ ("indir_trunc: Index out of range %d parent %jd lbn %jd",
+ i, lbn, jnewblk->jn_lbn));
+ /* Clear the pointer so it isn't found below. */
+ if (ufs1fmt) {
+ nb = bap1[i];
+ bap1[i] = 0;
+ } else {
nb = bap2[i];
+ bap2[i] = 0;
+ }
+ KASSERT(nb == jnewblk->jn_blkno,
+ ("indir_trunc: Block mismatch %jd != %jd",
+ nb, jnewblk->jn_blkno));
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno,
+ fs->fs_bsize, freeblks->fb_previousinum, &freewk);
+ cnt++;
+ }
+ ACQUIRE_LOCK(&lk);
+ if (needj)
+ freework->fw_ref += NINDIR(fs) + 1;
+ /* Any remaining journal work can be completed with freeblks. */
+ jwork_move(&freeblks->fb_jwork, &wkhd);
+ FREE_LOCK(&lk);
+ nblocks = btodb(fs->fs_bsize);
+ if (ufs1fmt)
+ nb = bap1[0];
+ else
+ nb = bap2[0];
+ nfreework = freework;
+ /*
+ * Reclaim on disk blocks.
+ */
+ for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
+ if (i != NINDIR(fs) - 1) {
+ if (ufs1fmt)
+ nnb = bap1[i+1];
+ else
+ nnb = bap2[i+1];
+ } else
+ nnb = 0;
if (nb == 0)
continue;
+ cnt++;
if (level != 0) {
- if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
- level - 1, lbn + (i * lbnadd), countp)) != 0)
- allerror = error;
+ ufs_lbn_t nlbn;
+
+ nlbn = (lbn + 1) - (i * lbnadd);
+ if (needj != 0) {
+ nfreework = newfreework(freeblks, freework,
+ nlbn, nb, fs->fs_frag, 0);
+ freedeps++;
+ }
+ indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
+ } else {
+ struct freedep *freedep;
+
+ /*
+ * Attempt to aggregate freedep dependencies for
+ * all blocks being released to the same CG.
+ */
+ LIST_INIT(&wkhd);
+ if (needj != 0 &&
+ (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
+ freedep = newfreedep(freework);
+ WORKLIST_INSERT_UNLOCKED(&wkhd,
+ &freedep->fd_list);
+ freedeps++;
+ }
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
+ fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
}
- ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
- freeblks->fb_previousinum);
+ }
+ if (level == 0)
+ fs_pendingblocks = (nblocks * cnt);
+ /*
+ * If we're not journaling we can free the indirect now. Otherwise
+ * setup the ref counts and offset so this indirect can be completed
+ * when its children are free.
+ */
+ if (needj == 0) {
fs_pendingblocks += nblocks;
- *countp += nblocks;
+ dbn = dbtofsb(fs, dbn);
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
+ freeblks->fb_previousinum, NULL);
+ ACQUIRE_LOCK(&lk);
+ freeblks->fb_chkcnt -= fs_pendingblocks;
+ if (freework->fw_blkno == dbn)
+ handle_written_freework(freework);
+ FREE_LOCK(&lk);
+ freework = NULL;
+ } else {
+ ACQUIRE_LOCK(&lk);
+ freework->fw_off = i;
+ freework->fw_ref += freedeps;
+ freework->fw_ref -= NINDIR(fs) + 1;
+ if (freework->fw_ref != 0)
+ freework = NULL;
+ freeblks->fb_chkcnt -= fs_pendingblocks;
+ FREE_LOCK(&lk);
+ }
+ if (fs_pendingblocks) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks -= fs_pendingblocks;
+ UFS_UNLOCK(ump);
}
- UFS_LOCK(ump);
- fs->fs_pendingblocks -= fs_pendingblocks;
- UFS_UNLOCK(ump);
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
- return (allerror);
+ if (freework)
+ handle_workitem_indirblk(freework);
+ return;
}
/*
- * Free an allocindir.
- * This routine must be called with splbio interrupts blocked.
+ * Cancel an allocindir when it is removed via truncation.
*/
static void
-free_allocindir(aip, inodedep)
+cancel_allocindir(aip, inodedep, freeblks)
struct allocindir *aip;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
{
- struct freefrag *freefrag;
+ struct newblk *newblk;
- mtx_assert(&lk, MA_OWNED);
- if ((aip->ai_state & DEPCOMPLETE) == 0)
- LIST_REMOVE(aip, ai_deps);
- if (aip->ai_state & ONWORKLIST)
- WORKLIST_REMOVE(&aip->ai_list);
+ /*
+ * If the journal hasn't been written the jnewblk must be passed
+ * to the call to ffs_freeblk that reclaims the space. We accomplish
+ * this by linking the journal dependency into the indirdep to be
+ * freed when indir_trunc() is called. If the journal has already
+ * been written we can simply reclaim the journal space when the
+ * freeblks work is complete.
+ */
LIST_REMOVE(aip, ai_next);
- if ((freefrag = aip->ai_freefrag) != NULL) {
+ newblk = (struct newblk *)aip;
+ if (newblk->nb_jnewblk == NULL)
+ cancel_newblk(newblk, &freeblks->fb_jwork);
+ else
+ cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
+ if (inodedep && inodedep->id_state & DEPCOMPLETE)
+ WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
+ else
+ free_newblk(newblk);
+}
+
+/*
+ * Create the mkdir dependencies for . and .. in a new directory. Link them
+ * in to a newdirblk so any subsequent additions are tracked properly. The
+ * caller is responsible for adding the mkdir1 dependency to the journal
+ * and updating id_mkdiradd. This function returns with lk held.
+ */
+static struct mkdir *
+setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
+ struct diradd *dap;
+ ino_t newinum;
+ ino_t dinum;
+ struct buf *newdirbp;
+ struct mkdir **mkdirp;
+{
+ struct newblk *newblk;
+ struct pagedep *pagedep;
+ struct inodedep *inodedep;
+ struct newdirblk *newdirblk = 0;
+ struct mkdir *mkdir1, *mkdir2;
+ struct worklist *wk;
+ struct jaddref *jaddref;
+ struct mount *mp;
+
+ mp = dap->da_list.wk_mp;
+ newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
+ M_SOFTDEP_FLAGS);
+ workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
+ LIST_INIT(&newdirblk->db_mkdir);
+ mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
+ workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
+ mkdir1->md_state = ATTACHED | MKDIR_BODY;
+ mkdir1->md_diradd = dap;
+ mkdir1->md_jaddref = NULL;
+ mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
+ workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
+ mkdir2->md_state = ATTACHED | MKDIR_PARENT;
+ mkdir2->md_diradd = dap;
+ mkdir2->md_jaddref = NULL;
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
+ mkdir1->md_state |= DEPCOMPLETE;
+ mkdir2->md_state |= DEPCOMPLETE;
+ }
+ /*
+ * Dependency on "." and ".." being written to disk.
+ */
+ mkdir1->md_buf = newdirbp;
+ ACQUIRE_LOCK(&lk);
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
+ /*
+ * We must link the pagedep, allocdirect, and newdirblk for
+ * the initial file page so the pointer to the new directory
+ * is not written until the directory contents are live and
+ * any subsequent additions are not marked live until the
+ * block is reachable via the inode.
+ */
+ if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
+ panic("setup_newdir: lost pagedep");
+ LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
+ if (wk->wk_type == D_ALLOCDIRECT)
+ break;
+ if (wk == NULL)
+ panic("setup_newdir: lost allocdirect");
+ newblk = WK_NEWBLK(wk);
+ pagedep->pd_state |= NEWBLOCK;
+ pagedep->pd_newdirblk = newdirblk;
+ newdirblk->db_pagedep = pagedep;
+ WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
+ WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
+ /*
+ * Look up the inodedep for the parent directory so that we
+ * can link mkdir2 into the pending dotdot jaddref or
+ * the inode write if there is none. If the inode is
+ * ALLCOMPLETE and no jaddref is present all dependencies have
+ * been satisfied and mkdir2 can be freed.
+ */
+ inodedep_lookup(mp, dinum, 0, &inodedep);
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
if (inodedep == NULL)
- add_to_worklist(&freefrag->ff_list);
- else
- WORKLIST_INSERT(&inodedep->id_bufwait,
- &freefrag->ff_list);
+ panic("setup_newdir: Lost parent.");
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
+ (jaddref->ja_state & MKDIR_PARENT),
+ ("setup_newdir: bad dotdot jaddref %p", jaddref));
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
+ mkdir2->md_jaddref = jaddref;
+ jaddref->ja_mkdir = mkdir2;
+ } else if (inodedep == NULL ||
+ (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ dap->da_state &= ~MKDIR_PARENT;
+ WORKITEM_FREE(mkdir2, D_MKDIR);
+ } else {
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
+ WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
- WORKITEM_FREE(aip, D_ALLOCINDIR);
+ *mkdirp = mkdir2;
+
+ return (mkdir1);
}
/*
@@ -2998,12 +6259,14 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
ufs_lbn_t lbn; /* block in directory containing new entry */
struct fs *fs;
struct diradd *dap;
- struct allocdirect *adp;
+ struct newblk *newblk;
struct pagedep *pagedep;
struct inodedep *inodedep;
struct newdirblk *newdirblk = 0;
struct mkdir *mkdir1, *mkdir2;
+ struct jaddref *jaddref;
struct mount *mp;
+ int isindir;
/*
* Whiteouts have no dependencies.
@@ -3013,6 +6276,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
bdwrite(newdirbp);
return (0);
}
+ jaddref = NULL;
+ mkdir1 = mkdir2 = NULL;
mp = UFSTOVFS(dp->i_ump);
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
@@ -3023,111 +6288,123 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
dap->da_offset = offset;
dap->da_newinum = newinum;
dap->da_state = ATTACHED;
- if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
+ LIST_INIT(&dap->da_jwork);
+ isindir = bp->b_lblkno >= NDADDR;
+ if (isnewblk &&
+ (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
newdirblk = malloc(sizeof(struct newdirblk),
M_NEWDIRBLK, M_SOFTDEP_FLAGS);
workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
+ LIST_INIT(&newdirblk->db_mkdir);
}
+ /*
+ * If we're creating a new directory setup the dependencies and set
+ * the dap state to wait for them. Otherwise it's COMPLETE and
+ * we can move on.
+ */
if (newdirbp == NULL) {
dap->da_state |= DEPCOMPLETE;
ACQUIRE_LOCK(&lk);
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
- mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR,
- M_SOFTDEP_FLAGS);
- workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
- mkdir1->md_state = MKDIR_BODY;
- mkdir1->md_diradd = dap;
- mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR,
- M_SOFTDEP_FLAGS);
- workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
- mkdir2->md_state = MKDIR_PARENT;
- mkdir2->md_diradd = dap;
- /*
- * Dependency on "." and ".." being written to disk.
- */
- mkdir1->md_buf = newdirbp;
- ACQUIRE_LOCK(&lk);
- LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
- WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
- FREE_LOCK(&lk);
- bdwrite(newdirbp);
- /*
- * Dependency on link count increase for parent directory
- */
- ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
- || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
- dap->da_state &= ~MKDIR_PARENT;
- WORKITEM_FREE(mkdir2, D_MKDIR);
- } else {
- LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
- WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
- }
+ mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
+ &mkdir2);
}
/*
* Link into parent directory pagedep to await its being written.
*/
- if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
+ if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
+#ifdef DEBUG
+ if (diradd_lookup(pagedep, offset) != NULL)
+ panic("softdep_setup_directory_add: %p already at off %d\n",
+ diradd_lookup(pagedep, offset), offset);
+#endif
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
+ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
/*
- * Link into its inodedep. Put it on the id_bufwait list if the inode
- * is not yet written. If it is written, do the post-inode write
- * processing to put it on the id_pendinghd list.
+ * If we're journaling, link the diradd into the jaddref so it
+ * may be completed after the journal entry is written. Otherwise,
+ * link the diradd into its inodedep. If the inode is not yet
+ * written place it on the bufwait list, otherwise do the post-inode
+ * write processing to put it on the id_pendinghd list.
*/
- (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
- if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_directory_add: bad jaddref %p", jaddref));
+ jaddref->ja_diroff = diroffset;
+ jaddref->ja_diradd = dap;
+ add_to_journal(&jaddref->ja_list);
+ } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
- if (isnewblk) {
+ /*
+ * Add the journal entries for . and .. links now that the primary
+ * link is written.
+ */
+ if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
+ inoreflst, if_deps);
+ KASSERT(jaddref != NULL &&
+ jaddref->ja_ino == jaddref->ja_parent &&
+ (jaddref->ja_state & MKDIR_BODY),
+ ("softdep_setup_directory_add: bad dot jaddref %p",
+ jaddref));
+ mkdir1->md_jaddref = jaddref;
+ jaddref->ja_mkdir = mkdir1;
/*
- * Directories growing into indirect blocks are rare
- * enough and the frequency of new block allocation
- * in those cases even more rare, that we choose not
- * to bother tracking them. Rather we simply force the
- * new directory entry to disk.
+ * It is important that the dotdot journal entry
+ * is added prior to the dot entry since dot writes
+ * both the dot and dotdot links. These both must
+ * be added after the primary link for the journal
+ * to remain consistent.
*/
- if (lbn >= NDADDR) {
- FREE_LOCK(&lk);
- /*
- * We only have a new allocation when at the
- * beginning of a new block, not when we are
- * expanding into an existing block.
- */
- if (blkoff(fs, diroffset) == 0)
- return (1);
- return (0);
- }
+ add_to_journal(&mkdir2->md_jaddref->ja_list);
+ add_to_journal(&jaddref->ja_list);
+ }
+ /*
+ * If we are adding a new directory remember this diradd so that if
+ * we rename it we can keep the dot and dotdot dependencies. If
+ * we are adding a new name for an inode that has a mkdiradd we
+ * must be in rename and we have to move the dot and dotdot
+ * dependencies to this new name. The old name is being orphaned
+ * soon.
+ */
+ if (mkdir1 != NULL) {
+ if (inodedep->id_mkdiradd != NULL)
+ panic("softdep_setup_directory_add: Existing mkdir");
+ inodedep->id_mkdiradd = dap;
+ } else if (inodedep->id_mkdiradd)
+ merge_diradd(inodedep, dap);
+ if (newdirblk) {
/*
- * We only have a new allocation when at the beginning
- * of a new fragment, not when we are expanding into an
- * existing fragment. Also, there is nothing to do if we
- * are already tracking this block.
+ * There is nothing to do if we are already tracking
+ * this block.
*/
- if (fragoff(fs, diroffset) != 0) {
- FREE_LOCK(&lk);
- return (0);
- }
if ((pagedep->pd_state & NEWBLOCK) != 0) {
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
FREE_LOCK(&lk);
return (0);
}
- /*
- * Find our associated allocdirect and have it track us.
- */
- if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
- panic("softdep_setup_directory_add: lost inodedep");
- adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
- if (adp == NULL || adp->ad_lbn != lbn)
+ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
+ == 0)
panic("softdep_setup_directory_add: lost entry");
+ WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
pagedep->pd_state |= NEWBLOCK;
+ pagedep->pd_newdirblk = newdirblk;
newdirblk->db_pagedep = pagedep;
- WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
+ FREE_LOCK(&lk);
+ /*
+ * If we extended into an indirect signal direnter to sync.
+ */
+ if (isindir)
+ return (1);
+ return (0);
}
FREE_LOCK(&lk);
return (0);
@@ -3141,7 +6418,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
* occur while the move is in progress.
*/
void
-softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
+softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
+ struct buf *bp; /* Buffer holding directory block. */
struct inode *dp; /* inode for directory */
caddr_t base; /* address of dp->i_offset */
caddr_t oldloc; /* address of old directory location */
@@ -3150,40 +6428,204 @@ softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
{
int offset, oldoffset, newoffset;
struct pagedep *pagedep;
+ struct jmvref *jmvref;
struct diradd *dap;
+ struct direct *de;
+ struct mount *mp;
ufs_lbn_t lbn;
+ int flags;
- ACQUIRE_LOCK(&lk);
+ mp = UFSTOVFS(dp->i_ump);
+ de = (struct direct *)oldloc;
+ jmvref = NULL;
+ flags = 0;
+ /*
+ * Moves are always journaled as it would be too complex to
+ * determine if any affected adds or removes are present in the
+ * journal.
+ */
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ flags = DEPALLOC;
+ jmvref = newjmvref(dp, de->d_ino,
+ dp->i_offset + (oldloc - base),
+ dp->i_offset + (newloc - base));
+ }
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
- if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
- goto done;
oldoffset = offset + (oldloc - base);
newoffset = offset + (newloc - base);
-
- LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
- if (dap->da_offset != oldoffset)
- continue;
+ ACQUIRE_LOCK(&lk);
+ if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
+ if (pagedep)
+ WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
+ goto done;
+ }
+ dap = diradd_lookup(pagedep, oldoffset);
+ if (dap) {
dap->da_offset = newoffset;
- if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
- break;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
- dap, da_pdlist);
- break;
+ newoffset = DIRADDHASH(newoffset);
+ oldoffset = DIRADDHASH(oldoffset);
+ if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
+ newoffset != oldoffset) {
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
+ dap, da_pdlist);
+ }
}
- if (dap == NULL) {
+done:
+ if (jmvref) {
+ jmvref->jm_pagedep = pagedep;
+ LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
+ add_to_journal(&jmvref->jm_list);
+ }
+ bcopy(oldloc, newloc, entrysize);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Move the mkdir dependencies and journal work from one diradd to another
+ * when renaming a directory. The new name must depend on the mkdir deps
+ * completing as the old name did. Directories can only have one valid link
+ * at a time so one must be canonical.
+ */
+static void
+merge_diradd(inodedep, newdap)
+ struct inodedep *inodedep;
+ struct diradd *newdap;
+{
+ struct diradd *olddap;
+ struct mkdir *mkdir, *nextmd;
+ short state;
- LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
- if (dap->da_offset == oldoffset) {
- dap->da_offset = newoffset;
+ olddap = inodedep->id_mkdiradd;
+ inodedep->id_mkdiradd = newdap;
+ if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
+ newdap->da_state &= ~DEPCOMPLETE;
+ for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
+ nextmd = LIST_NEXT(mkdir, md_mkdirs);
+ if (mkdir->md_diradd != olddap)
+ continue;
+ mkdir->md_diradd = newdap;
+ state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
+ newdap->da_state |= state;
+ olddap->da_state &= ~state;
+ if ((olddap->da_state &
+ (MKDIR_PARENT | MKDIR_BODY)) == 0)
break;
+ }
+ if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
+ panic("merge_diradd: unfound ref");
+ }
+ /*
+ * Any mkdir related journal items are not safe to be freed until
+ * the new name is stable.
+ */
+ jwork_move(&newdap->da_jwork, &olddap->da_jwork);
+ olddap->da_state |= DEPCOMPLETE;
+ complete_diradd(olddap);
+}
+
+/*
+ * Move the diradd to the pending list when all diradd dependencies are
+ * complete.
+ */
+static void
+complete_diradd(dap)
+ struct diradd *dap;
+{
+ struct pagedep *pagedep;
+
+ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ if (dap->da_state & DIRCHG)
+ pagedep = dap->da_previous->dm_pagedep;
+ else
+ pagedep = dap->da_pagedep;
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
+ }
+}
+
+/*
+ * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal
+ * add entries and conditionally journal the remove.
+ */
+static void
+cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
+ struct diradd *dap;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct inoref *inoref;
+ struct mkdir *mkdir;
+
+ /*
+ * If no remove references were allocated we're on a non-journaled
+ * filesystem and can skip the cancel step.
+ */
+ if (jremref == NULL) {
+ free_diradd(dap, NULL);
+ return;
+ }
+ /*
+ * Cancel the primary name and free it if it does not require
+ * journaling.
+ */
+ if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
+ 0, &inodedep) != 0) {
+ /* Abort the addref that reference this diradd. */
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if (inoref->if_list.wk_type != D_JADDREF)
+ continue;
+ jaddref = (struct jaddref *)inoref;
+ if (jaddref->ja_diradd != dap)
+ continue;
+ if (cancel_jaddref(jaddref, inodedep,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(jremref);
+ jremref = NULL;
}
+ break;
}
}
-done:
- bcopy(oldloc, newloc, entrysize);
- FREE_LOCK(&lk);
+ /*
+ * Cancel subordinate names and free them if they do not require
+ * journaling.
+ */
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
+ LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
+ if (mkdir->md_diradd != dap)
+ continue;
+ if ((jaddref = mkdir->md_jaddref) == NULL)
+ continue;
+ mkdir->md_jaddref = NULL;
+ if (mkdir->md_state & MKDIR_PARENT) {
+ if (cancel_jaddref(jaddref, NULL,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(dotdotremref);
+ dotdotremref = NULL;
+ }
+ } else {
+ if (cancel_jaddref(jaddref, inodedep,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(dotremref);
+ dotremref = NULL;
+ }
+ }
+ }
+ }
+
+ if (jremref)
+ journal_jremref(dirrem, jremref, inodedep);
+ if (dotremref)
+ journal_jremref(dirrem, dotremref, inodedep);
+ if (dotdotremref)
+ journal_jremref(dirrem, dotdotremref, NULL);
+ jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
+ free_diradd(dap, &dirrem->dm_jwork);
}
/*
@@ -3191,8 +6633,9 @@ done:
* with splbio interrupts blocked.
*/
static void
-free_diradd(dap)
+free_diradd(dap, wkhd)
struct diradd *dap;
+ struct workhead *wkhd;
{
struct dirrem *dirrem;
struct pagedep *pagedep;
@@ -3200,32 +6643,48 @@ free_diradd(dap)
struct mkdir *mkdir, *nextmd;
mtx_assert(&lk, MA_OWNED);
- WORKLIST_REMOVE(&dap->da_list);
LIST_REMOVE(dap, da_pdlist);
+ if (dap->da_state & ONWORKLIST)
+ WORKLIST_REMOVE(&dap->da_list);
if ((dap->da_state & DIRCHG) == 0) {
pagedep = dap->da_pagedep;
} else {
dirrem = dap->da_previous;
pagedep = dirrem->dm_pagedep;
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ dirrem->dm_state |= COMPLETE;
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
0, &inodedep) != 0)
- (void) free_inodedep(inodedep);
+ if (inodedep->id_mkdiradd == dap)
+ inodedep->id_mkdiradd = NULL;
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
nextmd = LIST_NEXT(mkdir, md_mkdirs);
if (mkdir->md_diradd != dap)
continue;
- dap->da_state &= ~mkdir->md_state;
- WORKLIST_REMOVE(&mkdir->md_list);
+ dap->da_state &=
+ ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
LIST_REMOVE(mkdir, md_mkdirs);
+ if (mkdir->md_state & ONWORKLIST)
+ WORKLIST_REMOVE(&mkdir->md_list);
+ if (mkdir->md_jaddref != NULL)
+ panic("free_diradd: Unexpected jaddref");
WORKITEM_FREE(mkdir, D_MKDIR);
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
+ break;
}
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
panic("free_diradd: unfound ref");
}
+ if (inodedep)
+ free_inodedep(inodedep);
+ /*
+ * Free any journal segments waiting for the directory write.
+ */
+ handle_jwork(&dap->da_jwork);
WORKITEM_FREE(dap, D_DIRADD);
}
@@ -3254,11 +6713,24 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
int isrmdir; /* indicates if doing RMDIR */
{
struct dirrem *dirrem, *prevdirrem;
+ struct inodedep *inodedep;
+ int direct;
/*
- * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
+ * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want
+ * newdirrem() to setup the full directory remove which requires
+ * isrmdir > 1.
*/
- dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
+ dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem);
+ /*
+ * Add the dirrem to the inodedep's pending remove list for quick
+ * discovery later.
+ */
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0)
+ panic("softdep_setup_remove: Lost inodedep.");
+ dirrem->dm_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
/*
* If the COMPLETE flag is clear, then there were no active
@@ -3280,9 +6752,146 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
prevdirrem, dm_next);
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
+ direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
FREE_LOCK(&lk);
- handle_workitem_remove(dirrem, NULL);
+ if (direct)
+ handle_workitem_remove(dirrem, NULL);
+ }
+}
+
+/*
+ * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
+ * pd_pendinghd list of a pagedep.
+ */
+static struct diradd *
+diradd_lookup(pagedep, offset)
+ struct pagedep *pagedep;
+ int offset;
+{
+ struct diradd *dap;
+
+ LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
+ if (dap->da_offset == offset)
+ return (dap);
+ LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
+ if (dap->da_offset == offset)
+ return (dap);
+ return (NULL);
+}
+
+/*
+ * Search for a .. diradd dependency in a directory that is being removed.
+ * If the directory was renamed to a new parent we have a diradd rather
+ * than a mkdir for the .. entry. We need to cancel it now before
+ * it is found in truncate().
+ */
+static struct jremref *
+cancel_diradd_dotdot(ip, dirrem, jremref)
+ struct inode *ip;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+{
+ struct pagedep *pagedep;
+ struct diradd *dap;
+ struct worklist *wk;
+
+ if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
+ &pagedep) == 0)
+ return (jremref);
+ dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
+ if (dap == NULL)
+ return (jremref);
+ cancel_diradd(dap, dirrem, jremref, NULL, NULL);
+ /*
+ * Mark any journal work as belonging to the parent so it is freed
+ * with the .. reference.
+ */
+ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
+ wk->wk_state |= MKDIR_PARENT;
+ return (NULL);
+}
+
+/*
+ * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
+ * replace it with a dirrem/diradd pair as a result of re-parenting a
+ * directory. This ensures that we don't simultaneously have a mkdir and
+ * a diradd for the same .. entry.
+ */
+static struct jremref *
+cancel_mkdir_dotdot(ip, dirrem, jremref)
+ struct inode *ip;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct mkdir *mkdir;
+ struct diradd *dap;
+
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0)
+ panic("cancel_mkdir_dotdot: Lost inodedep");
+ dap = inodedep->id_mkdiradd;
+ if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
+ return (jremref);
+ for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
+ mkdir = LIST_NEXT(mkdir, md_mkdirs))
+ if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
+ break;
+ if (mkdir == NULL)
+ panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
+ if ((jaddref = mkdir->md_jaddref) != NULL) {
+ mkdir->md_jaddref = NULL;
+ jaddref->ja_state &= ~MKDIR_PARENT;
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
+ &inodedep) == 0)
+ panic("cancel_mkdir_dotdot: Lost parent inodedep");
+ if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
+ journal_jremref(dirrem, jremref, inodedep);
+ jremref = NULL;
+ }
}
+ if (mkdir->md_state & ONWORKLIST)
+ WORKLIST_REMOVE(&mkdir->md_list);
+ mkdir->md_state |= ALLCOMPLETE;
+ complete_mkdir(mkdir);
+ return (jremref);
+}
+
+static void
+journal_jremref(dirrem, jremref, inodedep)
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct inodedep *inodedep;
+{
+
+ if (inodedep == NULL)
+ if (inodedep_lookup(jremref->jr_list.wk_mp,
+ jremref->jr_ref.if_ino, 0, &inodedep) == 0)
+ panic("journal_jremref: Lost inodedep");
+ LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+ add_to_journal(&jremref->jr_list);
+}
+
+static void
+dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+{
+ struct inodedep *inodedep;
+
+
+ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
+ &inodedep) == 0)
+ panic("dirrem_journal: Lost inodedep");
+ journal_jremref(dirrem, jremref, inodedep);
+ if (dotremref)
+ journal_jremref(dirrem, dotremref, inodedep);
+ if (dotdotremref)
+ journal_jremref(dirrem, dotdotremref, NULL);
}
/*
@@ -3303,12 +6912,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
struct diradd *dap;
struct dirrem *dirrem;
struct pagedep *pagedep;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+ struct vnode *dvp;
/*
* Whiteouts have no deletion dependencies.
*/
if (ip == NULL)
panic("newdirrem: whiteout");
+ dvp = ITOV(dp);
/*
* If we are over our limit, try to improve the situation.
* Limiting the number of dirrem structures will also limit
@@ -3321,34 +6935,75 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
FREE_LOCK(&lk);
dirrem = malloc(sizeof(struct dirrem),
M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
+ workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
+ LIST_INIT(&dirrem->dm_jremrefhd);
+ LIST_INIT(&dirrem->dm_jwork);
dirrem->dm_state = isrmdir ? RMDIR : 0;
dirrem->dm_oldinum = ip->i_number;
*prevdirremp = NULL;
-
+ /*
+ * Allocate remove reference structures to track journal write
+ * dependencies. We will always have one for the link and
+ * when doing directories we will always have one more for dot.
+ * When renaming a directory we skip the dotdot link change so
+ * this is not needed.
+ */
+ jremref = dotremref = dotdotremref = NULL;
+ if (DOINGSUJ(dvp)) {
+ if (isrmdir) {
+ jremref = newjremref(dirrem, dp, ip, dp->i_offset,
+ ip->i_effnlink + 2);
+ dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
+ ip->i_effnlink + 1);
+ } else
+ jremref = newjremref(dirrem, dp, ip, dp->i_offset,
+ ip->i_effnlink + 1);
+ if (isrmdir > 1) {
+ dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
+ dp->i_effnlink + 1);
+ dotdotremref->jr_state |= MKDIR_PARENT;
+ }
+ }
ACQUIRE_LOCK(&lk);
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
- if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
+ if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
+ &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dirrem->dm_pagedep = pagedep;
/*
+ * If we're renaming a .. link to a new directory, cancel any
+ * existing MKDIR_PARENT mkdir. If it has already been canceled
+ * the jremref is preserved for any potential diradd in this
+ * location. This can not coincide with a rmdir.
+ */
+ if (dp->i_offset == DOTDOT_OFFSET) {
+ if (isrmdir)
+ panic("newdirrem: .. directory change during remove?");
+ jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
+ }
+ /*
+ * If we're removing a directory search for the .. dependency now and
+ * cancel it. Any pending journal work will be added to the dirrem
+ * to be completed when the workitem remove completes.
+ */
+ if (isrmdir > 1)
+ dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
+ /*
* Check for a diradd dependency for the same directory entry.
* If present, then both dependencies become obsolete and can
- * be de-allocated. Check for an entry on both the pd_dirraddhd
- * list and the pd_pendinghd list.
+ * be de-allocated.
*/
-
- LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
- if (dap->da_offset == offset)
- break;
+ dap = diradd_lookup(pagedep, offset);
if (dap == NULL) {
-
- LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
- if (dap->da_offset == offset)
- break;
- if (dap == NULL)
- return (dirrem);
+ /*
+ * Link the jremref structures into the dirrem so they are
+ * written prior to the pagedep.
+ */
+ if (jremref)
+ dirrem_journal(dirrem, jremref, dotremref,
+ dotdotremref);
+ return (dirrem);
}
/*
* Must be ATTACHED at this point.
@@ -3373,7 +7028,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
* Mark it COMPLETE so we can delete its inode immediately.
*/
dirrem->dm_state |= COMPLETE;
- free_diradd(dap);
+ cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
+#ifdef SUJ_DEBUG
+ if (isrmdir == 0) {
+ struct worklist *wk;
+
+ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
+ if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
+ panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
+ }
+#endif
+
return (dirrem);
}
@@ -3407,6 +7072,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
struct dirrem *dirrem, *prevdirrem;
struct pagedep *pagedep;
struct inodedep *inodedep;
+ struct jaddref *jaddref;
struct mount *mp;
offset = blkoff(dp->i_fs, dp->i_offset);
@@ -3422,6 +7088,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
dap->da_offset = offset;
dap->da_newinum = newinum;
+ LIST_INIT(&dap->da_jwork);
}
/*
@@ -3454,11 +7121,21 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dm_next);
} else {
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
FREE_LOCK(&lk);
return;
}
+ /*
+ * Add the dirrem to the inodedep's pending remove list for quick
+ * discovery later. A valid nlinkdelta ensures that this lookup
+ * will not fail.
+ */
+ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
+ panic("softdep_setup_directory_change: Lost inodedep.");
+ dirrem->dm_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
/*
* If the COMPLETE flag is clear, then there were no active
@@ -3483,15 +7160,29 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap->da_pagedep = pagedep;
}
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
/*
- * Link into its inodedep. Put it on the id_bufwait list if the inode
+ * Lookup the jaddref for this journal entry. We must finish
+ * initializing it and make the diradd write dependent on it.
+ * If we're not journaling, put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
- if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
- (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_directory_change: bad jaddref %p",
+ jaddref));
+ jaddref->ja_diroff = dp->i_offset;
+ jaddref->ja_diradd = dap;
+ LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
+ dap, da_pdlist);
+ add_to_journal(&jaddref->ja_list);
+ } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
dap->da_state |= COMPLETE;
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
@@ -3500,6 +7191,13 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
}
+ /*
+ * If we're making a new name for a directory that has not been
+ * committed we need to move the dot and dotdot references to
+ * this new name.
+ */
+ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
+ merge_diradd(inodedep, dap);
FREE_LOCK(&lk);
}
@@ -3516,8 +7214,7 @@ softdep_change_linkcnt(ip)
struct inodedep *inodedep;
ACQUIRE_LOCK(&lk);
- (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
- DEPALLOC, &inodedep);
+ inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
if (ip->i_nlink < ip->i_effnlink)
panic("softdep_change_linkcnt: bad delta");
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
@@ -3574,6 +7271,305 @@ softdep_releasefile(ip)
}
/*
+ * Attach a sbdep dependency to the superblock buf so that we can keep
+ * track of the head of the linked list of referenced but unlinked inodes.
+ */
+void
+softdep_setup_sbupdate(ump, fs, bp)
+ struct ufsmount *ump;
+ struct fs *fs;
+ struct buf *bp;
+{
+ struct sbdep *sbdep;
+ struct worklist *wk;
+
+ if ((fs->fs_flags & FS_SUJ) == 0)
+ return;
+ LIST_FOREACH(wk, &bp->b_dep, wk_list)
+ if (wk->wk_type == D_SBDEP)
+ break;
+ if (wk != NULL)
+ return;
+ sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
+ workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
+ sbdep->sb_fs = fs;
+ sbdep->sb_ump = ump;
+ ACQUIRE_LOCK(&lk);
+ WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Return the first unlinked inodedep which is ready to be the head of the
+ * list. The inodedep and all those after it must have valid next pointers.
+ */
+static struct inodedep *
+first_unlinked_inodedep(ump)
+ struct ufsmount *ump;
+{
+ struct inodedep *inodedep;
+ struct inodedep *idp;
+
+ for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
+ inodedep; inodedep = idp) {
+ if ((inodedep->id_state & UNLINKNEXT) == 0)
+ return (NULL);
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
+ break;
+ if ((inodedep->id_state & UNLINKPREV) == 0)
+ panic("first_unlinked_inodedep: prev != next");
+ }
+ if (inodedep == NULL)
+ return (NULL);
+
+ return (inodedep);
+}
+
+/*
+ * Set the sujfree unlinked head pointer prior to writing a superblock.
+ */
+static void
+initiate_write_sbdep(sbdep)
+ struct sbdep *sbdep;
+{
+ struct inodedep *inodedep;
+ struct fs *bpfs;
+ struct fs *fs;
+
+ bpfs = sbdep->sb_fs;
+ fs = sbdep->sb_ump->um_fs;
+ inodedep = first_unlinked_inodedep(sbdep->sb_ump);
+ if (inodedep) {
+ fs->fs_sujfree = inodedep->id_ino;
+ inodedep->id_state |= UNLINKPREV;
+ } else
+ fs->fs_sujfree = 0;
+ bpfs->fs_sujfree = fs->fs_sujfree;
+}
+
+/*
+ * After a superblock is written determine whether it must be written again
+ * due to a changing unlinked list head.
+ */
+static int
+handle_written_sbdep(sbdep, bp)
+ struct sbdep *sbdep;
+ struct buf *bp;
+{
+ struct inodedep *inodedep;
+ struct mount *mp;
+ struct fs *fs;
+
+ fs = sbdep->sb_fs;
+ mp = UFSTOVFS(sbdep->sb_ump);
+ inodedep = first_unlinked_inodedep(sbdep->sb_ump);
+ if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
+ (inodedep == NULL && fs->fs_sujfree != 0)) {
+ bdirty(bp);
+ return (1);
+ }
+ WORKITEM_FREE(sbdep, D_SBDEP);
+ if (fs->fs_sujfree == 0)
+ return (0);
+ if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
+ panic("handle_written_sbdep: lost inodedep");
+ /*
+ * Now that we have a record of this inode in stable store allow it
+ * to be written to free up pending work. Inodes may see a lot of
+ * write activity after they are unlinked which we must not hold up.
+ */
+ for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
+ if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
+ panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
+ inodedep, inodedep->id_state);
+ if (inodedep->id_state & UNLINKONLIST)
+ break;
+ inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
+ }
+
+ return (0);
+}
+
+/*
+ * Mark an inodedep as unlinked and insert it into the in-memory unlinked
+ * list.
+ */
+static void
+unlinked_inodedep(mp, inodedep)
+ struct mount *mp;
+ struct inodedep *inodedep;
+{
+ struct ufsmount *ump;
+
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
+ return;
+ ump = VFSTOUFS(mp);
+ ump->um_fs->fs_fmod = 1;
+ inodedep->id_state |= UNLINKED;
+ TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
+}
+
+/*
+ * Remove an inodedep from the unlinked inodedep list. This may require
+ * disk writes if the inode has made it that far.
+ */
+static void
+clear_unlinked_inodedep(inodedep)
+ struct inodedep *inodedep;
+{
+ struct ufsmount *ump;
+ struct inodedep *idp;
+ struct inodedep *idn;
+ struct fs *fs;
+ struct buf *bp;
+ ino_t ino;
+ ino_t nino;
+ ino_t pino;
+ int error;
+
+ ump = VFSTOUFS(inodedep->id_list.wk_mp);
+ fs = ump->um_fs;
+ ino = inodedep->id_ino;
+ error = 0;
+ for (;;) {
+ /*
+ * If nothing has yet been written simply remove us from
+ * the in memory list and return. This is the most common
+ * case where handle_workitem_remove() loses the final
+ * reference.
+ */
+ if ((inodedep->id_state & UNLINKLINKS) == 0)
+ break;
+ /*
+ * If we have a NEXT pointer and no PREV pointer we can simply
+ * clear NEXT's PREV and remove ourselves from the list. Be
+ * careful not to clear PREV if the superblock points at
+ * next as well.
+ */
+ idn = TAILQ_NEXT(inodedep, id_unlinked);
+ if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
+ if (idn && fs->fs_sujfree != idn->id_ino)
+ idn->id_state &= ~UNLINKPREV;
+ break;
+ }
+ /*
+ * Here we have an inodedep which is actually linked into
+ * the list. We must remove it by forcing a write to the
+ * link before us, whether it be the superblock or an inode.
+ * Unfortunately the list may change while we're waiting
+ * on the buf lock for either resource so we must loop until
+ * we lock the right one. If both the superblock and an
+ * inode point to this inode we must clear the inode first
+ * followed by the superblock.
+ */
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ pino = 0;
+ if (idp && (idp->id_state & UNLINKNEXT))
+ pino = idp->id_ino;
+ FREE_LOCK(&lk);
+ if (pino == 0)
+ bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
+ (int)fs->fs_sbsize, 0, 0, 0);
+ else
+ error = bread(ump->um_devvp,
+ fsbtodb(fs, ino_to_fsba(fs, pino)),
+ (int)fs->fs_bsize, NOCRED, &bp);
+ ACQUIRE_LOCK(&lk);
+ if (error)
+ break;
+ /* If the list has changed restart the loop. */
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ nino = 0;
+ if (idp && (idp->id_state & UNLINKNEXT))
+ nino = idp->id_ino;
+ if (nino != pino ||
+ (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
+ FREE_LOCK(&lk);
+ brelse(bp);
+ ACQUIRE_LOCK(&lk);
+ continue;
+ }
+ /*
+ * Remove us from the in memory list. After this we cannot
+ * access the inodedep.
+ */
+ idn = TAILQ_NEXT(inodedep, id_unlinked);
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
+ /*
+ * Determine the next inode number.
+ */
+ nino = 0;
+ if (idn) {
+ /*
+ * If next isn't on the list we can just clear prev's
+ * state and schedule it to be fixed later. No need
+ * to synchronously write if we're not in the real
+ * list.
+ */
+ if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
+ idp->id_state &= ~UNLINKNEXT;
+ if ((idp->id_state & ONWORKLIST) == 0)
+ WORKLIST_INSERT(&bp->b_dep,
+ &idp->id_list);
+ FREE_LOCK(&lk);
+ bawrite(bp);
+ ACQUIRE_LOCK(&lk);
+ return;
+ }
+ nino = idn->id_ino;
+ }
+ FREE_LOCK(&lk);
+ /*
+ * The predecessor's next pointer is manually updated here
+ * so that the NEXT flag is never cleared for an element
+ * that is in the list.
+ */
+ if (pino == 0) {
+ bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
+ ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
+ softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
+ bp);
+ } else if (fs->fs_magic == FS_UFS1_MAGIC)
+ ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, pino))->di_freelink = nino;
+ else
+ ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, pino))->di_freelink = nino;
+ /*
+ * If the bwrite fails we have no recourse to recover. The
+ * filesystem is corrupted already.
+ */
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ /*
+ * If the superblock pointer still needs to be cleared force
+ * a write here.
+ */
+ if (fs->fs_sujfree == ino) {
+ FREE_LOCK(&lk);
+ bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
+ (int)fs->fs_sbsize, 0, 0, 0);
+ bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
+ ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
+ softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
+ bp);
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ }
+ if (fs->fs_sujfree != ino)
+ return;
+ panic("clear_unlinked_inodedep: Failed to clear free head");
+ }
+ if (inodedep->id_ino == fs->fs_sujfree)
+ panic("clear_unlinked_inodedep: Freeing head of free list");
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
+ return;
+}
+
+/*
* This workitem decrements the inode's link count.
* If the link count reaches zero, the file is removed.
*/
@@ -3584,22 +7580,54 @@ handle_workitem_remove(dirrem, xp)
{
struct thread *td = curthread;
struct inodedep *inodedep;
+ struct workhead dotdotwk;
+ struct worklist *wk;
+ struct ufsmount *ump;
+ struct mount *mp;
struct vnode *vp;
struct inode *ip;
ino_t oldinum;
int error;
+ if (dirrem->dm_state & ONWORKLIST)
+ panic("handle_workitem_remove: dirrem %p still on worklist",
+ dirrem);
+ oldinum = dirrem->dm_oldinum;
+ mp = dirrem->dm_list.wk_mp;
+ ump = VFSTOUFS(mp);
if ((vp = xp) == NULL &&
- (error = ffs_vgetf(dirrem->dm_list.wk_mp,
- dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) {
+ (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
+ FFSV_FORCEINSMQ)) != 0) {
softdep_error("handle_workitem_remove: vget", error);
return;
}
ip = VTOI(vp);
ACQUIRE_LOCK(&lk);
- if ((inodedep_lookup(dirrem->dm_list.wk_mp,
- dirrem->dm_oldinum, 0, &inodedep)) == 0)
+ if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
panic("handle_workitem_remove: lost inodedep");
+ if (dirrem->dm_state & ONDEPLIST)
+ LIST_REMOVE(dirrem, dm_inonext);
+ KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
+ ("handle_workitem_remove: Journal entries not written."));
+
+ /*
+ * Move all dependencies waiting on the remove to complete
+ * from the dirrem to the inode inowait list to be completed
+ * after the inode has been updated and written to disk. Any
+ * marked MKDIR_PARENT are saved to be completed when the .. ref
+ * is removed.
+ */
+ LIST_INIT(&dotdotwk);
+ while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ if (wk->wk_state & MKDIR_PARENT) {
+ wk->wk_state &= ~MKDIR_PARENT;
+ WORKLIST_INSERT(&dotdotwk, wk);
+ continue;
+ }
+ WORKLIST_INSERT(&inodedep->id_inowait, wk);
+ }
+ LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
/*
* Normal file deletion.
*/
@@ -3609,12 +7637,16 @@ handle_workitem_remove(dirrem, xp)
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
+ if (ip->i_nlink == 0)
+ unlinked_inodedep(mp, inodedep);
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
num_dirrem -= 1;
+ KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
+ ("handle_workitem_remove: worklist not empty. %s",
+ TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
WORKITEM_FREE(dirrem, D_DIRREM);
FREE_LOCK(&lk);
- vput(vp);
- return;
+ goto out;
}
/*
* Directory deletion. Decrement reference count for both the
@@ -3628,6 +7660,8 @@ handle_workitem_remove(dirrem, xp)
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad dir delta");
+ if (ip->i_nlink == 0)
+ unlinked_inodedep(mp, inodedep);
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
@@ -3639,36 +7673,47 @@ handle_workitem_remove(dirrem, xp)
* directory should not change. Thus we skip the followup dirrem.
*/
if (dirrem->dm_state & DIRCHG) {
+ KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
+ ("handle_workitem_remove: DIRCHG and worklist not empty."));
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
FREE_LOCK(&lk);
- vput(vp);
- return;
+ goto out;
}
+ dirrem->dm_state = ONDEPLIST;
+ dirrem->dm_oldinum = dirrem->dm_dirinum;
/*
- * If the inodedep does not exist, then the zero'ed inode has
- * been written to disk. If the allocated inode has never been
- * written to disk, then the on-disk inode is zero'ed. In either
- * case we can remove the file immediately.
+ * Place the dirrem on the parent's diremhd list.
*/
- dirrem->dm_state = 0;
- oldinum = dirrem->dm_oldinum;
- dirrem->dm_oldinum = dirrem->dm_dirinum;
- if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
- 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
+ if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
+ panic("handle_workitem_remove: lost dir inodedep");
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
+ /*
+ * If the allocated inode has never been written to disk, then
+ * the on-disk inode is zero'ed and we can remove the file
+ * immediately. When journaling if the inode has been marked
+ * unlinked and not DEPCOMPLETE we know it can never be written.
+ */
+ inodedep_lookup(mp, oldinum, 0, &inodedep);
+ if (inodedep == NULL ||
+ (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
+ check_inode_unwritten(inodedep)) {
if (xp != NULL)
- add_to_worklist(&dirrem->dm_list);
+ add_to_worklist(&dirrem->dm_list, 0);
FREE_LOCK(&lk);
- vput(vp);
- if (xp == NULL)
+ if (xp == NULL) {
+ vput(vp);
handle_workitem_remove(dirrem, NULL);
+ }
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
FREE_LOCK(&lk);
ip->i_flag |= IN_CHANGE;
+out:
ffs_update(vp, 0);
- vput(vp);
+ if (xp == NULL)
+ vput(vp);
}
/*
@@ -3689,6 +7734,7 @@ static void
handle_workitem_freefile(freefile)
struct freefile *freefile;
{
+ struct workhead wkhd;
struct fs *fs;
struct inodedep *idp;
struct ufsmount *ump;
@@ -3701,13 +7747,15 @@ handle_workitem_freefile(freefile)
error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
FREE_LOCK(&lk);
if (error)
- panic("handle_workitem_freefile: inodedep survived");
+ panic("handle_workitem_freefile: inodedep %p survived", idp);
#endif
UFS_LOCK(ump);
fs->fs_pendinginodes -= 1;
UFS_UNLOCK(ump);
+ LIST_INIT(&wkhd);
+ LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
- freefile->fx_oldinum, freefile->fx_mode)) != 0)
+ freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
softdep_error("handle_workitem_freefile", error);
ACQUIRE_LOCK(&lk);
WORKITEM_FREE(freefile, D_FREEFILE);
@@ -3757,8 +7805,10 @@ softdep_disk_io_initiation(bp)
{
struct worklist *wk;
struct worklist marker;
- struct indirdep *indirdep;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
+ struct jfreeblk *jfreeblk;
+ struct newblk *newblk;
/*
* We only care about write operations. There should never
@@ -3767,6 +7817,10 @@ softdep_disk_io_initiation(bp)
if (bp->b_iocmd != BIO_WRITE)
panic("softdep_disk_io_initiation: not write");
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("softdep_disk_io_initiation: Writing buffer with "
+ "background write in progress: %p", bp);
+
marker.wk_type = D_LAST + 1; /* Not a normal workitem */
PHOLD(curproc); /* Don't swap out kernel stack */
@@ -3792,46 +7846,58 @@ softdep_disk_io_initiation(bp)
continue;
case D_INDIRDEP:
- indirdep = WK_INDIRDEP(wk);
- if (indirdep->ir_state & GOINGAWAY)
- panic("disk_io_initiation: indirdep gone");
+ initiate_write_indirdep(WK_INDIRDEP(wk), bp);
+ continue;
+
+ case D_BMSAFEMAP:
+ initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
+ continue;
+
+ case D_JSEG:
+ WK_JSEG(wk)->js_buf = NULL;
+ continue;
+
+ case D_FREEBLKS:
+ freeblks = WK_FREEBLKS(wk);
+ jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
/*
- * If there are no remaining dependencies, this
- * will be writing the real pointers, so the
- * dependency can be freed.
+ * We have to wait for the jfreeblks to be journaled
+ * before we can write an inodeblock with updated
+ * pointers. Be careful to arrange the marker so
+ * we revisit the jfreeblk if it's not removed by
+ * the first jwait().
*/
- if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
- struct buf *bp;
-
- bp = indirdep->ir_savebp;
- bp->b_flags |= B_INVAL | B_NOCACHE;
- /* inline expand WORKLIST_REMOVE(wk); */
- wk->wk_state &= ~ONWORKLIST;
- LIST_REMOVE(wk, wk_list);
- WORKITEM_FREE(indirdep, D_INDIRDEP);
- FREE_LOCK(&lk);
- brelse(bp);
- ACQUIRE_LOCK(&lk);
- continue;
+ if (jfreeblk != NULL) {
+ LIST_REMOVE(&marker, wk_list);
+ LIST_INSERT_BEFORE(wk, &marker, wk_list);
+ jwait(&jfreeblk->jf_list);
}
+ continue;
+ case D_ALLOCDIRECT:
+ case D_ALLOCINDIR:
/*
- * Replace up-to-date version with safe version.
+ * We have to wait for the jnewblk to be journaled
+ * before we can write to a block otherwise the
+ * contents may be confused with an earlier file
+ * at recovery time. Handle the marker as described
+ * above.
*/
- FREE_LOCK(&lk);
- indirdep->ir_saveddata = malloc(bp->b_bcount,
- M_INDIRDEP, M_SOFTDEP_FLAGS);
- ACQUIRE_LOCK(&lk);
- indirdep->ir_state &= ~ATTACHED;
- indirdep->ir_state |= UNDONE;
- bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
- bcopy(indirdep->ir_savebp->b_data, bp->b_data,
- bp->b_bcount);
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk != NULL) {
+ LIST_REMOVE(&marker, wk_list);
+ LIST_INSERT_BEFORE(wk, &marker, wk_list);
+ jwait(&newblk->nb_jnewblk->jn_list);
+ }
+ continue;
+
+ case D_SBDEP:
+ initiate_write_sbdep(WK_SBDEP(wk));
continue;
case D_MKDIR:
- case D_BMSAFEMAP:
- case D_ALLOCDIRECT:
- case D_ALLOCINDIR:
+ case D_FREEWORK:
+ case D_FREEDEP:
+ case D_JSEGDEP:
continue;
default:
@@ -3855,6 +7921,9 @@ initiate_write_filepage(pagedep, bp)
struct pagedep *pagedep;
struct buf *bp;
{
+ struct jremref *jremref;
+ struct jmvref *jmvref;
+ struct dirrem *dirrem;
struct diradd *dap;
struct direct *ep;
int i;
@@ -3869,6 +7938,22 @@ initiate_write_filepage(pagedep, bp)
return;
}
pagedep->pd_state |= IOSTARTED;
+ /*
+ * Wait for all journal remove dependencies to hit the disk.
+ * We can not allow any potentially conflicting directory adds
+ * to be visible before removes and rollback is too difficult.
+ * lk may be dropped and re-acquired, however we hold the buf
+ * locked so the dependency can not go away.
+ */
+ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
+ while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jremref->jr_list);
+ }
+ while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jmvref->jm_list);
+ }
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
ep = (struct direct *)
@@ -3905,6 +7990,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
struct allocdirect *adp, *lastadp;
struct ufs1_dinode *dp;
struct ufs1_dinode *sip;
+ struct inoref *inoref;
struct fs *fs;
ufs_lbn_t i;
#ifdef INVARIANTS
@@ -3918,6 +8004,17 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
fs = inodedep->id_fs;
dp = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
+
+ /*
+ * If we're on the unlinked list but have not yet written our
+ * next pointer initialize it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ dp->di_freelink = inon ? inon->id_ino : 0;
+ }
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
@@ -3933,6 +8030,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*inodedep->id_savedino1 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
dp->di_gen = inodedep->id_savedino1->di_gen;
+ dp->di_freelink = inodedep->id_savedino1->di_freelink;
return;
}
/*
@@ -3940,32 +8038,40 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = 0;
- if (TAILQ_EMPTY(&inodedep->id_inoupdt))
+ inodedep->id_savednlink = dp->di_nlink;
+ if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
+ TAILQ_EMPTY(&inodedep->id_inoreflst))
return;
/*
+ * Revert the link count to that of the first unwritten journal entry.
+ */
+ inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
+ if (inoref)
+ dp->di_nlink = inoref->if_nlink;
+ /*
* Set the dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (adp->ad_lbn < NDADDR &&
- dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (adp->ad_offset < NDADDR &&
+ dp->di_db[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %d != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- dp->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ dp->di_db[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- if (adp->ad_lbn >= NDADDR &&
- dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
+ if (adp->ad_offset >= NDADDR &&
+ dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
panic("%s: indirect pointer #%jd mismatch %d != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn - NDADDR,
- dp->di_ib[adp->ad_lbn - NDADDR],
+ (intmax_t)adp->ad_offset - NDADDR,
+ dp->di_ib[adp->ad_offset - NDADDR],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -3981,14 +8087,14 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- if (adp->ad_lbn >= NDADDR)
+ if (adp->ad_offset >= NDADDR)
break;
- dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
+ dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep1");
@@ -4012,8 +8118,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
@@ -4030,7 +8136,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
- dp->di_ib[adp->ad_lbn - NDADDR] = 0;
+ dp->di_ib[adp->ad_offset - NDADDR] = 0;
}
/*
@@ -4051,6 +8157,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
struct allocdirect *adp, *lastadp;
struct ufs2_dinode *dp;
struct ufs2_dinode *sip;
+ struct inoref *inoref;
struct fs *fs;
ufs_lbn_t i;
#ifdef INVARIANTS
@@ -4064,6 +8171,29 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
fs = inodedep->id_fs;
dp = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
+
+ /*
+ * If we're on the unlinked list but have not yet written our
+	 * next pointer, initialize it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ dp->di_freelink = inon ? inon->id_ino : 0;
+ }
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
+ (UNLINKED | UNLINKNEXT)) {
+ struct inodedep *inon;
+ ino_t freelink;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ freelink = inon ? inon->id_ino : 0;
+ if (freelink != dp->di_freelink)
+ panic("ino %p(0x%X) %d, %d != %d",
+ inodedep, inodedep->id_state, inodedep->id_ino,
+ freelink, dp->di_freelink);
+ }
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
@@ -4079,6 +8209,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*inodedep->id_savedino2 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
dp->di_gen = inodedep->id_savedino2->di_gen;
+ dp->di_freelink = inodedep->id_savedino2->di_freelink;
return;
}
/*
@@ -4086,25 +8217,34 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = dp->di_extsize;
+ inodedep->id_savednlink = dp->di_nlink;
if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
- TAILQ_EMPTY(&inodedep->id_extupdt))
+ TAILQ_EMPTY(&inodedep->id_extupdt) &&
+ TAILQ_EMPTY(&inodedep->id_inoreflst))
return;
/*
+ * Revert the link count to that of the first unwritten journal entry.
+ */
+ inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
+ if (inoref)
+ dp->di_nlink = inoref->if_nlink;
+
+ /*
* Set the ext data dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- (intmax_t)dp->di_extb[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ (intmax_t)dp->di_extb[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -4120,12 +8260,12 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
+ dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NXADDR; i++) {
#ifdef INVARIANTS
if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep1");
@@ -4142,8 +8282,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_extb[i] != 0)
break;
dp->di_extsize = (i + 1) * fs->fs_bsize;
@@ -4154,24 +8294,24 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (adp->ad_lbn < NDADDR &&
- dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (adp->ad_offset < NDADDR &&
+ dp->di_db[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- (intmax_t)dp->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ (intmax_t)dp->di_db[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- if (adp->ad_lbn >= NDADDR &&
- dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
+ if (adp->ad_offset >= NDADDR &&
+ dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
panic("%s indirect pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock:",
- (intmax_t)adp->ad_lbn - NDADDR,
- (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
+ (intmax_t)adp->ad_offset - NDADDR,
+ (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -4187,14 +8327,14 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- if (adp->ad_lbn >= NDADDR)
+ if (adp->ad_offset >= NDADDR)
break;
- dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
+ dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep2");
@@ -4218,8 +8358,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
@@ -4236,7 +8376,355 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
- dp->di_ib[adp->ad_lbn - NDADDR] = 0;
+ dp->di_ib[adp->ad_offset - NDADDR] = 0;
+}
+
+/*
+ * Cancel an indirdep as a result of truncation.  Release all of the
+ * children allocindirs and place their journal work on the appropriate
+ * list.
+ *
+ * indirdep:	the dependency being canceled.
+ * bp:		buffer holding the indirect block's current contents.
+ * inodedep:	inodedep of the inode that owned the indirect block.
+ * freeblks:	the truncation request driving the cancellation.
+ */
+static void
+cancel_indirdep(indirdep, bp, inodedep, freeblks)
+	struct indirdep *indirdep;
+	struct buf *bp;
+	struct inodedep *inodedep;
+	struct freeblks *freeblks;
+{
+	struct allocindir *aip;
+
+	/*
+	 * None of the indirect pointers will ever be visible,
+	 * so they can simply be tossed. GOINGAWAY ensures
+	 * that allocated pointers will be saved in the buffer
+	 * cache until they are freed. Note that they will
+	 * only be able to be found by their physical address
+	 * since the inode mapping the logical address will
+	 * be gone. The save buffer used for the safe copy
+	 * was allocated in setup_allocindir_phase2 using
+	 * the physical address so it could be used for this
+	 * purpose. Hence we swap the safe copy with the real
+	 * copy, allowing the safe copy to be freed and holding
+	 * on to the real copy for later use in indir_trunc.
+	 */
+	if (indirdep->ir_state & GOINGAWAY)
+		panic("cancel_indirdep: already gone");
+	/* Detach from whichever dependency list ONDEPLIST records. */
+	if (indirdep->ir_state & ONDEPLIST) {
+		indirdep->ir_state &= ~ONDEPLIST;
+		LIST_REMOVE(indirdep, ir_next);
+	}
+	indirdep->ir_state |= GOINGAWAY;
+	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
+	/*
+	 * Cancel every allocindir tracked by this indirdep, regardless
+	 * of which stage of completion it had reached.
+	 */
+	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
+		cancel_allocindir(aip, inodedep, freeblks);
+	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
+		cancel_allocindir(aip, inodedep, freeblks);
+	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
+		cancel_allocindir(aip, inodedep, freeblks);
+	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
+		cancel_allocindir(aip, inodedep, freeblks);
+	/*
+	 * Swap in the saved copy: preserve the live pointers in the
+	 * save buffer and move the worklist item onto it.  ir_savebp
+	 * is cleared here; free_indirdep() asserts it is NULL.
+	 */
+	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
+	WORKLIST_REMOVE(&indirdep->ir_list);
+	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
+	indirdep->ir_savebp = NULL;
+}
+
+/*
+ * Free an indirdep once it no longer has new pointers to track.
+ *
+ * The KASSERTs document the caller's obligations: every allocindir
+ * list and the journal work list must already be empty, the save
+ * buffer must have been released, and the indirdep must have been
+ * taken off its dependency list.
+ */
+static void
+free_indirdep(indirdep)
+	struct indirdep *indirdep;
+{
+
+	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
+	    ("free_indirdep: Journal work not empty."));
+	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
+	    ("free_indirdep: Complete head not empty."));
+	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
+	    ("free_indirdep: write head not empty."));
+	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
+	    ("free_indirdep: done head not empty."));
+	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
+	    ("free_indirdep: deplist head not empty."));
+	KASSERT(indirdep->ir_savebp == NULL,
+	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
+	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
+	    ("free_indirdep: %p still on deplist.", indirdep));
+	/* Detach from the buf's dependency list before freeing. */
+	if (indirdep->ir_state & ONWORKLIST)
+		WORKLIST_REMOVE(&indirdep->ir_list);
+	WORKITEM_FREE(indirdep, D_INDIRDEP);
+}
+
+/*
+ * Called before a write to an indirdep. This routine is responsible for
+ * rolling back pointers to a safe state which includes only those
+ * allocindirs which have been completed.
+ */
+static void
+initiate_write_indirdep(indirdep, bp)
+	struct indirdep *indirdep;
+	struct buf *bp;
+{
+
+	if (indirdep->ir_state & GOINGAWAY)
+		panic("disk_io_initiation: indirdep gone");
+
+	/*
+	 * If there are no remaining dependencies, this will be writing
+	 * the real pointers.
+	 */
+	if (LIST_EMPTY(&indirdep->ir_deplisthd))
+		return;
+	/*
+	 * Replace up-to-date version with safe version.
+	 */
+	/* lk is dropped around the allocation; malloc may sleep. */
+	FREE_LOCK(&lk);
+	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
+	    M_SOFTDEP_FLAGS);
+	ACQUIRE_LOCK(&lk);
+	/* Mark the rollback so the completion path knows to undo it. */
+	indirdep->ir_state &= ~ATTACHED;
+	indirdep->ir_state |= UNDONE;
+	/*
+	 * Stash the live pointers in ir_saveddata, then write the safe
+	 * copy from the save buffer in their place.
+	 */
+	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
+	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
+	    bp->b_bcount);
+}
+
+/*
+ * Called when an inode has been cleared in a cg bitmap.  This finally
+ * eliminates any canceled jaddrefs.
+ *
+ * mp:	the mount owning the cylinder group.
+ * bp:	the locked buffer holding the cg block.
+ * ino:	the inode number just cleared in the bitmap.
+ * wkhd: optional list of dependencies awaiting the bitmap write.
+ */
+void
+softdep_setup_inofree(mp, bp, ino, wkhd)
+	struct mount *mp;
+	struct buf *bp;
+	ino_t ino;
+	struct workhead *wkhd;
+{
+	struct worklist *wk, *wkn;
+	struct inodedep *inodedep;
+	uint8_t *inosused;
+	struct cg *cgp;
+	struct fs *fs;
+
+	ACQUIRE_LOCK(&lk);
+	fs = VFSTOUFS(mp)->um_fs;
+	cgp = (struct cg *)bp->b_data;
+	inosused = cg_inosused(cgp);
+	/* ino_t is unsigned; use %ju with an explicit widening cast. */
+	if (isset(inosused, ino % fs->fs_ipg))
+		panic("softdep_setup_inofree: inode %ju not freed.",
+		    (uintmax_t)ino);
+	if (inodedep_lookup(mp, ino, 0, &inodedep))
+		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
+		    (uintmax_t)ino, inodedep);
+	if (wkhd) {
+		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
+			if (wk->wk_type != D_JADDREF)
+				continue;
+			WORKLIST_REMOVE(wk);
+			/*
+			 * We can free immediately even if the jaddref
+			 * isn't attached in a background write as now
+			 * the bitmaps are reconciled.
+			 */
+			wk->wk_state |= COMPLETE | ATTACHED;
+			free_jaddref(WK_JADDREF(wk));
+		}
+		/* Remaining work waits for this bitmap write to finish. */
+		jwork_move(&bp->b_dep, wkhd);
+	}
+	FREE_LOCK(&lk);
+}
+
+
+/*
+ * Called via ffs_blkfree() after a set of frags has been cleared from a cg
+ * map.  Any dependencies waiting for the write to clear are added to the
+ * buf's list and any jnewblks that are being canceled are discarded
+ * immediately.
+ *
+ * mp:	the mount owning the cylinder group.
+ * bp:	the locked buffer holding the cg block.
+ * blkno: first fragment freed.
+ * frags: number of fragments freed.
+ * wkhd: optional list of dependencies awaiting the bitmap write.
+ */
+void
+softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
+	struct mount *mp;
+	struct buf *bp;
+	ufs2_daddr_t blkno;
+	int frags;
+	struct workhead *wkhd;
+{
+	struct jnewblk *jnewblk;
+	struct worklist *wk, *wkn;
+#ifdef SUJ_DEBUG
+	struct bmsafemap *bmsafemap;
+	struct fs *fs;
+	uint8_t *blksfree;
+	struct cg *cgp;
+	ufs2_daddr_t jstart;
+	ufs2_daddr_t jend;
+	ufs2_daddr_t end;
+	long bno;
+	int i;
+#endif
+
+	ACQUIRE_LOCK(&lk);
+	/*
+	 * Detach any jnewblks which have been canceled.  They must linger
+	 * until the bitmap is cleared again by ffs_blkfree() to prevent
+	 * an unjournaled allocation from hitting the disk.
+	 */
+	if (wkhd) {
+		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
+			if (wk->wk_type != D_JNEWBLK)
+				continue;
+			jnewblk = WK_JNEWBLK(wk);
+			KASSERT(jnewblk->jn_state & GOINGAWAY,
+			    ("softdep_setup_blkfree: jnewblk not canceled."));
+			WORKLIST_REMOVE(wk);
+#ifdef SUJ_DEBUG
+			/*
+			 * Assert that this block is free in the bitmap
+			 * before we discard the jnewblk.
+			 */
+			fs = VFSTOUFS(mp)->um_fs;
+			cgp = (struct cg *)bp->b_data;
+			blksfree = cg_blksfree(cgp);
+			bno = dtogd(fs, jnewblk->jn_blkno);
+			for (i = jnewblk->jn_oldfrags;
+			    i < jnewblk->jn_frags; i++) {
+				if (isset(blksfree, bno + i))
+					continue;
+				panic("softdep_setup_blkfree: not free");
+			}
+#endif
+			/*
+			 * Even if it's not attached we can free immediately
+			 * as the new bitmap is correct.
+			 */
+			wk->wk_state |= COMPLETE | ATTACHED;
+			free_jnewblk(jnewblk);
+		}
+		/*
+		 * The buf must be locked by the caller otherwise these could
+		 * be added while it's being written and the write would
+		 * complete them before they made it to disk.
+		 */
+		jwork_move(&bp->b_dep, wkhd);
+	}
+
+#ifdef SUJ_DEBUG
+	/*
+	 * Assert that we are not freeing a block which has an outstanding
+	 * allocation dependency.
+	 */
+	fs = VFSTOUFS(mp)->um_fs;
+	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
+	end = blkno + frags;
+	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
+		/*
+		 * Don't match against blocks that will be freed when the
+		 * background write is done.
+		 */
+		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
+		    (COMPLETE | DEPCOMPLETE))
+			continue;
+		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
+		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
+		if ((blkno >= jstart && blkno < jend) ||
+		    (end > jstart && end <= jend)) {
+			printf("state 0x%X %jd - %d %d dep %p\n",
+			    jnewblk->jn_state, jnewblk->jn_blkno,
+			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
+			    jnewblk->jn_newblk);
+			panic("softdep_setup_blkfree: "
+			    "%jd-%jd(%d) overlaps with %jd-%jd",
+			    blkno, end, frags, jstart, jend);
+		}
+	}
+#endif
+	FREE_LOCK(&lk);
+}
+
+/*
+ * Called before a cg block is written.  Roll back any inode and block
+ * allocations whose journal entries have not yet been written so an
+ * unjournaled allocation never reaches the disk, and move the
+ * dependency lists to their written counterparts.
+ */
+static void
+initiate_write_bmsafemap(bmsafemap, bp)
+	struct bmsafemap *bmsafemap;
+	struct buf *bp;			/* The cg block. */
+{
+	struct jaddref *jaddref;
+	struct jnewblk *jnewblk;
+	uint8_t *inosused;
+	uint8_t *blksfree;
+	struct cg *cgp;
+	struct fs *fs;
+	int cleared;
+	ino_t ino;
+	long bno;
+	int i;
+
+	/* panic(9) appends its own newline; the message must not. */
+	if (bmsafemap->sm_state & IOSTARTED)
+		panic("initiate_write_bmsafemap: Already started");
+	bmsafemap->sm_state |= IOSTARTED;
+	/*
+	 * Clear any inode allocations which are pending journal writes.
+	 */
+	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
+		cgp = (struct cg *)bp->b_data;
+		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+		inosused = cg_inosused(cgp);
+		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
+			ino = jaddref->ja_ino % fs->fs_ipg;
+			/*
+			 * If this is a background copy the inode may not
+			 * be marked used yet.
+			 */
+			if (isset(inosused, ino)) {
+				if ((jaddref->ja_mode & IFMT) == IFDIR)
+					cgp->cg_cs.cs_ndir--;
+				cgp->cg_cs.cs_nifree++;
+				clrbit(inosused, ino);
+				jaddref->ja_state &= ~ATTACHED;
+				jaddref->ja_state |= UNDONE;
+				stat_jaddref++;
+			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
+				panic("initiate_write_bmsafemap: inode %ju "
+				    "marked free", (uintmax_t)jaddref->ja_ino);
+		}
+	}
+	/*
+	 * Clear any block allocations which are pending journal writes.
+	 */
+	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
+		cgp = (struct cg *)bp->b_data;
+		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+		blksfree = cg_blksfree(cgp);
+		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
+			bno = dtogd(fs, jnewblk->jn_blkno);
+			cleared = 0;
+			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+			    i++) {
+				if (isclr(blksfree, bno + i)) {
+					cleared = 1;
+					setbit(blksfree, bno + i);
+				}
+			}
+			/*
+			 * We may not clear the block if it's a background
+			 * copy.  In that case there is no reason to detach
+			 * it.
+			 */
+			if (cleared) {
+				stat_jnewblk++;
+				jnewblk->jn_state &= ~ATTACHED;
+				jnewblk->jn_state |= UNDONE;
+			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
+				panic("initiate_write_bmsafemap: block %jd "
+				    "marked free",
+				    (intmax_t)jnewblk->jn_blkno);
+		}
+	}
+	/*
+	 * Move allocation lists to the written lists so they can be
+	 * cleared once the block write is complete.
+	 */
+	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
+	    inodedep, id_deps);
+	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
+	    newblk, nb_deps);
}
/*
@@ -4246,6 +8734,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* a request completion). It should be called early in this
* procedure, before the block is made available to other
* processes or other routines are called.
+ *
*/
static void
softdep_disk_write_complete(bp)
@@ -4254,12 +8743,7 @@ softdep_disk_write_complete(bp)
struct worklist *wk;
struct worklist *owk;
struct workhead reattach;
- struct newblk *newblk;
- struct allocindir *aip;
- struct allocdirect *adp;
- struct indirdep *indirdep;
- struct inodedep *inodedep;
- struct bmsafemap *bmsafemap;
+ struct buf *sbp;
/*
* If an error occurred while doing the write, then the data
@@ -4271,8 +8755,9 @@ softdep_disk_write_complete(bp)
/*
* This lock must not be released anywhere in this code segment.
*/
- ACQUIRE_LOCK(&lk);
+ sbp = NULL;
owk = NULL;
+ ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
WORKLIST_REMOVE(wk);
if (wk == owk)
@@ -4291,33 +8776,8 @@ softdep_disk_write_complete(bp)
continue;
case D_BMSAFEMAP:
- bmsafemap = WK_BMSAFEMAP(wk);
- while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
- newblk->nb_state |= DEPCOMPLETE;
- newblk->nb_bmsafemap = NULL;
- LIST_REMOVE(newblk, nb_deps);
- }
- while ((adp =
- LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- LIST_REMOVE(adp, ad_deps);
- handle_allocdirect_partdone(adp);
- }
- while ((aip =
- LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
- aip->ai_state |= DEPCOMPLETE;
- aip->ai_buf = NULL;
- LIST_REMOVE(aip, ai_deps);
- handle_allocindir_partdone(aip);
- }
- while ((inodedep =
- LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
- inodedep->id_state |= DEPCOMPLETE;
- LIST_REMOVE(inodedep, id_deps);
- inodedep->id_buf = NULL;
- }
- WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
+ WORKLIST_INSERT(&reattach, wk);
continue;
case D_MKDIR:
@@ -4325,35 +8785,45 @@ softdep_disk_write_complete(bp)
continue;
case D_ALLOCDIRECT:
- adp = WK_ALLOCDIRECT(wk);
- adp->ad_state |= COMPLETE;
- handle_allocdirect_partdone(adp);
+ wk->wk_state |= COMPLETE;
+ handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
continue;
case D_ALLOCINDIR:
- aip = WK_ALLOCINDIR(wk);
- aip->ai_state |= COMPLETE;
- handle_allocindir_partdone(aip);
+ wk->wk_state |= COMPLETE;
+ handle_allocindir_partdone(WK_ALLOCINDIR(wk));
continue;
case D_INDIRDEP:
- indirdep = WK_INDIRDEP(wk);
- if (indirdep->ir_state & GOINGAWAY)
- panic("disk_write_complete: indirdep gone");
- bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
- free(indirdep->ir_saveddata, M_INDIRDEP);
- indirdep->ir_saveddata = 0;
- indirdep->ir_state &= ~UNDONE;
- indirdep->ir_state |= ATTACHED;
- while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
- handle_allocindir_partdone(aip);
- if (aip == LIST_FIRST(&indirdep->ir_donehd))
- panic("disk_write_complete: not gone");
- }
- WORKLIST_INSERT(&reattach, wk);
- if ((bp->b_flags & B_DELWRI) == 0)
- stat_indir_blk_ptrs++;
- bdirty(bp);
+ if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
+ WORKLIST_INSERT(&reattach, wk);
+ continue;
+
+ case D_FREEBLKS:
+ wk->wk_state |= COMPLETE;
+ if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(wk, 1);
+ continue;
+
+ case D_FREEWORK:
+ handle_written_freework(WK_FREEWORK(wk));
+ break;
+
+ case D_FREEDEP:
+ free_freedep(WK_FREEDEP(wk));
+ continue;
+
+ case D_JSEGDEP:
+ free_jsegdep(WK_JSEGDEP(wk));
+ continue;
+
+ case D_JSEG:
+ handle_written_jseg(WK_JSEG(wk), bp);
+ continue;
+
+ case D_SBDEP:
+ if (handle_written_sbdep(WK_SBDEP(wk), bp))
+ WORKLIST_INSERT(&reattach, wk);
continue;
default:
@@ -4370,6 +8840,8 @@ softdep_disk_write_complete(bp)
WORKLIST_INSERT(&bp->b_dep, wk);
}
FREE_LOCK(&lk);
+ if (sbp)
+ brelse(sbp);
}
/*
@@ -4378,18 +8850,17 @@ softdep_disk_write_complete(bp)
* splbio interrupts blocked.
*/
static void
-handle_allocdirect_partdone(adp)
+handle_allocdirect_partdone(adp, wkhd)
struct allocdirect *adp; /* the completed allocdirect */
+	struct workhead *wkhd;	/* Work to do when inode is written. */
{
struct allocdirectlst *listhead;
struct allocdirect *listadp;
struct inodedep *inodedep;
- long bsize, delay;
+ long bsize;
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- if (adp->ad_buf != NULL)
- panic("handle_allocdirect_partdone: dangling dep");
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
@@ -4439,25 +8910,27 @@ handle_allocdirect_partdone(adp)
return;
}
/*
- * If we have found the just finished dependency, then free
+ * If we have found the just finished dependency, then queue
* it along with anything that follows it that is complete.
- * If the inode still has a bitmap dependency, then it has
- * never been written to disk, hence the on-disk inode cannot
- * reference the old fragment so we can free it without delay.
+ * Since the pointer has not yet been written in the inode
+ * as the dependency prevents it, place the allocdirect on the
+ * bufwait list where it will be freed once the pointer is
+ * valid.
*/
- delay = (inodedep->id_state & DEPCOMPLETE);
+ if (wkhd == NULL)
+ wkhd = &inodedep->id_bufwait;
for (; adp; adp = listadp) {
listadp = TAILQ_NEXT(adp, ad_next);
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- free_allocdirect(listhead, adp, delay);
+ TAILQ_REMOVE(listhead, adp, ad_next);
+ WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
}
}
/*
- * Called from within softdep_disk_write_complete above. Note that
- * this routine is always called from interrupt level with further
- * splbio interrupts blocked.
+ * Called from within softdep_disk_write_complete above. This routine
+ * completes successfully written allocindirs.
*/
static void
handle_allocindir_partdone(aip)
@@ -4467,11 +8940,9 @@ handle_allocindir_partdone(aip)
if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- if (aip->ai_buf != NULL)
- panic("handle_allocindir_partdone: dangling dependency");
indirdep = aip->ai_indirdep;
+ LIST_REMOVE(aip, ai_next);
if (indirdep->ir_state & UNDONE) {
- LIST_REMOVE(aip, ai_next);
LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
return;
}
@@ -4481,13 +8952,130 @@ handle_allocindir_partdone(aip)
else
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
aip->ai_newblkno;
- LIST_REMOVE(aip, ai_next);
- if (aip->ai_freefrag != NULL)
- add_to_worklist(&aip->ai_freefrag->ff_list);
- WORKITEM_FREE(aip, D_ALLOCINDIR);
+ /*
+ * Await the pointer write before freeing the allocindir.
+ */
+ LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
}
/*
+ * Release segments held on a jwork list.
+ */
+static void
+handle_jwork(wkhd)
+	struct workhead *wkhd;
+{
+	struct worklist *wk;
+
+	/* Only jsegdeps are expected here; anything else is a bug. */
+	while ((wk = LIST_FIRST(wkhd)) != NULL) {
+		WORKLIST_REMOVE(wk);
+		switch (wk->wk_type) {
+		case D_JSEGDEP:
+			free_jsegdep(WK_JSEGDEP(wk));
+			continue;
+		default:
+			/* panic(9) appends the newline itself. */
+			panic("handle_jwork: Unknown type %s",
+			    TYPENAME(wk->wk_type));
+		}
+	}
+}
+
+/*
+ * Handle the bufwait list on an inode when it is safe to release items
+ * held there.  This normally happens after an inode block is written but
+ * may be delayed and handled later if there are pending journal items that
+ * are not yet safe to be released.
+ *
+ * When refhd is non-NULL, jsegdeps and jaddrefs are transferred to it
+ * instead of being freed, delaying their release until the cg bitmap
+ * write (see the per-case comments below).  Returns the deferred
+ * freefile, if any, for the caller to queue last; NULL otherwise.
+ */
+static struct freefile *
+handle_bufwait(inodedep, refhd)
+	struct inodedep *inodedep;
+	struct workhead *refhd;
+{
+	struct jaddref *jaddref;
+	struct freefile *freefile;
+	struct worklist *wk;
+
+	freefile = NULL;
+	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
+		WORKLIST_REMOVE(wk);
+		switch (wk->wk_type) {
+		case D_FREEFILE:
+			/*
+			 * We defer adding freefile to the worklist
+			 * until all other additions have been made to
+			 * ensure that it will be done after all the
+			 * old blocks have been freed.
+			 */
+			if (freefile != NULL)
+				panic("handle_bufwait: freefile");
+			freefile = WK_FREEFILE(wk);
+			continue;
+
+		case D_MKDIR:
+			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
+			continue;
+
+		case D_DIRADD:
+			diradd_inode_written(WK_DIRADD(wk), inodedep);
+			continue;
+
+		case D_FREEFRAG:
+			wk->wk_state |= COMPLETE;
+			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
+				add_to_worklist(wk, 0);
+			continue;
+
+		case D_DIRREM:
+			wk->wk_state |= COMPLETE;
+			add_to_worklist(wk, 0);
+			continue;
+
+		case D_ALLOCDIRECT:
+		case D_ALLOCINDIR:
+			free_newblk(WK_NEWBLK(wk));
+			continue;
+
+		case D_JNEWBLK:
+			wk->wk_state |= COMPLETE;
+			free_jnewblk(WK_JNEWBLK(wk));
+			continue;
+
+		/*
+		 * Save freed journal segments and add references on
+		 * the supplied list which will delay their release
+		 * until the cg bitmap is cleared on disk.
+		 */
+		case D_JSEGDEP:
+			if (refhd == NULL)
+				free_jsegdep(WK_JSEGDEP(wk));
+			else
+				WORKLIST_INSERT(refhd, wk);
+			continue;
+
+		case D_JADDREF:
+			jaddref = WK_JADDREF(wk);
+			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
+			    if_deps);
+			/*
+			 * Transfer any jaddrefs to the list to be freed with
+			 * the bitmap if we're handling a removed file.
+			 */
+			if (refhd == NULL) {
+				wk->wk_state |= COMPLETE;
+				free_jaddref(jaddref);
+			} else
+				WORKLIST_INSERT(refhd, wk);
+			continue;
+
+		default:
+			panic("handle_bufwait: Unknown type %p(%s)",
+			    wk, TYPENAME(wk->wk_type));
+			/* NOTREACHED */
+		}
+	}
+	return (freefile);
+}
+/*
* Called from within softdep_disk_write_complete above to restore
* in-memory inode block contents to their most up-to-date state. Note
* that this routine is always called from interrupt level with further
@@ -4498,12 +9086,17 @@ handle_written_inodeblock(inodedep, bp)
struct inodedep *inodedep;
struct buf *bp; /* buffer containing the inode block */
{
- struct worklist *wk, *filefree;
+ struct freefile *freefile;
struct allocdirect *adp, *nextadp;
struct ufs1_dinode *dp1 = NULL;
struct ufs2_dinode *dp2 = NULL;
+ struct workhead wkhd;
int hadchanges, fstype;
+ ino_t freelink;
+ LIST_INIT(&wkhd);
+ hadchanges = 0;
+ freefile = NULL;
if ((inodedep->id_state & IOSTARTED) == 0)
panic("handle_written_inodeblock: not started");
inodedep->id_state &= ~IOSTARTED;
@@ -4511,11 +9104,32 @@ handle_written_inodeblock(inodedep, bp)
fstype = UFS1;
dp1 = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
+ freelink = dp1->di_freelink;
} else {
fstype = UFS2;
dp2 = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
+ freelink = dp2->di_freelink;
+ }
+ /*
+ * If we wrote a valid freelink pointer during the last write
+ * record it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ if ((inon == NULL && freelink == 0) ||
+ (inon && inon->id_ino == freelink)) {
+ if (inon)
+ inon->id_state |= UNLINKPREV;
+ inodedep->id_state |= UNLINKNEXT;
+ } else
+ hadchanges = 1;
}
+ /* Leave this inodeblock dirty until it's in the list. */
+ if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED)
+ hadchanges = 1;
/*
* If we had to rollback the inode allocation because of
* bitmaps being incomplete, then simply restore it.
@@ -4524,6 +9138,7 @@ handle_written_inodeblock(inodedep, bp)
* corresponding updates written to disk.
*/
if (inodedep->id_savedino1 != NULL) {
+ hadchanges = 1;
if (fstype == UFS1)
*dp1 = *inodedep->id_savedino1;
else
@@ -4533,6 +9148,13 @@ handle_written_inodeblock(inodedep, bp)
if ((bp->b_flags & B_DELWRI) == 0)
stat_inode_bitmap++;
bdirty(bp);
+ /*
+ * If the inode is clear here and GOINGAWAY it will never
+ * be written. Process the bufwait and clear any pending
+ * work which may include the freefile.
+ */
+ if (inodedep->id_state & GOINGAWAY)
+ goto bufwait;
return (1);
}
inodedep->id_state |= COMPLETE;
@@ -4540,50 +9162,49 @@ handle_written_inodeblock(inodedep, bp)
* Roll forward anything that had to be rolled back before
* the inode could be updated.
*/
- hadchanges = 0;
for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED)
panic("handle_written_inodeblock: new entry");
if (fstype == UFS1) {
- if (adp->ad_lbn < NDADDR) {
- if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
+ if (adp->ad_offset < NDADDR) {
+ if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
panic("%s %s #%jd mismatch %d != %jd",
"handle_written_inodeblock:",
"direct pointer",
- (intmax_t)adp->ad_lbn,
- dp1->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ dp1->di_db[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
+ dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
} else {
- if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
+ if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
panic("%s: %s #%jd allocated as %d",
"handle_written_inodeblock",
"indirect pointer",
- (intmax_t)adp->ad_lbn - NDADDR,
- dp1->di_ib[adp->ad_lbn - NDADDR]);
- dp1->di_ib[adp->ad_lbn - NDADDR] =
+ (intmax_t)adp->ad_offset - NDADDR,
+ dp1->di_ib[adp->ad_offset - NDADDR]);
+ dp1->di_ib[adp->ad_offset - NDADDR] =
adp->ad_newblkno;
}
} else {
- if (adp->ad_lbn < NDADDR) {
- if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
+ if (adp->ad_offset < NDADDR) {
+ if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
panic("%s: %s #%jd %s %jd != %jd",
"handle_written_inodeblock",
"direct pointer",
- (intmax_t)adp->ad_lbn, "mismatch",
- (intmax_t)dp2->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset, "mismatch",
+ (intmax_t)dp2->di_db[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
+ dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
} else {
- if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
+ if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
panic("%s: %s #%jd allocated as %jd",
"handle_written_inodeblock",
"indirect pointer",
- (intmax_t)adp->ad_lbn - NDADDR,
+ (intmax_t)adp->ad_offset - NDADDR,
(intmax_t)
- dp2->di_ib[adp->ad_lbn - NDADDR]);
- dp2->di_ib[adp->ad_lbn - NDADDR] =
+ dp2->di_ib[adp->ad_offset - NDADDR]);
+ dp2->di_ib[adp->ad_offset - NDADDR] =
adp->ad_newblkno;
}
}
@@ -4595,13 +9216,13 @@ handle_written_inodeblock(inodedep, bp)
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED)
panic("handle_written_inodeblock: new entry");
- if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
+ if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
panic("%s: direct pointers #%jd %s %jd != %jd",
"handle_written_inodeblock",
- (intmax_t)adp->ad_lbn, "mismatch",
- (intmax_t)dp2->di_extb[adp->ad_lbn],
+ (intmax_t)adp->ad_offset, "mismatch",
+ (intmax_t)dp2->di_extb[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
+ dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
adp->ad_state &= ~UNDONE;
adp->ad_state |= ATTACHED;
hadchanges = 1;
@@ -4613,12 +9234,23 @@ handle_written_inodeblock(inodedep, bp)
*/
if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
panic("handle_written_inodeblock: bad size");
+ if (inodedep->id_savednlink > LINK_MAX)
+ panic("handle_written_inodeblock: Invalid link count "
+ "%d for inodedep %p", inodedep->id_savednlink, inodedep);
if (fstype == UFS1) {
+ if (dp1->di_nlink != inodedep->id_savednlink) {
+ dp1->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
if (dp1->di_size != inodedep->id_savedsize) {
dp1->di_size = inodedep->id_savedsize;
hadchanges = 1;
}
} else {
+ if (dp2->di_nlink != inodedep->id_savednlink) {
+ dp2->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
if (dp2->di_size != inodedep->id_savedsize) {
dp2->di_size = inodedep->id_savedsize;
hadchanges = 1;
@@ -4630,6 +9262,7 @@ handle_written_inodeblock(inodedep, bp)
}
inodedep->id_savedsize = -1;
inodedep->id_savedextsize = -1;
+ inodedep->id_savednlink = -1;
/*
* If there were any rollbacks in the inode block, then it must be
* marked dirty so that its will eventually get written back in
@@ -4637,69 +9270,49 @@ handle_written_inodeblock(inodedep, bp)
*/
if (hadchanges)
bdirty(bp);
+bufwait:
/*
* Process any allocdirects that completed during the update.
*/
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
- handle_allocdirect_partdone(adp);
+ handle_allocdirect_partdone(adp, &wkhd);
if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
- handle_allocdirect_partdone(adp);
+ handle_allocdirect_partdone(adp, &wkhd);
/*
* Process deallocations that were held pending until the
* inode had been written to disk. Freeing of the inode
* is delayed until after all blocks have been freed to
* avoid creation of new <vfsid, inum, lbn> triples
- * before the old ones have been deleted.
+ * before the old ones have been deleted. Completely
+ * unlinked inodes are not processed until the unlinked
+ * inode list is written or the last reference is removed.
*/
- filefree = NULL;
- while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
- WORKLIST_REMOVE(wk);
- switch (wk->wk_type) {
-
- case D_FREEFILE:
- /*
- * We defer adding filefree to the worklist until
- * all other additions have been made to ensure
- * that it will be done after all the old blocks
- * have been freed.
- */
- if (filefree != NULL)
- panic("handle_written_inodeblock: filefree");
- filefree = wk;
- continue;
-
- case D_MKDIR:
- handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
- continue;
-
- case D_DIRADD:
- diradd_inode_written(WK_DIRADD(wk), inodedep);
- continue;
-
- case D_FREEBLKS:
- wk->wk_state |= COMPLETE;
- if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
- continue;
- /* -- fall through -- */
- case D_FREEFRAG:
- case D_DIRREM:
- add_to_worklist(wk);
- continue;
-
- case D_NEWDIRBLK:
- free_newdirblk(WK_NEWDIRBLK(wk));
- continue;
-
- default:
- panic("handle_written_inodeblock: Unknown type %s",
- TYPENAME(wk->wk_type));
- /* NOTREACHED */
+ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
+ freefile = handle_bufwait(inodedep, NULL);
+ if (freefile && !LIST_EMPTY(&wkhd)) {
+ WORKLIST_INSERT(&wkhd, &freefile->fx_list);
+ freefile = NULL;
}
}
- if (filefree != NULL) {
+ /*
+ * Move rolled forward dependency completions to the bufwait list
+ * now that those that were already written have been processed.
+ */
+ if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
+ panic("handle_written_inodeblock: bufwait but no changes");
+ jwork_move(&inodedep->id_bufwait, &wkhd);
+
+ if (freefile != NULL) {
+ /*
+ * If the inode is goingaway it was never written. Fake up
+ * the state here so free_inodedep() can succeed.
+ */
+ if (inodedep->id_state & GOINGAWAY)
+ inodedep->id_state |= COMPLETE | DEPCOMPLETE;
if (free_inodedep(inodedep) == 0)
- panic("handle_written_inodeblock: live inodedep");
- add_to_worklist(filefree);
+ panic("handle_written_inodeblock: live inodedep %p",
+ inodedep);
+ add_to_worklist(&freefile->fx_list, 0);
return (0);
}
@@ -4707,12 +9320,101 @@ handle_written_inodeblock(inodedep, bp)
* If no outstanding dependencies, free it.
*/
if (free_inodedep(inodedep) ||
- (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
- TAILQ_FIRST(&inodedep->id_extupdt) == 0))
+ (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
+ TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
+ TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
+ LIST_FIRST(&inodedep->id_bufwait) == 0))
return (0);
return (hadchanges);
}
+static int
+handle_written_indirdep(indirdep, bp, bpp)
+ struct indirdep *indirdep;
+ struct buf *bp;
+ struct buf **bpp;
+{
+ struct allocindir *aip;
+ int chgs;
+
+ if (indirdep->ir_state & GOINGAWAY)
+ panic("disk_write_complete: indirdep gone");
+ chgs = 0;
+ /*
+ * If there were rollbacks revert them here.
+ */
+ if (indirdep->ir_saveddata) {
+ bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
+ free(indirdep->ir_saveddata, M_INDIRDEP);
+ indirdep->ir_saveddata = 0;
+ chgs = 1;
+ }
+ indirdep->ir_state &= ~UNDONE;
+ indirdep->ir_state |= ATTACHED;
+ /*
+ * Move allocindirs with written pointers to the completehd if
+ * the the indirdep's pointer is not yet written. Otherwise
+ * free them here.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
+ LIST_REMOVE(aip, ai_next);
+ if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
+ LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
+ ai_next);
+ continue;
+ }
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * Move allocindirs that have finished dependency processing from
+ * the done list to the write list after updating the pointers.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
+ handle_allocindir_partdone(aip);
+ if (aip == LIST_FIRST(&indirdep->ir_donehd))
+ panic("disk_write_complete: not gone");
+ chgs = 1;
+ }
+ /*
+ * If this indirdep has been detached from its newblk during
+ * I/O we need to keep this dep attached to the buffer so
+ * deallocate_dependencies can find it and properly resolve
+ * any outstanding dependencies.
+ */
+ if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
+ chgs = 1;
+ if ((bp->b_flags & B_DELWRI) == 0)
+ stat_indir_blk_ptrs++;
+ /*
+ * If there were no changes we can discard the savedbp and detach
+ * ourselves from the buf. We are only carrying completed pointers
+ * in this case.
+ */
+ if (chgs == 0) {
+ struct buf *sbp;
+
+ sbp = indirdep->ir_savebp;
+ sbp->b_flags |= B_INVAL | B_NOCACHE;
+ indirdep->ir_savebp = NULL;
+ if (*bpp != NULL)
+ panic("handle_written_indirdep: bp already exists.");
+ *bpp = sbp;
+ } else
+ bdirty(bp);
+ /*
+ * If there are no fresh dependencies and none waiting on writes
+ * we can free the indirdep.
+ */
+ if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
+ if (indirdep->ir_state & ONDEPLIST)
+ LIST_REMOVE(indirdep, ir_next);
+ free_indirdep(indirdep);
+ return (0);
+ }
+
+ return (chgs);
+}
+
/*
* Process a diradd entry after its dependent inode has been written.
* This routine must be called with splbio interrupts blocked.
@@ -4722,50 +9424,200 @@ diradd_inode_written(dap, inodedep)
struct diradd *dap;
struct inodedep *inodedep;
{
- struct pagedep *pagedep;
dap->da_state |= COMPLETE;
- if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
- if (dap->da_state & DIRCHG)
- pagedep = dap->da_previous->dm_pagedep;
- else
- pagedep = dap->da_pagedep;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
- }
+ complete_diradd(dap);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
- * Handle the completion of a mkdir dependency.
+ * Returns true if the bmsafemap will have rollbacks when written. Must
+ * only be called with lk and the buf lock on the cg held.
+ */
+static int
+bmsafemap_rollbacks(bmsafemap)
+ struct bmsafemap *bmsafemap;
+{
+
+ return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
+ !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
+}
+
+/*
+ * Complete a write to a bmsafemap structure. Roll forward any bitmap
+ * changes if it's not a background write. Set all written dependencies
+ * to DEPCOMPLETE and free the structure if possible.
+ */
+static int
+handle_written_bmsafemap(bmsafemap, bp)
+ struct bmsafemap *bmsafemap;
+ struct buf *bp;
+{
+ struct newblk *newblk;
+ struct inodedep *inodedep;
+ struct jaddref *jaddref, *jatmp;
+ struct jnewblk *jnewblk, *jntmp;
+ uint8_t *inosused;
+ uint8_t *blksfree;
+ struct cg *cgp;
+ struct fs *fs;
+ ino_t ino;
+ long bno;
+ int chgs;
+ int i;
+
+ if ((bmsafemap->sm_state & IOSTARTED) == 0)
+ panic("initiate_write_bmsafemap: Not started\n");
+ chgs = 0;
+ bmsafemap->sm_state &= ~IOSTARTED;
+ /*
+ * Restore unwritten inode allocation pending jaddref writes.
+ */
+ if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ inosused = cg_inosused(cgp);
+ LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
+ ja_bmdeps, jatmp) {
+ if ((jaddref->ja_state & UNDONE) == 0)
+ continue;
+ ino = jaddref->ja_ino % fs->fs_ipg;
+ if (isset(inosused, ino))
+ panic("handle_written_bmsafemap: "
+ "re-allocated inode");
+ if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
+ if ((jaddref->ja_mode & IFMT) == IFDIR)
+ cgp->cg_cs.cs_ndir++;
+ cgp->cg_cs.cs_nifree--;
+ setbit(inosused, ino);
+ chgs = 1;
+ }
+ jaddref->ja_state &= ~UNDONE;
+ jaddref->ja_state |= ATTACHED;
+ free_jaddref(jaddref);
+ }
+ }
+ /*
+ * Restore any block allocations which are pending journal writes.
+ */
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ blksfree = cg_blksfree(cgp);
+ LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
+ jntmp) {
+ if ((jnewblk->jn_state & UNDONE) == 0)
+ continue;
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (bp->b_xflags & BX_BKGRDMARKER)
+ break;
+ if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
+ isclr(blksfree, bno + i))
+ panic("handle_written_bmsafemap: "
+ "re-allocated fragment");
+ clrbit(blksfree, bno + i);
+ chgs = 1;
+ }
+ jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
+ jnewblk->jn_state |= ATTACHED;
+ free_jnewblk(jnewblk);
+ }
+ }
+ while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
+ newblk->nb_state |= DEPCOMPLETE;
+ newblk->nb_state &= ~ONDEPLIST;
+ newblk->nb_bmsafemap = NULL;
+ LIST_REMOVE(newblk, nb_deps);
+ if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
+ handle_allocdirect_partdone(
+ WK_ALLOCDIRECT(&newblk->nb_list), NULL);
+ else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
+ handle_allocindir_partdone(
+ WK_ALLOCINDIR(&newblk->nb_list));
+ else if (newblk->nb_list.wk_type != D_NEWBLK)
+ panic("handle_written_bmsafemap: Unexpected type: %s",
+ TYPENAME(newblk->nb_list.wk_type));
+ }
+ while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
+ inodedep->id_state |= DEPCOMPLETE;
+ inodedep->id_state &= ~ONDEPLIST;
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_bmsafemap = NULL;
+ }
+ if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
+ LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
+ LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
+ LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
+ if (chgs)
+ bdirty(bp);
+ LIST_REMOVE(bmsafemap, sm_hash);
+ WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ return (0);
+ }
+ bdirty(bp);
+ return (1);
+}
+
+/*
+ * Try to free a mkdir dependency.
*/
static void
-handle_written_mkdir(mkdir, type)
+complete_mkdir(mkdir)
struct mkdir *mkdir;
- int type;
{
struct diradd *dap;
- struct pagedep *pagedep;
- if (mkdir->md_state != type)
- panic("handle_written_mkdir: bad type");
+ if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
+ return;
+ LIST_REMOVE(mkdir, md_mkdirs);
dap = mkdir->md_diradd;
- dap->da_state &= ~type;
- if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
+ dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
dap->da_state |= DEPCOMPLETE;
- if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
- if (dap->da_state & DIRCHG)
- pagedep = dap->da_previous->dm_pagedep;
- else
- pagedep = dap->da_pagedep;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
+ complete_diradd(dap);
}
- LIST_REMOVE(mkdir, md_mkdirs);
WORKITEM_FREE(mkdir, D_MKDIR);
}
/*
+ * Handle the completion of a mkdir dependency.
+ */
+static void
+handle_written_mkdir(mkdir, type)
+ struct mkdir *mkdir;
+ int type;
+{
+
+ if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
+ panic("handle_written_mkdir: bad type");
+ mkdir->md_state |= COMPLETE;
+ complete_mkdir(mkdir);
+}
+
+static void
+free_pagedep(pagedep)
+ struct pagedep *pagedep;
+{
+ int i;
+
+ if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
+ return;
+ for (i = 0; i < DAHASHSZ; i++)
+ if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_dirremhd))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_pendinghd))
+ return;
+ LIST_REMOVE(pagedep, pd_hash);
+ WORKITEM_FREE(pagedep, D_PAGEDEP);
+}
+
+/*
* Called from within softdep_disk_write_complete above.
* A write operation was just completed. Removed inodes can
* now be freed and associated block pointers may be committed.
@@ -4790,8 +9642,11 @@ handle_written_filepage(pagedep, bp)
*/
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
LIST_REMOVE(dirrem, dm_next);
+ dirrem->dm_state |= COMPLETE;
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
+ ("handle_written_filepage: Journal entries not written."));
+ add_to_worklist(&dirrem->dm_list, 0);
}
/*
* Free any directory additions that have been committed.
@@ -4800,7 +9655,7 @@ handle_written_filepage(pagedep, bp)
*/
if ((pagedep->pd_state & NEWBLOCK) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
- free_diradd(dap);
+ free_diradd(dap, NULL);
/*
* Uncommitted directory entries must be restored.
*/
@@ -4845,7 +9700,8 @@ handle_written_filepage(pagedep, bp)
* Otherwise it will remain to track any new entries on
* the page in case they are fsync'ed.
*/
- if ((pagedep->pd_state & NEWBLOCK) == 0) {
+ if ((pagedep->pd_state & NEWBLOCK) == 0 &&
+ LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
@@ -4880,8 +9736,8 @@ softdep_load_inodeblock(ip)
*/
ip->i_effnlink = ip->i_nlink;
ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(UFSTOVFS(ip->i_ump),
- ip->i_number, 0, &inodedep) == 0) {
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0) {
FREE_LOCK(&lk);
return;
}
@@ -4908,11 +9764,26 @@ softdep_update_inodeblock(ip, bp, waitfor)
int waitfor; /* nonzero => update must be allowed */
{
struct inodedep *inodedep;
+ struct inoref *inoref;
struct worklist *wk;
struct mount *mp;
struct buf *ibp;
+ struct fs *fs;
int error;
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_fs;
+ /*
+ * Preserve the freelink that is on disk. clear_unlinked_inodedep()
+ * does not have access to the in-core ip so must write directly into
+ * the inode block buffer when setting freelink.
+ */
+ if (fs->fs_magic == FS_UFS1_MAGIC)
+ DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number))->di_freelink);
+ else
+ DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number))->di_freelink);
/*
* If the effective link count is not equal to the actual link
* count, then we must track the difference in an inodedep while
@@ -4920,8 +9791,8 @@ softdep_update_inodeblock(ip, bp, waitfor)
* if there is no existing inodedep, then there are no dependencies
* to track.
*/
- mp = UFSTOVFS(ip->i_ump);
ACQUIRE_LOCK(&lk);
+again:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
if (ip->i_effnlink != ip->i_nlink)
@@ -4931,6 +9802,20 @@ softdep_update_inodeblock(ip, bp, waitfor)
if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
/*
+ * If we're flushing all dependencies we must also move any waiting
+ * for journal writes onto the bufwait list prior to I/O.
+ */
+ if (waitfor) {
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto again;
+ }
+ }
+ }
+ /*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
*/
@@ -4945,10 +9830,12 @@ softdep_update_inodeblock(ip, bp, waitfor)
*/
merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
- handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
+ handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
+ NULL);
merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
if (!TAILQ_EMPTY(&inodedep->id_extupdt))
- handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
+ handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
+ NULL);
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
@@ -4971,11 +9858,11 @@ softdep_update_inodeblock(ip, bp, waitfor)
return;
}
retry:
- if ((inodedep->id_state & DEPCOMPLETE) != 0) {
+ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
FREE_LOCK(&lk);
return;
}
- ibp = inodedep->id_buf;
+ ibp = inodedep->id_bmsafemap->sm_buf;
ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
if (ibp == NULL) {
/*
@@ -5007,13 +9894,13 @@ merge_inode_lists(newlisthead, oldlisthead)
newadp = TAILQ_FIRST(newlisthead);
for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
- if (listadp->ad_lbn < newadp->ad_lbn) {
+ if (listadp->ad_offset < newadp->ad_offset) {
listadp = TAILQ_NEXT(listadp, ad_next);
continue;
}
TAILQ_REMOVE(newlisthead, newadp, ad_next);
TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
- if (listadp->ad_lbn == newadp->ad_lbn) {
+ if (listadp->ad_offset == newadp->ad_offset) {
allocdirect_merge(oldlisthead, newadp,
listadp);
listadp = newadp;
@@ -5036,6 +9923,7 @@ softdep_fsync(vp)
{
struct inodedep *inodedep;
struct pagedep *pagedep;
+ struct inoref *inoref;
struct worklist *wk;
struct diradd *dap;
struct mount *mp;
@@ -5052,17 +9940,25 @@ softdep_fsync(vp)
fs = ip->i_fs;
mp = vp->v_mount;
ACQUIRE_LOCK(&lk);
+restart:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
return (0);
}
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (!LIST_EMPTY(&inodedep->id_inowait) ||
- !LIST_EMPTY(&inodedep->id_bufwait) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
!TAILQ_EMPTY(&inodedep->id_newinoupdt))
- panic("softdep_fsync: pending ops");
+ panic("softdep_fsync: pending ops %p", inodedep);
for (error = 0, flushparent = 0; ; ) {
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
break;
@@ -5254,8 +10150,8 @@ int
softdep_sync_metadata(struct vnode *vp)
{
struct pagedep *pagedep;
- struct allocdirect *adp;
struct allocindir *aip;
+ struct newblk *newblk;
struct buf *bp, *nbp;
struct worklist *wk;
struct bufobj *bo;
@@ -5319,27 +10215,16 @@ loop:
switch (wk->wk_type) {
case D_ALLOCDIRECT:
- adp = WK_ALLOCDIRECT(wk);
- if (adp->ad_state & DEPCOMPLETE)
- continue;
- nbp = adp->ad_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
case D_ALLOCINDIR:
- aip = WK_ALLOCINDIR(wk);
- if (aip->ai_state & DEPCOMPLETE)
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ goto restart;
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- nbp = aip->ai_buf;
+ nbp = newblk->nb_bmsafemap->sm_buf;
nbp = getdirtybuf(nbp, &lk, waitfor);
if (nbp == NULL)
continue;
@@ -5355,10 +10240,17 @@ loop:
case D_INDIRDEP:
restart:
- LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
- if (aip->ai_state & DEPCOMPLETE)
+ LIST_FOREACH(aip,
+ &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
+ newblk = (struct newblk *)aip;
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ goto restart;
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- nbp = aip->ai_buf;
+ nbp = newblk->nb_bmsafemap->sm_buf;
nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
if (nbp == NULL)
goto restart;
@@ -5371,14 +10263,6 @@ loop:
}
continue;
- case D_INODEDEP:
- if ((error = flush_inodedep_deps(wk->wk_mp,
- WK_INODEDEP(wk)->id_ino)) != 0) {
- FREE_LOCK(&lk);
- break;
- }
- continue;
-
case D_PAGEDEP:
/*
* We are trying to sync a directory that may
@@ -5400,48 +10284,6 @@ loop:
}
continue;
- case D_MKDIR:
- /*
- * This case should never happen if the vnode has
- * been properly sync'ed. However, if this function
- * is used at a place where the vnode has not yet
- * been sync'ed, this dependency can show up. So,
- * rather than panic, just flush it.
- */
- nbp = WK_MKDIR(wk)->md_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
- case D_BMSAFEMAP:
- /*
- * This case should never happen if the vnode has
- * been properly sync'ed. However, if this function
- * is used at a place where the vnode has not yet
- * been sync'ed, this dependency can show up. So,
- * rather than panic, just flush it.
- */
- nbp = WK_BMSAFEMAP(wk)->sm_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
default:
panic("softdep_sync_metadata: Unknown type %s",
TYPENAME(wk->wk_type));
@@ -5489,7 +10331,8 @@ loop:
BO_LOCK(bo);
drain_output(vp);
BO_UNLOCK(bo);
- return (0);
+ return ffs_update(vp, 1);
+ /* return (0); */
}
/*
@@ -5502,6 +10345,7 @@ flush_inodedep_deps(mp, ino)
ino_t ino;
{
struct inodedep *inodedep;
+ struct inoref *inoref;
int error, waitfor;
/*
@@ -5522,8 +10366,17 @@ flush_inodedep_deps(mp, ino)
return (error);
FREE_LOCK(&lk);
ACQUIRE_LOCK(&lk);
+restart:
if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
return (0);
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
@@ -5555,13 +10408,20 @@ flush_deplist(listhead, waitfor, errorp)
int *errorp;
{
struct allocdirect *adp;
+ struct newblk *newblk;
struct buf *bp;
mtx_assert(&lk, MA_OWNED);
TAILQ_FOREACH(adp, listhead, ad_next) {
- if (adp->ad_state & DEPCOMPLETE)
+ newblk = (struct newblk *)adp;
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ return (1);
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- bp = adp->ad_buf;
+ bp = newblk->nb_bmsafemap->sm_buf;
bp = getdirtybuf(bp, &lk, waitfor);
if (bp == NULL) {
if (waitfor == MNT_NOWAIT)
@@ -5582,6 +10442,101 @@ flush_deplist(listhead, waitfor, errorp)
}
/*
+ * Flush dependencies associated with an allocdirect block.
+ */
+static int
+flush_newblk_dep(vp, mp, lbn)
+ struct vnode *vp;
+ struct mount *mp;
+ ufs_lbn_t lbn;
+{
+ struct newblk *newblk;
+ struct bufobj *bo;
+ struct inode *ip;
+ struct buf *bp;
+ ufs2_daddr_t blkno;
+ int error;
+
+ error = 0;
+ bo = &vp->v_bufobj;
+ ip = VTOI(vp);
+ blkno = DIP(ip, i_db[lbn]);
+ if (blkno == 0)
+ panic("flush_newblk_dep: Missing block");
+ ACQUIRE_LOCK(&lk);
+ /*
+ * Loop until all dependencies related to this block are satisfied.
+ * We must be careful to restart after each sleep in case a write
+ * completes some part of this process for us.
+ */
+ for (;;) {
+ if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
+ FREE_LOCK(&lk);
+ break;
+ }
+ if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
+ panic("flush_newblk_deps: Bad newblk %p", newblk);
+ /*
+ * Flush the journal.
+ */
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ continue;
+ }
+ /*
+ * Write the bitmap dependency.
+ */
+ if ((newblk->nb_state & DEPCOMPLETE) == 0) {
+ bp = newblk->nb_bmsafemap->sm_buf;
+ bp = getdirtybuf(bp, &lk, MNT_WAIT);
+ if (bp == NULL)
+ continue;
+ FREE_LOCK(&lk);
+ error = bwrite(bp);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
+ continue;
+ }
+ /*
+ * Write the buffer.
+ */
+ FREE_LOCK(&lk);
+ BO_LOCK(bo);
+ bp = gbincore(bo, lbn);
+ if (bp != NULL) {
+ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
+ LK_INTERLOCK, BO_MTX(bo));
+ if (error == ENOLCK) {
+ ACQUIRE_LOCK(&lk);
+ continue; /* Slept, retry */
+ }
+ if (error != 0)
+ break; /* Failed */
+ if (bp->b_flags & B_DELWRI) {
+ bremfree(bp);
+ error = bwrite(bp);
+ if (error)
+ break;
+ } else
+ BUF_UNLOCK(bp);
+ } else
+ BO_UNLOCK(bo);
+ /*
+ * We have to wait for the direct pointers to
+ * point at the newdirblk before the dependency
+ * will go away.
+ */
+ error = ffs_update(vp, MNT_WAIT);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
+ }
+ return (error);
+}
+
+/*
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
* Called with splbio blocked.
*/
@@ -5592,16 +10547,16 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
struct diraddhd *diraddhdp;
{
struct inodedep *inodedep;
+ struct inoref *inoref;
struct ufsmount *ump;
struct diradd *dap;
struct vnode *vp;
- struct bufobj *bo;
int error = 0;
struct buf *bp;
ino_t inum;
- struct worklist *wk;
ump = VFSTOUFS(mp);
+restart:
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
/*
* Flush ourselves if this directory entry
@@ -5609,7 +10564,7 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
*/
if (dap->da_state & MKDIR_PARENT) {
FREE_LOCK(&lk);
- if ((error = ffs_update(pvp, 1)) != 0)
+ if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
break;
ACQUIRE_LOCK(&lk);
/*
@@ -5623,84 +10578,52 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
/*
* A newly allocated directory must have its "." and
* ".." entries written out before its name can be
- * committed in its parent. We do not want or need
- * the full semantics of a synchronous ffs_syncvnode as
- * that may end up here again, once for each directory
- * level in the filesystem. Instead, we push the blocks
- * and wait for them to clear. We have to fsync twice
- * because the first call may choose to defer blocks
- * that still have dependencies, but deferral will
- * happen at most once.
+ * committed in its parent.
*/
inum = dap->da_newinum;
+ if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
+ panic("flush_pagedep_deps: lost inode1");
+ /*
+ * Wait for any pending journal adds to complete so we don't
+ * cause rollbacks while syncing.
+ */
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(&lk);
if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
FFSV_FORCEINSMQ)))
break;
- if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
- (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
- vput(vp);
- break;
- }
- bo = &vp->v_bufobj;
- BO_LOCK(bo);
- drain_output(vp);
+ error = flush_newblk_dep(vp, mp, 0);
/*
- * If first block is still dirty with a D_MKDIR
- * dependency then it needs to be written now.
+ * If we still have the dependency we might need to
+ * update the vnode to sync the new link count to
+ * disk.
*/
- for (;;) {
- error = 0;
- bp = gbincore(bo, 0);
- if (bp == NULL)
- break; /* First block not present */
- error = BUF_LOCK(bp,
- LK_EXCLUSIVE |
- LK_SLEEPFAIL |
- LK_INTERLOCK,
- BO_MTX(bo));
- BO_LOCK(bo);
- if (error == ENOLCK)
- continue; /* Slept, retry */
- if (error != 0)
- break; /* Failed */
- if ((bp->b_flags & B_DELWRI) == 0) {
- BUF_UNLOCK(bp);
- break; /* Buffer not dirty */
- }
- for (wk = LIST_FIRST(&bp->b_dep);
- wk != NULL;
- wk = LIST_NEXT(wk, wk_list))
- if (wk->wk_type == D_MKDIR)
- break;
- if (wk == NULL)
- BUF_UNLOCK(bp); /* Dependency gone */
- else {
- /*
- * D_MKDIR dependency remains,
- * must write buffer to stable
- * storage.
- */
- BO_UNLOCK(bo);
- bremfree(bp);
- error = bwrite(bp);
- BO_LOCK(bo);
- }
- break;
- }
- BO_UNLOCK(bo);
+ if (error == 0 && dap == LIST_FIRST(diraddhdp))
+ error = ffs_update(vp, MNT_WAIT);
vput(vp);
if (error != 0)
- break; /* Flushing of first block failed */
+ break;
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
*/
if (dap != LIST_FIRST(diraddhdp))
continue;
- if (dap->da_state & MKDIR_BODY)
- panic("flush_pagedep_deps: MKDIR_BODY");
+ if (dap->da_state & MKDIR_BODY) {
+ inodedep_lookup(UFSTOVFS(ump), inum, 0,
+ &inodedep);
+ panic("flush_pagedep_deps: MKDIR_BODY "
+ "inodedep %p dap %p vp %p",
+ inodedep, dap, vp);
+ }
}
/*
* Flush the inode on which the directory entry depends.
@@ -5719,8 +10642,8 @@ retry:
* If the inode still has bitmap dependencies,
* push them to disk.
*/
- if ((inodedep->id_state & DEPCOMPLETE) == 0) {
- bp = inodedep->id_buf;
+ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
+ bp = inodedep->id_bmsafemap->sm_buf;
bp = getdirtybuf(bp, &lk, MNT_WAIT);
if (bp == NULL)
goto retry;
@@ -5733,24 +10656,29 @@ retry:
}
/*
* If the inode is still sitting in a buffer waiting
- * to be written, push it to disk.
+ * to be written or waiting for the link count to be
+ * adjusted, update it here to flush it to disk.
*/
- FREE_LOCK(&lk);
- if ((error = bread(ump->um_devvp,
- fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
- (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
- brelse(bp);
- break;
+ if (dap == LIST_FIRST(diraddhdp)) {
+ FREE_LOCK(&lk);
+ if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
+ FFSV_FORCEINSMQ)))
+ break;
+ error = ffs_update(vp, MNT_WAIT);
+ vput(vp);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
}
- if ((error = bwrite(bp)) != 0)
- break;
- ACQUIRE_LOCK(&lk);
/*
* If we have failed to get rid of all the dependencies
* then something is seriously wrong.
*/
- if (dap == LIST_FIRST(diraddhdp))
- panic("flush_pagedep_deps: flush failed");
+ if (dap == LIST_FIRST(diraddhdp)) {
+ inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
+ panic("flush_pagedep_deps: failed to flush "
+ "inodedep %p ino %d dap %p", inodedep, inum, dap);
+ }
}
if (error)
ACQUIRE_LOCK(&lk);
@@ -5828,6 +10756,7 @@ softdep_request_cleanup(fs, vp)
return (0);
UFS_UNLOCK(ump);
ACQUIRE_LOCK(&lk);
+ process_removes(vp);
if (ump->softdep_on_worklist > 0 &&
process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
stat_worklist_push += 1;
@@ -6100,10 +11029,15 @@ softdep_count_dependencies(bp, wantcount)
int wantcount;
{
struct worklist *wk;
+ struct bmsafemap *bmsafemap;
struct inodedep *inodedep;
struct indirdep *indirdep;
+ struct freeblks *freeblks;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct dirrem *dirrem;
+ struct newblk *newblk;
+ struct mkdir *mkdir;
struct diradd *dap;
int i, retval;
@@ -6132,6 +11066,12 @@ softdep_count_dependencies(bp, wantcount)
if (!wantcount)
goto out;
}
+ if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
+ /* Add reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
continue;
case D_INDIRDEP:
@@ -6147,6 +11087,14 @@ softdep_count_dependencies(bp, wantcount)
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
+ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
+ if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
+ /* Journal remove ref dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ }
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
@@ -6159,14 +11107,62 @@ softdep_count_dependencies(bp, wantcount)
continue;
case D_BMSAFEMAP:
+ bmsafemap = WK_BMSAFEMAP(wk);
+ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
+ /* Add reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
+ /* Allocate block dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_FREEBLKS:
+ freeblks = WK_FREEBLKS(wk);
+ if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
+ /* Freeblk journal dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
case D_ALLOCDIRECT:
case D_ALLOCINDIR:
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk) {
+ /* Journal allocate dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
case D_MKDIR:
+ mkdir = WK_MKDIR(wk);
+ if (mkdir->md_jaddref) {
+ /* Journal reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_FREEWORK:
+ case D_FREEDEP:
+ case D_JSEGDEP:
+ case D_JSEG:
+ case D_SBDEP:
/* never a dependency on these blocks */
continue;
default:
- panic("softdep_check_for_rollback: Unexpected type %s",
+ panic("softdep_count_dependencies: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
@@ -6382,6 +11378,45 @@ softdep_error(func, error)
#ifdef DDB
+static void
+inodedep_print(struct inodedep *inodedep, int verbose)
+{
+ db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
+ " saveino %p\n",
+ inodedep, inodedep->id_fs, inodedep->id_state,
+ (intmax_t)inodedep->id_ino,
+ (intmax_t)fsbtodb(inodedep->id_fs,
+ ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
+ inodedep->id_nlinkdelta, inodedep->id_savednlink,
+ inodedep->id_savedino1);
+
+ if (verbose == 0)
+ return;
+
+ db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
+ "mkdiradd %p\n",
+ LIST_FIRST(&inodedep->id_pendinghd),
+ LIST_FIRST(&inodedep->id_bufwait),
+ LIST_FIRST(&inodedep->id_inowait),
+ TAILQ_FIRST(&inodedep->id_inoreflst),
+ inodedep->id_mkdiradd);
+ db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
+ TAILQ_FIRST(&inodedep->id_inoupdt),
+ TAILQ_FIRST(&inodedep->id_newinoupdt),
+ TAILQ_FIRST(&inodedep->id_extupdt),
+ TAILQ_FIRST(&inodedep->id_newextupdt));
+}
+
+DB_SHOW_COMMAND(inodedep, db_show_inodedep)
+{
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+ inodedep_print((struct inodedep*)addr, 1);
+}
+
DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
struct inodedep_hashhead *inodedephd;
@@ -6395,15 +11430,62 @@ DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
LIST_FOREACH(inodedep, inodedephd, id_hash) {
if (fs != NULL && fs != inodedep->id_fs)
continue;
- db_printf("%p fs %p st %x ino %jd inoblk %jd\n",
- inodedep, inodedep->id_fs, inodedep->id_state,
- (intmax_t)inodedep->id_ino,
- (intmax_t)fsbtodb(inodedep->id_fs,
- ino_to_fsba(inodedep->id_fs, inodedep->id_ino)));
+ inodedep_print(inodedep, 0);
}
}
}
+DB_SHOW_COMMAND(worklist, db_show_worklist)
+{
+ struct worklist *wk;
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+ wk = (struct worklist *)addr;
+ printf("worklist: %p type %s state 0x%X\n",
+ wk, TYPENAME(wk->wk_type), wk->wk_state);
+}
+
+DB_SHOW_COMMAND(workhead, db_show_workhead)
+{
+ struct workhead *wkhd;
+ struct worklist *wk;
+ int i;
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+ wkhd = (struct workhead *)addr;
+ wk = LIST_FIRST(wkhd);
+ for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
+ db_printf("worklist: %p type %s state 0x%X",
+ wk, TYPENAME(wk->wk_type), wk->wk_state);
+ if (i == 100)
+ db_printf("workhead overflow");
+ printf("\n");
+}
+
+
+DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
+{
+ struct jaddref *jaddref;
+ struct diradd *diradd;
+ struct mkdir *mkdir;
+
+ LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
+ diradd = mkdir->md_diradd;
+ db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
+ mkdir, mkdir->md_state, diradd, diradd->da_state);
+ if ((jaddref = mkdir->md_jaddref) != NULL)
+ db_printf(" jaddref %p jaddref state 0x%X",
+ jaddref, jaddref->ja_state);
+ db_printf("\n");
+ }
+}
+
#endif /* DDB */
#endif /* SOFTUPDATES */
diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c
index e34bc1372a2e..e2460a36be2d 100644
--- a/sys/ufs/ffs/ffs_subr.c
+++ b/sys/ufs/ffs/ffs_subr.c
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
#ifndef _KERNEL
#include <ufs/ufs/dinode.h>
#include <ufs/ffs/fs.h>
-#include "fsck.h"
#else
#include <sys/systm.h>
#include <sys/lock.h>
@@ -223,7 +222,38 @@ ffs_isblock(fs, cp, h)
mask = 0x01 << (h & 0x7);
return ((cp[h >> 3] & mask) == mask);
default:
+#ifdef _KERNEL
panic("ffs_isblock");
+#endif
+ break;
+ }
+ return (0);
+}
+
+/*
+ * check if a block is free
+ */
+int
+ffs_isfreeblock(fs, cp, h)
+ struct fs *fs;
+ u_char *cp;
+ ufs1_daddr_t h;
+{
+
+ switch ((int)fs->fs_frag) {
+ case 8:
+ return (cp[h] == 0);
+ case 4:
+ return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+ case 2:
+ return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+ case 1:
+ return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+ default:
+#ifdef _KERNEL
+ panic("ffs_isfreeblock");
+#endif
+ break;
}
return (0);
}
@@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h)
cp[h >> 3] &= ~(0x01 << (h & 0x7));
return;
default:
+#ifdef _KERNEL
panic("ffs_clrblock");
+#endif
+ break;
}
}
@@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h)
cp[h >> 3] |= (0x01 << (h & 0x7));
return;
default:
+#ifdef _KERNEL
panic("ffs_setblock");
+#endif
+ break;
+ }
+}
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ */
+void
+ffs_clusteracct(fs, cgp, blkno, cnt)
+ struct fs *fs;
+ struct cg *cgp;
+ ufs1_daddr_t blkno;
+ int cnt;
+{
+ int32_t *sump;
+ int32_t *lp;
+ u_char *freemapp, *mapp;
+ int i, start, end, forw, back, map, bit;
+
+ if (fs->fs_contigsumsize <= 0)
+ return;
+ freemapp = cg_clustersfree(cgp);
+ sump = cg_clustersum(cgp);
+ /*
+ * Allocate or clear the actual block.
+ */
+ if (cnt > 0)
+ setbit(freemapp, blkno);
+ else
+ clrbit(freemapp, blkno);
+ /*
+ * Find the size of the cluster going forward.
+ */
+ start = blkno + 1;
+ end = start + fs->fs_contigsumsize;
+ if (end >= cgp->cg_nclusterblks)
+ end = cgp->cg_nclusterblks;
+ mapp = &freemapp[start / NBBY];
+ map = *mapp++;
+ bit = 1 << (start % NBBY);
+ for (i = start; i < end; i++) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != (NBBY - 1)) {
+ bit <<= 1;
+ } else {
+ map = *mapp++;
+ bit = 1;
+ }
+ }
+ forw = i - start;
+ /*
+ * Find the size of the cluster going backward.
+ */
+ start = blkno - 1;
+ end = start - fs->fs_contigsumsize;
+ if (end < 0)
+ end = -1;
+ mapp = &freemapp[start / NBBY];
+ map = *mapp--;
+ bit = 1 << (start % NBBY);
+ for (i = start; i > end; i--) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != 0) {
+ bit >>= 1;
+ } else {
+ map = *mapp--;
+ bit = 1 << (NBBY - 1);
+ }
}
+ back = start - i;
+ /*
+ * Account for old cluster and the possibly new forward and
+ * back clusters.
+ */
+ i = back + forw + 1;
+ if (i > fs->fs_contigsumsize)
+ i = fs->fs_contigsumsize;
+ sump[i] += cnt;
+ if (back > 0)
+ sump[back] -= cnt;
+ if (forw > 0)
+ sump[forw] -= cnt;
+ /*
+ * Update cluster summary information.
+ */
+ lp = &sump[fs->fs_contigsumsize];
+ for (i = fs->fs_contigsumsize; i > 0; i--)
+ if (*lp-- > 0)
+ break;
+ fs->fs_maxcluster[cgp->cg_cgx] = i;
}
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 656c03666c5e..e40336863248 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -79,7 +79,6 @@ static int ffs_reload(struct mount *, struct thread *);
static int ffs_mountfs(struct vnode *, struct mount *, struct thread *);
static void ffs_oldfscompat_read(struct fs *, struct ufsmount *,
ufs2_daddr_t);
-static void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
static void ffs_ifree(struct ufsmount *ump, struct inode *ip);
static vfs_init_t ffs_init;
static vfs_uninit_t ffs_uninit;
@@ -299,7 +298,8 @@ ffs_mount(struct mount *mp)
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if ((mp->mnt_flag & MNT_FORCE) ||
- ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+ ((fs->fs_flags &
+ (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf("WARNING: %s was not %s\n",
fs->fs_fsmnt, "properly dismounted");
@@ -307,6 +307,9 @@ ffs_mount(struct mount *mp)
printf(
"WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n",
fs->fs_fsmnt);
+ if (fs->fs_flags & FS_SUJ)
+ printf(
+"WARNING: Forced mount will invalidate journal contents\n");
return (EPERM);
}
}
@@ -330,17 +333,18 @@ ffs_mount(struct mount *mp)
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
MNT_IUNLOCK(mp);
- fs->fs_clean = 0;
- if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
- vn_finished_write(mp);
- return (error);
- }
+ fs->fs_mtime = time_second;
/* check to see if we need to start softdep */
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, td->td_ucred))){
vn_finished_write(mp);
return (error);
}
+ fs->fs_clean = 0;
+ if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
+ vn_finished_write(mp);
+ return (error);
+ }
if (fs->fs_snapinum[0] != 0)
ffs_snapshot_mount(mp);
vn_finished_write(mp);
@@ -705,7 +709,7 @@ ffs_mountfs(devvp, mp, td)
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if (ronly || (mp->mnt_flag & MNT_FORCE) ||
- ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+ ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf(
"WARNING: %s was not properly dismounted\n",
@@ -714,6 +718,9 @@ ffs_mountfs(devvp, mp, td)
printf(
"WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n",
fs->fs_fsmnt);
+ if (fs->fs_flags & FS_SUJ)
+ printf(
+"WARNING: Forced mount will invalidate journal contents\n");
error = EPERM;
goto out;
}
@@ -896,6 +903,7 @@ ffs_mountfs(devvp, mp, td)
*/
bzero(fs->fs_fsmnt, MAXMNTLEN);
strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
+ mp->mnt_stat.f_iosize = fs->fs_bsize;
if( mp->mnt_flag & MNT_ROOTFS) {
/*
@@ -907,6 +915,7 @@ ffs_mountfs(devvp, mp, td)
}
if (ronly == 0) {
+ fs->fs_mtime = time_second;
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, cred)) != 0) {
free(fs->fs_csp, M_UFSMNT);
@@ -937,7 +946,6 @@ ffs_mountfs(devvp, mp, td)
* This would all happen while the filesystem was busy/not
* available, so would effectively be "atomic".
*/
- mp->mnt_stat.f_iosize = fs->fs_bsize;
(void) ufs_extattr_autostart(mp, td);
#endif /* !UFS_EXTATTR_AUTOSTART */
#endif /* !UFS_EXTATTR */
@@ -1037,7 +1045,7 @@ ffs_oldfscompat_read(fs, ump, sblockloc)
* XXX - Parts get retired eventually.
* Unfortunately new bits get added.
*/
-static void
+void
ffs_oldfscompat_write(fs, ump)
struct fs *fs;
struct ufsmount *ump;
@@ -1132,6 +1140,7 @@ ffs_unmount(mp, mntflags)
fs->fs_pendinginodes = 0;
}
UFS_UNLOCK(ump);
+ softdep_unmount(mp);
if (fs->fs_ronly == 0) {
fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
error = ffs_sbupdate(ump, MNT_WAIT, 0);
@@ -1573,16 +1582,6 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
DIP_SET(ip, i_gen, ip->i_gen);
}
}
- /*
- * Ensure that uid and gid are correct. This is a temporary
- * fix until fsck has been changed to do the update.
- */
- if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
- fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
- ip->i_uid = ip->i_din1->di_ouid; /* XXX */
- ip->i_gid = ip->i_din1->di_ogid; /* XXX */
- } /* XXX */
-
#ifdef MAC
if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
/*
@@ -1726,6 +1725,8 @@ ffs_sbupdate(mp, waitfor, suspended)
}
fs->fs_fmod = 0;
fs->fs_time = time_second;
+ if (fs->fs_flags & FS_DOSOFTDEP)
+ softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp);
bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
if (suspended)
@@ -1867,9 +1868,6 @@ ffs_bufwrite(struct buf *bp)
}
BO_UNLOCK(bp->b_bufobj);
- /* Mark the buffer clean */
- bundirty(bp);
-
/*
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
@@ -1910,9 +1908,16 @@ ffs_bufwrite(struct buf *bp)
newbp->b_flags &= ~B_INVAL;
#ifdef SOFTUPDATES
- /* move over the dependencies */
- if (!LIST_EMPTY(&bp->b_dep))
- softdep_move_dependencies(bp, newbp);
+ /*
+ * Move over the dependencies. If there are rollbacks,
+ * leave the parent buffer dirtied as it will need to
+ * be written again.
+ */
+ if (LIST_EMPTY(&bp->b_dep) ||
+ softdep_move_dependencies(bp, newbp) == 0)
+ bundirty(bp);
+#else
+ bundirty(bp);
#endif
/*
@@ -1925,7 +1930,10 @@ ffs_bufwrite(struct buf *bp)
*/
bqrelse(bp);
bp = newbp;
- }
+ } else
+ /* Mark the buffer clean */
+ bundirty(bp);
+
/* Let the normal bufwrite do the rest for us */
normal_write:
@@ -1939,6 +1947,7 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
struct vnode *vp;
int error;
struct buf *tbp;
+ int nocopy;
vp = bo->__bo_vnode;
if (bp->b_iocmd == BIO_WRITE) {
@@ -1946,8 +1955,9 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
(bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("ffs_geom_strategy: bad I/O");
- bp->b_flags &= ~B_VALIDSUSPWRT;
- if ((vp->v_vflag & VV_COPYONWRITE) &&
+ nocopy = bp->b_flags & B_NOCOPY;
+ bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
+ if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
vp->v_rdev->si_snapdata != NULL) {
if ((bp->b_flags & B_CLUSTER) != 0) {
runningbufwakeup(bp);
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 464a7613e162..e6617cbcdfa8 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -225,6 +225,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor)
wait = (waitfor == MNT_WAIT);
lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
bo = &vp->v_bufobj;
+ ip->i_flag &= ~IN_NEEDSYNC;
/*
* Flush all dirty buffers associated with a vnode.
diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h
index 5452e2be6de2..e863b961c620 100644
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@@ -340,7 +340,9 @@ struct fs {
u_int32_t fs_avgfilesize; /* expected average file size */
u_int32_t fs_avgfpdir; /* expected # of files per directory */
int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */
- int32_t fs_sparecon32[26]; /* reserved for future constants */
+ ufs_time_t fs_mtime; /* Last mount or fsck time. */
+ int32_t fs_sujfree; /* SUJ free list */
+ int32_t fs_sparecon32[23]; /* reserved for future constants */
int32_t fs_flags; /* see FS_ flags below */
int32_t fs_contigsumsize; /* size of cluster summary array */
int32_t fs_maxsymlinklen; /* max length of an internal symlink */
@@ -408,12 +410,13 @@ CTASSERT(sizeof(struct fs) == 1376);
#define FS_UNCLEAN 0x0001 /* filesystem not clean at mount */
#define FS_DOSOFTDEP 0x0002 /* filesystem using soft dependencies */
#define FS_NEEDSFSCK 0x0004 /* filesystem needs sync fsck before mount */
-#define FS_INDEXDIRS 0x0008 /* kernel supports indexed directories */
+#define FS_SUJ 0x0008 /* Filesystem using softupdate journal */
#define FS_ACLS 0x0010 /* file system has POSIX.1e ACLs enabled */
#define FS_MULTILABEL 0x0020 /* file system is MAC multi-label */
#define FS_GJOURNAL 0x0040 /* gjournaled file system */
#define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */
#define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */
+#define FS_INDEXDIRS 0x0200 /* kernel supports indexed directories */
/*
* Macros to access bits in the fs_active array.
@@ -603,7 +606,31 @@ struct cg {
? (fs)->fs_bsize \
: (fragroundup(fs, blkoff(fs, (size)))))
-
+/*
+ * Indirect lbns are aligned on NDADDR addresses where single indirects
+ * are the negated address of the lowest lbn reachable, double indirects
+ * are this lbn - 1 and triple indirects are this lbn - 2. This yields
+ * an unusual bit order to determine level.
+ */
+static inline int
+lbn_level(ufs_lbn_t lbn)
+{
+ if (lbn >= 0)
+ return 0;
+ switch (lbn & 0x3) {
+ case 0:
+ return (0);
+ case 1:
+ break;
+ case 2:
+ return (2);
+ case 3:
+ return (1);
+ default:
+ break;
+ }
+ return (-1);
+}
/*
* Number of inodes in a secondary storage block/fragment.
*/
@@ -615,6 +642,108 @@ struct cg {
*/
#define NINDIR(fs) ((fs)->fs_nindir)
+/*
+ * Softdep journal record format.
+ */
+
+#define JOP_ADDREF 1 /* Add a reference to an inode. */
+#define JOP_REMREF 2 /* Remove a reference from an inode. */
+#define JOP_NEWBLK 3 /* Allocate a block. */
+#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */
+#define JOP_MVREF 5 /* Move a reference from one off to another. */
+#define JOP_TRUNC 6 /* Partial truncation record. */
+
+#define JREC_SIZE 32 /* Record and segment header size. */
+
+#define SUJ_MIN (4 * 1024 * 1024) /* Minimum journal size */
+#define SUJ_MAX (32 * 1024 * 1024) /* Maximum journal size */
+#define SUJ_FILE ".sujournal" /* Journal file name */
+
+/*
+ * Size of the segment record header. There is at most one for each disk
+ * block n the journal. The segment header is followed by an array of
+ * records. fsck depends on the first element in each record being 'op'
+ * and the second being 'ino'. Segments may span multiple disk blocks but
+ * the header is present on each.
+ */
+struct jsegrec {
+ uint64_t jsr_seq; /* Our sequence number */
+ uint64_t jsr_oldest; /* Oldest valid sequence number */
+ uint16_t jsr_cnt; /* Count of valid records */
+ uint16_t jsr_blocks; /* Count of DEV_BSIZE blocks. */
+ uint32_t jsr_crc; /* 32bit crc of the valid space */
+ ufs_time_t jsr_time; /* timestamp for mount instance */
+};
+
+/*
+ * Reference record. Records a single link count modification.
+ */
+struct jrefrec {
+ uint32_t jr_op;
+ ino_t jr_ino;
+ ino_t jr_parent;
+ uint16_t jr_nlink;
+ uint16_t jr_mode;
+ off_t jr_diroff;
+ uint64_t jr_unused;
+};
+
+/*
+ * Move record. Records a reference moving within a directory block. The
+ * nlink is unchanged but we must search both locations.
+ */
+struct jmvrec {
+ uint32_t jm_op;
+ ino_t jm_ino;
+ ino_t jm_parent;
+ uint16_t jm_unused;
+ off_t jm_oldoff;
+ off_t jm_newoff;
+};
+
+/*
+ * Block record. A set of frags or tree of blocks starting at an indirect are
+ * freed or a set of frags are allocated.
+ */
+struct jblkrec {
+ uint32_t jb_op;
+ uint32_t jb_ino;
+ ufs2_daddr_t jb_blkno;
+ ufs_lbn_t jb_lbn;
+ uint16_t jb_frags;
+ uint16_t jb_oldfrags;
+ uint32_t jb_unused;
+};
+
+/*
+ * Truncation record. Records a partial truncation so that it may be
+ * completed later.
+ */
+struct jtrncrec {
+ uint32_t jt_op;
+ uint32_t jt_ino;
+ off_t jt_size;
+ uint32_t jt_extsize;
+ uint32_t jt_pad[3];
+};
+
+union jrec {
+ struct jsegrec rec_jsegrec;
+ struct jrefrec rec_jrefrec;
+ struct jmvrec rec_jmvrec;
+ struct jblkrec rec_jblkrec;
+ struct jtrncrec rec_jtrncrec;
+};
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct jsegrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jrefrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jmvrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jblkrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE);
+CTASSERT(sizeof(union jrec) == JREC_SIZE);
+#endif
+
extern int inside[], around[];
extern u_char *fragtbl[];
diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h
index b00183bcfd2c..5d8a8691b170 100644
--- a/sys/ufs/ffs/softdep.h
+++ b/sys/ufs/ffs/softdep.h
@@ -94,22 +94,29 @@
* The ONWORKLIST flag shows whether the structure is currently linked
* onto a worklist.
*/
-#define ATTACHED 0x0001
-#define UNDONE 0x0002
-#define COMPLETE 0x0004
-#define DEPCOMPLETE 0x0008
-#define MKDIR_PARENT 0x0010 /* diradd & mkdir only */
-#define MKDIR_BODY 0x0020 /* diradd & mkdir only */
-#define RMDIR 0x0040 /* dirrem only */
-#define DIRCHG 0x0080 /* diradd & dirrem only */
-#define GOINGAWAY 0x0100 /* indirdep only */
-#define IOSTARTED 0x0200 /* inodedep & pagedep only */
-#define SPACECOUNTED 0x0400 /* inodedep only */
-#define NEWBLOCK 0x0800 /* pagedep only */
-#define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */
-#define UFS1FMT 0x2000 /* indirdep only */
-#define EXTDATA 0x4000 /* allocdirect only */
-#define ONWORKLIST 0x8000
+#define ATTACHED 0x000001
+#define UNDONE 0x000002
+#define COMPLETE 0x000004
+#define DEPCOMPLETE 0x000008
+#define MKDIR_PARENT 0x000010 /* diradd, mkdir, jaddref, jsegdep only */
+#define MKDIR_BODY 0x000020 /* diradd, mkdir, jaddref only */
+#define RMDIR 0x000040 /* dirrem only */
+#define DIRCHG 0x000080 /* diradd, dirrem only */
+#define GOINGAWAY 0x000100 /* indirdep, jremref only */
+#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */
+#define SPACECOUNTED 0x000400 /* inodedep only */
+#define NEWBLOCK 0x000800 /* pagedep, jaddref only */
+#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */
+#define UFS1FMT 0x002000 /* indirdep only */
+#define EXTDATA 0x004000 /* allocdirect only */
+#define ONWORKLIST 0x008000
+#define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */
+#define ONDEPLIST 0x020000 /* Structure is on a dependency list. */
+#define UNLINKED 0x040000 /* inodedep has been unlinked. */
+#define UNLINKNEXT 0x080000 /* inodedep has valid di_freelink */
+#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */
+#define UNLINKONLIST 0x200000 /* inodedep is in the unlinked list on disk */
+#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV)
#define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE)
@@ -135,25 +142,38 @@
* and the macros below changed to use it.
*/
struct worklist {
- struct mount *wk_mp; /* Mount we live in */
LIST_ENTRY(worklist) wk_list; /* list of work requests */
- unsigned short wk_type; /* type of request */
- unsigned short wk_state; /* state flags */
+ struct mount *wk_mp; /* Mount we live in */
+ unsigned int wk_type:8, /* type of request */
+ wk_state:24; /* state flags */
};
#define WK_DATA(wk) ((void *)(wk))
#define WK_PAGEDEP(wk) ((struct pagedep *)(wk))
#define WK_INODEDEP(wk) ((struct inodedep *)(wk))
#define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk))
+#define WK_NEWBLK(wk) ((struct newblk *)(wk))
#define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk))
#define WK_INDIRDEP(wk) ((struct indirdep *)(wk))
#define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk))
#define WK_FREEFRAG(wk) ((struct freefrag *)(wk))
#define WK_FREEBLKS(wk) ((struct freeblks *)(wk))
+#define WK_FREEWORK(wk) ((struct freework *)(wk))
#define WK_FREEFILE(wk) ((struct freefile *)(wk))
#define WK_DIRADD(wk) ((struct diradd *)(wk))
#define WK_MKDIR(wk) ((struct mkdir *)(wk))
#define WK_DIRREM(wk) ((struct dirrem *)(wk))
#define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk))
+#define WK_JADDREF(wk) ((struct jaddref *)(wk))
+#define WK_JREMREF(wk) ((struct jremref *)(wk))
+#define WK_JMVREF(wk) ((struct jmvref *)(wk))
+#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk))
+#define WK_JSEG(wk) ((struct jseg *)(wk))
+#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk))
+#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk))
+#define WK_FREEDEP(wk) ((struct freedep *)(wk))
+#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk))
+#define WK_SBDEP(wk) ((struct sbdep *)wk)
+#define WK_JTRUNC(wk) ((struct jtrunc *)(wk))
/*
* Various types of lists
@@ -165,6 +185,15 @@ LIST_HEAD(inodedephd, inodedep);
LIST_HEAD(allocindirhd, allocindir);
LIST_HEAD(allocdirecthd, allocdirect);
TAILQ_HEAD(allocdirectlst, allocdirect);
+LIST_HEAD(indirdephd, indirdep);
+LIST_HEAD(jaddrefhd, jaddref);
+LIST_HEAD(jremrefhd, jremref);
+LIST_HEAD(jmvrefhd, jmvref);
+LIST_HEAD(jnewblkhd, jnewblk);
+LIST_HEAD(jfreeblkhd, jfreeblk);
+LIST_HEAD(freeworkhd, freework);
+TAILQ_HEAD(jseglst, jseg);
+TAILQ_HEAD(inoreflst, inoref);
/*
* The "pagedep" structure tracks the various dependencies related to
@@ -192,9 +221,11 @@ struct pagedep {
LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */
ino_t pd_ino; /* associated file */
ufs_lbn_t pd_lbn; /* block within file */
+ struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */
struct dirremhd pd_dirremhd; /* dirrem's waiting for page */
struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */
struct diraddhd pd_pendinghd; /* directory entries awaiting write */
+ struct jmvrefhd pd_jmvrefhd; /* Dependent journal writes. */
};
/*
@@ -248,13 +279,18 @@ struct inodedep {
struct worklist id_list; /* buffer holding inode block */
# define id_state id_list.wk_state /* inode dependency state */
LIST_ENTRY(inodedep) id_hash; /* hashed lookup */
+ TAILQ_ENTRY(inodedep) id_unlinked; /* Unlinked but ref'd inodes */
struct fs *id_fs; /* associated filesystem */
ino_t id_ino; /* dependent inode */
nlink_t id_nlinkdelta; /* saved effective link count */
+ nlink_t id_savednlink; /* Link saved during rollback */
LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */
- struct buf *id_buf; /* related bmsafemap (if pending) */
+ struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */
+ struct diradd *id_mkdiradd; /* diradd for a mkdir. */
+ struct inoreflst id_inoreflst; /* Inode reference adjustments. */
long id_savedextsize; /* ext size saved during rollback */
off_t id_savedsize; /* file size saved during rollback */
+ struct dirremhd id_dirremhd; /* Removals pending. */
struct workhead id_pendinghd; /* entries awaiting directory write */
struct workhead id_bufwait; /* operations after inode written */
struct workhead id_inowait; /* operations waiting inode update */
@@ -271,23 +307,6 @@ struct inodedep {
#define id_savedino2 id_un.idu_savedino2
/*
- * A "newblk" structure is attached to a bmsafemap structure when a block
- * or fragment is allocated from a cylinder group. Its state is set to
- * DEPCOMPLETE when its cylinder group map is written. It is consumed by
- * an associated allocdirect or allocindir allocation which will attach
- * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag
- * is not set (i.e., its cylinder group map has not been written).
- */
-struct newblk {
- LIST_ENTRY(newblk) nb_hash; /* hashed lookup */
- struct fs *nb_fs; /* associated filesystem */
- int nb_state; /* state of bitmap dependency */
- ufs2_daddr_t nb_newblkno; /* allocated block number */
- LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */
- struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */
-};
-
-/*
* A "bmsafemap" structure maintains a list of dependency structures
* that depend on the update of a particular cylinder group map.
* It has lists for newblks, allocdirects, allocindirs, and inodedeps.
@@ -299,11 +318,41 @@ struct newblk {
*/
struct bmsafemap {
struct worklist sm_list; /* cylgrp buffer */
+# define sm_state sm_list.wk_state
+ int sm_cg;
+ LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */
struct buf *sm_buf; /* associated buffer */
struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */
+ struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */
struct allocindirhd sm_allocindirhd; /* allocindir deps */
+ struct allocindirhd sm_allocindirwr; /* writing allocindir deps */
struct inodedephd sm_inodedephd; /* inodedep deps */
+ struct inodedephd sm_inodedepwr; /* writing inodedep deps */
struct newblkhd sm_newblkhd; /* newblk deps */
+ struct newblkhd sm_newblkwr; /* writing newblk deps */
+ struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */
+ struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */
+};
+
+/*
+ * A "newblk" structure is attached to a bmsafemap structure when a block
+ * or fragment is allocated from a cylinder group. Its state is set to
+ * DEPCOMPLETE when its cylinder group map is written. It is converted to
+ * an allocdirect or allocindir allocation once the allocator calls the
+ * appropriate setup function.
+ */
+struct newblk {
+ struct worklist nb_list;
+# define nb_state nb_list.wk_state
+ LIST_ENTRY(newblk) nb_hash; /* hashed lookup */
+ LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblks */
+ struct jnewblk *nb_jnewblk; /* New block journal entry. */
+ struct bmsafemap *nb_bmsafemap;/* cylgrp dep (if pending) */
+ struct freefrag *nb_freefrag; /* fragment to be freed (if any) */
+ struct indirdephd nb_indirdeps; /* Children indirect blocks. */
+ struct workhead nb_newdirblk; /* dir block to notify when written */
+ struct workhead nb_jwork; /* Journal work pending. */
+ ufs2_daddr_t nb_newblkno; /* new value of block pointer */
};
/*
@@ -334,20 +383,18 @@ struct bmsafemap {
* and inodedep->id_pendinghd lists.
*/
struct allocdirect {
- struct worklist ad_list; /* buffer holding block */
-# define ad_state ad_list.wk_state /* block pointer state */
+ struct newblk ad_block; /* Common block logic */
+# define ad_state ad_block.nb_list.wk_state /* block pointer state */
TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */
- ufs_lbn_t ad_lbn; /* block within file */
- ufs2_daddr_t ad_newblkno; /* new value of block pointer */
- ufs2_daddr_t ad_oldblkno; /* old value of block pointer */
- long ad_newsize; /* size of new block */
- long ad_oldsize; /* size of old block */
- LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */
- struct buf *ad_buf; /* cylgrp buffer (if pending) */
struct inodedep *ad_inodedep; /* associated inodedep */
- struct freefrag *ad_freefrag; /* fragment to be freed (if any) */
- struct workhead ad_newdirblk; /* dir block to notify when written */
+ ufs2_daddr_t ad_oldblkno; /* old value of block pointer */
+ int ad_offset; /* Pointer offset in parent. */
+ long ad_newsize; /* size of new block */
+ long ad_oldsize; /* size of old block */
};
+#define ad_newblkno ad_block.nb_newblkno
+#define ad_freefrag ad_block.nb_freefrag
+#define ad_newdirblk ad_block.nb_newdirblk
/*
* A single "indirdep" structure manages all allocation dependencies for
@@ -369,10 +416,14 @@ struct allocdirect {
struct indirdep {
struct worklist ir_list; /* buffer holding indirect block */
# define ir_state ir_list.wk_state /* indirect block pointer state */
- caddr_t ir_saveddata; /* buffer cache contents */
+ LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */
+ caddr_t ir_saveddata; /* buffer cache contents */
struct buf *ir_savebp; /* buffer holding safe copy */
+ struct allocindirhd ir_completehd; /* waiting for indirdep complete */
+ struct allocindirhd ir_writehd; /* Waiting for the pointer write. */
struct allocindirhd ir_donehd; /* done waiting to update safecopy */
struct allocindirhd ir_deplisthd; /* allocindir deps for this block */
+ struct workhead ir_jwork; /* Journal work pending. */
};
/*
@@ -389,16 +440,25 @@ struct indirdep {
* can then be freed as it is no longer applicable.
*/
struct allocindir {
- struct worklist ai_list; /* buffer holding indirect block */
-# define ai_state ai_list.wk_state /* indirect block pointer state */
+ struct newblk ai_block; /* Common block area */
+# define ai_state ai_block.nb_list.wk_state /* indirect pointer state */
LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */
- int ai_offset; /* pointer offset in indirect block */
- ufs2_daddr_t ai_newblkno; /* new block pointer value */
- ufs2_daddr_t ai_oldblkno; /* old block pointer value */
- struct freefrag *ai_freefrag; /* block to be freed when complete */
struct indirdep *ai_indirdep; /* address of associated indirdep */
- LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */
- struct buf *ai_buf; /* cylgrp buffer (if pending) */
+ ufs2_daddr_t ai_oldblkno; /* old value of block pointer */
+ int ai_offset; /* Pointer offset in parent. */
+};
+#define ai_newblkno ai_block.nb_newblkno
+#define ai_freefrag ai_block.nb_freefrag
+#define ai_newdirblk ai_block.nb_newdirblk
+
+/*
+ * The allblk union is used to size the newblk structure on allocation so
+ * that it may be any one of three types.
+ */
+union allblk {
+ struct allocindir ab_allocindir;
+ struct allocdirect ab_allocdirect;
+ struct newblk ab_newblk;
};
/*
@@ -406,14 +466,13 @@ struct allocindir {
* allocated fragment is replaced with a larger fragment, rather than extended.
* The "freefrag" structure is constructed and attached when the replacement
* block is first allocated. It is processed after the inode claiming the
- * bigger block that replaces it has been written to disk. Note that the
- * ff_state field is is used to store the uid, so may lose data. However,
- * the uid is used only in printing an error message, so is not critical.
- * Keeping it in a short keeps the data structure down to 32 bytes.
+ * bigger block that replaces it has been written to disk.
*/
struct freefrag {
struct worklist ff_list; /* id_inowait or delayed worklist */
-# define ff_state ff_list.wk_state /* owning user; should be uid_t */
+# define ff_state ff_list.wk_state
+ struct jfreefrag *ff_jfreefrag; /* Associated journal entry. */
+ struct workhead ff_jwork; /* Journal work pending. */
ufs2_daddr_t ff_blkno; /* fragment physical block number */
long ff_fragsize; /* size of fragment being deleted */
ino_t ff_inum; /* owning inode number */
@@ -423,20 +482,57 @@ struct freefrag {
* A "freeblks" structure is attached to an "inodedep" when the
* corresponding file's length is reduced to zero. It records all
* the information needed to free the blocks of a file after its
- * zero'ed inode has been written to disk.
+ * zero'ed inode has been written to disk. The actual work is done
+ * by child freework structures which are responsible for individual
+ * inode pointers while freeblks is responsible for retiring the
+ * entire operation when it is complete and holding common members.
*/
struct freeblks {
struct worklist fb_list; /* id_inowait or delayed worklist */
# define fb_state fb_list.wk_state /* inode and dirty block state */
+ struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */
+ struct workhead fb_freeworkhd; /* Work items pending */
+ struct workhead fb_jwork; /* Journal work pending */
ino_t fb_previousinum; /* inode of previous owner of blocks */
uid_t fb_uid; /* uid of previous owner of blocks */
struct vnode *fb_devvp; /* filesystem device vnode */
- long fb_oldextsize; /* previous ext data size */
- off_t fb_oldsize; /* previous file size */
ufs2_daddr_t fb_chkcnt; /* used to check cnt of blks released */
- ufs2_daddr_t fb_dblks[NDADDR]; /* direct blk ptrs to deallocate */
- ufs2_daddr_t fb_iblks[NIADDR]; /* indirect blk ptrs to deallocate */
- ufs2_daddr_t fb_eblks[NXADDR]; /* indirect blk ptrs to deallocate */
+ int fb_ref; /* Children outstanding. */
+};
+
+/*
+ * A "freework" structure handles the release of a tree of blocks or a single
+ * block. Each indirect block in a tree is allocated its own freework
+ * structure so that the indrect block may be freed only when all of its
+ * children are freed. In this way we enforce the rule that an allocated
+ * block must have a valid path to a root that is journaled. Each child
+ * block acquires a reference and when the ref hits zero the parent ref
+ * is decremented. If there is no parent the freeblks ref is decremented.
+ */
+struct freework {
+ struct worklist fw_list;
+# define fw_state fw_list.wk_state
+ LIST_ENTRY(freework) fw_next; /* Queue for freeblks. */
+ struct freeblks *fw_freeblks; /* Root of operation. */
+ struct freework *fw_parent; /* Parent indirect. */
+ ufs2_daddr_t fw_blkno; /* Our block #. */
+ ufs_lbn_t fw_lbn; /* Original lbn before free. */
+ int fw_frags; /* Number of frags. */
+ int fw_ref; /* Number of children out. */
+ int fw_off; /* Current working position. */
+ struct workhead fw_jwork; /* Journal work pending. */
+};
+
+/*
+ * A "freedep" structure is allocated to track the completion of a bitmap
+ * write for a freework. One freedep may cover many freed blocks so long
+ * as they reside in the same cylinder group. When the cg is written
+ * the freedep decrements the ref on the freework which may permit it
+ * to be freed as well.
+ */
+struct freedep {
+ struct worklist fd_list;
+ struct freework *fd_freework; /* Parent freework. */
};
/*
@@ -450,6 +546,7 @@ struct freefile {
mode_t fx_mode; /* mode of inode */
ino_t fx_oldinum; /* inum of the unlinked file */
struct vnode *fx_devvp; /* filesystem device vnode */
+ struct workhead fx_jwork; /* journal work pending. */
};
/*
@@ -482,12 +579,11 @@ struct freefile {
* than zero.
*
* The overlaying of da_pagedep and da_previous is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. If a
- * da_previous entry is present, the pointer to its pagedep is available
- * in the associated dirrem entry. If the DIRCHG flag is set, the
- * da_previous entry is valid; if not set the da_pagedep entry is valid.
- * The DIRCHG flag never changes; it is set when the structure is created
- * if appropriate and is never cleared.
+ * structure down. If a da_previous entry is present, the pointer to its
+ * pagedep is available in the associated dirrem entry. If the DIRCHG flag
+ * is set, the da_previous entry is valid; if not set the da_pagedep entry
+ * is valid. The DIRCHG flag never changes; it is set when the structure
+ * is created if appropriate and is never cleared.
*/
struct diradd {
struct worklist da_list; /* id_inowait or id_pendinghd list */
@@ -499,6 +595,7 @@ struct diradd {
struct dirrem *dau_previous; /* entry being replaced in dir change */
struct pagedep *dau_pagedep; /* pagedep dependency for addition */
} da_un;
+ struct workhead da_jwork; /* Journal work awaiting completion. */
};
#define da_previous da_un.dau_previous
#define da_pagedep da_un.dau_pagedep
@@ -525,12 +622,13 @@ struct diradd {
* mkdir structures that reference it. The deletion would be faster if the
* diradd structure were simply augmented to have two pointers that referenced
* the associated mkdir's. However, this would increase the size of the diradd
- * structure from 32 to 64-bits to speed a very infrequent operation.
+ * structure to speed a very infrequent operation.
*/
struct mkdir {
struct worklist md_list; /* id_inowait or buffer holding dir */
# define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */
struct diradd *md_diradd; /* associated diradd */
+ struct jaddref *md_jaddref; /* dependent jaddref. */
struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */
LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */
};
@@ -542,20 +640,19 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
* list of the pagedep for the directory page that contains the entry.
* It is processed after the directory page with the deleted entry has
* been written to disk.
- *
- * The overlaying of dm_pagedep and dm_dirinum is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. It works
- * because they are never used concurrently.
*/
struct dirrem {
struct worklist dm_list; /* delayed worklist */
# define dm_state dm_list.wk_state /* state of the old directory entry */
LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */
+ LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */
+ struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */
ino_t dm_oldinum; /* inum of the removed dir entry */
union {
struct pagedep *dmu_pagedep; /* pagedep dependency for remove */
ino_t dmu_dirinum; /* parent inode number (for rmdir) */
} dm_un;
+ struct workhead dm_jwork; /* Journal work awaiting completion. */
};
#define dm_pagedep dm_un.dmu_pagedep
#define dm_dirinum dm_un.dmu_dirinum
@@ -577,9 +674,200 @@ struct dirrem {
* blocks using a similar scheme with the allocindir structures. Rather
* than adding this level of complexity, we simply write those newly
* allocated indirect blocks synchronously as such allocations are rare.
+ * In the case of a new directory the . and .. links are tracked with
+ * a mkdir rather than a pagedep. In this case we track the mkdir
+ * so it can be released when it is written. A workhead is used
+ * to simplify canceling a mkdir that is removed by a subsequent dirrem.
*/
struct newdirblk {
struct worklist db_list; /* id_inowait or pg_newdirblk */
# define db_state db_list.wk_state /* unused */
struct pagedep *db_pagedep; /* associated pagedep */
+ struct workhead db_mkdir;
+};
+
+/*
+ * The inoref structure holds the elements common to jaddref and jremref
+ * so they may easily be queued in-order on the inodedep.
+ */
+struct inoref {
+ struct worklist if_list;
+# define if_state if_list.wk_state
+ TAILQ_ENTRY(inoref) if_deps; /* Links for inodedep. */
+ struct jsegdep *if_jsegdep;
+ off_t if_diroff; /* Directory offset. */
+ ino_t if_ino; /* Inode number. */
+ ino_t if_parent; /* Parent inode number. */
+ nlink_t if_nlink; /* nlink before addition. */
+ uint16_t if_mode; /* File mode, needed for IFMT. */
+};
+
+/*
+ * A "jaddref" structure tracks a new reference (link count) on an inode
+ * and prevents the link count increase and bitmap allocation until a
+ * journal entry can be written. Once the journal entry is written,
+ * the inode is put on the pendinghd of the bmsafemap and a diradd or
+ * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE
+ * flag is used to indicate that all of the required information for writing
+ * the journal entry is present. MKDIR_BODY and MKDIR_PARENT are used to
+ * differentiate . and .. links from regular file names. NEWBLOCK indicates
+ * a bitmap is still pending. If a new reference is canceled by a delete
+ * prior to writing the journal the jaddref write is canceled and the
+ * structure persists to prevent any disk-visible changes until it is
+ * ultimately released when the file is freed or the link is dropped again.
+ */
+struct jaddref {
+ struct inoref ja_ref;
+# define ja_list ja_ref.if_list /* Journal pending or jseg entries. */
+# define ja_state ja_ref.if_list.wk_state
+ LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */
+ union {
+ struct diradd *jau_diradd; /* Pending diradd. */
+ struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */
+ } ja_un;
+};
+#define ja_diradd ja_un.jau_diradd
+#define ja_mkdir ja_un.jau_mkdir
+#define ja_diroff ja_ref.if_diroff
+#define ja_ino ja_ref.if_ino
+#define ja_parent ja_ref.if_parent
+#define ja_mode ja_ref.if_mode
+
+/*
+ * A "jremref" structure tracks a removed reference (unlink) on an
+ * inode and prevents the directory remove from proceeding until the
+ * journal entry is written. Once the journal has been written the remove
+ * may proceed as normal.
+ */
+struct jremref {
+ struct inoref jr_ref;
+# define jr_list jr_ref.if_list /* Journal pending or jseg entries. */
+# define jr_state jr_ref.if_list.wk_state
+ LIST_ENTRY(jremref) jr_deps; /* Links for pagedep. */
+ struct dirrem *jr_dirrem; /* Back pointer to dirrem. */
+};
+
+struct jmvref {
+ struct worklist jm_list;
+ LIST_ENTRY(jmvref) jm_deps;
+ struct pagedep *jm_pagedep;
+ ino_t jm_parent;
+ ino_t jm_ino;
+ off_t jm_oldoff;
+ off_t jm_newoff;
+};
+
+/*
+ * A "jnewblk" structure tracks a newly allocated block or fragment and
+ * prevents the direct or indirect block pointer as well as the cg bitmap
+ * from being written until it is logged. After it is logged the jsegdep
+ * is attached to the allocdirect or allocindir until the operation is
+ * completed or reverted. If the operation is reverted prior to the journal
+ * write the jnewblk structure is maintained to prevent the bitmaps from
+ * reaching the disk. Ultimately the jnewblk structure will be passed
+ * to the free routine as the in memory cg is modified back to the free
+ * state at which time it can be released.
+ */
+struct jnewblk {
+ struct worklist jn_list;
+# define jn_state jn_list.wk_state
+ struct jsegdep *jn_jsegdep;
+ LIST_ENTRY(jnewblk) jn_deps; /* All jnewblks on bmsafemap */
+ struct newblk *jn_newblk;
+ ino_t jn_ino;
+ ufs_lbn_t jn_lbn;
+ ufs2_daddr_t jn_blkno;
+ int jn_oldfrags;
+ int jn_frags;
+};
+
+/*
+ * A "jfreeblk" structure tracks the journal write for freeing a block
+ * or tree of blocks. The block pointer must not be cleared in the inode
+ * or indirect prior to the jfreeblk being written.
+ */
+struct jfreeblk {
+ struct worklist jf_list;
+# define jf_state jf_list.wk_state
+ struct jsegdep *jf_jsegdep;
+ struct freeblks *jf_freeblks;
+ LIST_ENTRY(jfreeblk) jf_deps;
+ ino_t jf_ino;
+ ufs_lbn_t jf_lbn;
+ ufs2_daddr_t jf_blkno;
+ int jf_frags;
+};
+
+/*
+ * A "jfreefrag" tracks the freeing of a single block when a fragment is
+ * extended or an indirect page is replaced. It is not part of a larger
+ * freeblks operation.
+ */
+struct jfreefrag {
+ struct worklist fr_list;
+# define fr_state fr_list.wk_state
+ struct jsegdep *fr_jsegdep;
+ struct freefrag *fr_freefrag;
+ ino_t fr_ino;
+ ufs_lbn_t fr_lbn;
+ ufs2_daddr_t fr_blkno;
+ int fr_frags;
+};
+
+/*
+ * A "jtrunc" journals the intent to truncate an inode to a non-zero
+ * value. This is done synchronously prior to the synchronous partial
+ * truncation process. The jsegdep is not released until the truncation
+ * is complete and the truncated inode is fsync'd.
+ */
+struct jtrunc {
+ struct worklist jt_list;
+ struct jsegdep *jt_jsegdep;
+ ino_t jt_ino;
+ off_t jt_size;
+ int jt_extsize;
+};
+
+/*
+ * A "jsegdep" structure tracks a single reference to a written journal
+ * segment so the journal space can be reclaimed when all dependencies
+ * have been written.
+ */
+struct jsegdep {
+ struct worklist jd_list;
+# define jd_state jd_list.wk_state
+ struct jseg *jd_seg;
+};
+
+/*
+ * A "jseg" structure contains all of the journal records written in a
+ * single disk write. jaddref and jremref structures are linked into
+ * js_entries so they may be completed when the write completes. The
+ * js_deps array contains as many entries as there are ref counts to
+ * reduce the number of allocations required per journal write to one.
+ */
+struct jseg {
+ struct worklist js_list; /* b_deps link for journal */
+# define js_state js_list.wk_state
+ struct workhead js_entries; /* Entries awaiting write */
+ TAILQ_ENTRY(jseg) js_next;
+ struct jblocks *js_jblocks; /* Back pointer to block/seg list */
+ struct buf *js_buf; /* Buffer while unwritten */
+ uint64_t js_seq;
+ int js_size; /* Allocated size in bytes */
+ int js_cnt; /* Total items allocated */
+ int js_refs; /* Count of items pending completion */
+};
+
+/*
+ * A 'sbdep' structure tracks the head of the free inode list and
+ * superblock writes. This makes sure the superblock is always pointing at
+ * the first possible unlinked inode for the suj recovery process. If a
+ * block write completes and we discover a new head is available the buf
+ * is dirtied and the dep is kept.
+ */
+struct sbdep {
+ struct worklist sb_list; /* b_dep linkage */
+ struct fs *sb_fs; /* Filesystem pointer within buf. */
+ struct ufsmount *sb_ump;
};
diff --git a/sys/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h
index 7f9e7c56496e..c75257c8e62d 100644
--- a/sys/ufs/ufs/dinode.h
+++ b/sys/ufs/ufs/dinode.h
@@ -146,7 +146,8 @@ struct ufs2_dinode {
ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */
ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */
u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */
- int64_t di_spare[2]; /* 240: Reserved; currently unused */
+ ino_t di_freelink; /* 240: SUJ: Next unlinked inode. */
+ uint32_t di_spare[3]; /* 244: Reserved; currently unused */
};
/*
@@ -167,9 +168,7 @@ struct ufs2_dinode {
struct ufs1_dinode {
u_int16_t di_mode; /* 0: IFMT, permissions; see below. */
int16_t di_nlink; /* 2: File link count. */
- union {
- u_int16_t oldids[2]; /* 4: Ffs: old user and group ids. */
- } di_u;
+ ino_t di_freelink; /* 4: SUJ: Next unlinked inode. */
u_int64_t di_size; /* 8: File byte count. */
int32_t di_atime; /* 16: Last access time. */
int32_t di_atimensec; /* 20: Last access time. */
@@ -186,7 +185,5 @@ struct ufs1_dinode {
u_int32_t di_gid; /* 116: File group. */
u_int64_t di_modrev; /* 120: i_modrev for NFSv4 */
};
-#define di_ogid di_u.oldids[1]
-#define di_ouid di_u.oldids[0]
#endif /* _UFS_UFS_DINODE_H_ */
diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h
index 565580e60460..295b12975e25 100644
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@@ -120,7 +120,7 @@ struct inode {
#define IN_CHANGE 0x0002 /* Inode change time update request. */
#define IN_UPDATE 0x0004 /* Modification time update request. */
#define IN_MODIFIED 0x0008 /* Inode has been modified. */
-#define IN_RENAME 0x0010 /* Inode is being renamed. */
+#define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */
#define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */
#define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */
#define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the
@@ -175,6 +175,7 @@ struct indir {
/* Determine if soft dependencies are being done */
#define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
#define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
+#define DOINGSUJ(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_SUJ)
/* This overlays the fid structure (see mount.h). */
struct ufid {
diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c
index c85fdc8980f6..d7c1d0ddb821 100644
--- a/sys/ufs/ufs/ufs_dirhash.c
+++ b/sys/ufs/ufs/ufs_dirhash.c
@@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$");
static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
-static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
-
static int ufs_mindirhashsize = DIRBLKSIZ * 5;
SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
&ufs_mindirhashsize,
diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h
index b2e4a9757305..6658b663fb14 100644
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *);
int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *,
struct buf *, int *, int *);
int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **);
-int ufs_checkpath(ino_t, struct inode *, struct ucred *);
+int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *);
void ufs_dirbad(struct inode *, doff_t, char *);
int ufs_dirbadentry(struct vnode *, struct direct *, int);
int ufs_dirempty(struct inode *, ino_t, struct ucred *);
@@ -66,9 +66,11 @@ int ufs_extwrite(struct vop_write_args *);
void ufs_makedirentry(struct inode *, struct componentname *,
struct direct *);
int ufs_direnter(struct vnode *, struct vnode *, struct direct *,
- struct componentname *, struct buf *);
+ struct componentname *, struct buf *, int);
int ufs_dirremove(struct vnode *, struct inode *, int, int);
int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int);
+int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *,
+ ino_t *);
int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *);
int ufs_inactive(struct vop_inactive_args *);
int ufs_init(struct vfsconf *);
@@ -81,19 +83,33 @@ vfs_root_t ufs_root;
int ufs_uninit(struct vfsconf *);
int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **);
+#include <sys/sysctl.h>
+SYSCTL_DECL(_vfs_ufs);
+
/*
* Soft update function prototypes.
*/
int softdep_setup_directory_add(struct buf *, struct inode *, off_t,
ino_t, struct buf *, int);
-void softdep_change_directoryentry_offset(struct inode *, caddr_t,
- caddr_t, caddr_t, int);
+void softdep_change_directoryentry_offset(struct buf *, struct inode *,
+ caddr_t, caddr_t, caddr_t, int);
void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int);
void softdep_setup_directory_change(struct buf *, struct inode *,
struct inode *, ino_t, int);
void softdep_change_linkcnt(struct inode *);
void softdep_releasefile(struct inode *);
int softdep_slowdown(struct vnode *);
+void softdep_setup_create(struct inode *, struct inode *);
+void softdep_setup_dotdot_link(struct inode *, struct inode *);
+void softdep_setup_link(struct inode *, struct inode *);
+void softdep_setup_mkdir(struct inode *, struct inode *);
+void softdep_setup_rmdir(struct inode *, struct inode *);
+void softdep_setup_unlink(struct inode *, struct inode *);
+void softdep_revert_create(struct inode *, struct inode *);
+void softdep_revert_dotdot_link(struct inode *, struct inode *);
+void softdep_revert_link(struct inode *, struct inode *);
+void softdep_revert_mkdir(struct inode *, struct inode *);
+void softdep_revert_rmdir(struct inode *, struct inode *);
/*
* Flags to low-level allocation routines. The low 16-bits are reserved
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
index ab71cf6df550..0030c5264bd1 100644
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
/* true if old FS format...*/
#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0)
-static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *,
- ino_t *);
-
static int
ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred,
struct thread *td)
@@ -189,11 +186,11 @@ ufs_lookup(ap)
} */ *ap;
{
- return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
+ return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
}
-static int
-ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
+int
+ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
ino_t *dd_ino)
{
struct inode *dp; /* inode for directory being searched */
@@ -524,6 +521,8 @@ notfound:
return (ENOENT);
found:
+ if (dd_ino != NULL)
+ *dd_ino = ino;
if (numdirpasses == 2)
nchstats.ncs_pass2++;
/*
@@ -546,11 +545,6 @@ found:
if ((flags & ISLASTCN) && nameiop == LOOKUP)
dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1);
- if (dd_ino != NULL) {
- *dd_ino = ino;
- return (0);
- }
-
/*
* If deleting, and at end of pathname, return
* parameters which can be used to remove file.
@@ -558,17 +552,6 @@ found:
if (nameiop == DELETE && (flags & ISLASTCN)) {
if (flags & LOCKPARENT)
ASSERT_VOP_ELOCKED(vdp, __FUNCTION__);
- if ((error = VFS_VGET(vdp->v_mount, ino,
- LK_EXCLUSIVE, &tdp)) != 0)
- return (error);
-
- error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
- if (error) {
- vput(tdp);
- return (error);
- }
-
-
/*
* Return pointer to current entry in dp->i_offset,
* and distance past previous entry (if there
@@ -585,6 +568,16 @@ found:
dp->i_count = 0;
else
dp->i_count = dp->i_offset - prevoff;
+ if (dd_ino != NULL)
+ return (0);
+ if ((error = VFS_VGET(vdp->v_mount, ino,
+ LK_EXCLUSIVE, &tdp)) != 0)
+ return (error);
+ error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
+ if (error) {
+ vput(tdp);
+ return (error);
+ }
if (dp->i_number == ino) {
VREF(vdp);
*vpp = vdp;
@@ -616,6 +609,8 @@ found:
dp->i_offset = i_offset;
if (dp->i_number == ino)
return (EISDIR);
+ if (dd_ino != NULL)
+ return (0);
if ((error = VFS_VGET(vdp->v_mount, ino,
LK_EXCLUSIVE, &tdp)) != 0)
return (error);
@@ -650,6 +645,8 @@ found:
cnp->cn_flags |= SAVENAME;
return (0);
}
+ if (dd_ino != NULL)
+ return (0);
/*
* Step through the translation in the name. We do not `vput' the
@@ -681,7 +678,7 @@ found:
* to the inode we looked up before vdp lock was
* dropped.
*/
- error = ufs_lookup_(pdp, NULL, cnp, &ino1);
+ error = ufs_lookup_ino(pdp, NULL, cnp, &ino1);
if (error) {
vput(tdp);
return (error);
@@ -833,12 +830,13 @@ ufs_makedirentry(ip, cnp, newdirp)
* soft dependency code).
*/
int
-ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
+ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename)
struct vnode *dvp;
struct vnode *tvp;
struct direct *dirp;
struct componentname *cnp;
struct buf *newdirbp;
+ int isrename;
{
struct ucred *cr;
struct thread *td;
@@ -911,22 +909,28 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
blkoff += DIRBLKSIZ;
}
if (softdep_setup_directory_add(bp, dp, dp->i_offset,
- dirp->d_ino, newdirbp, 1) == 0) {
- bdwrite(bp);
+ dirp->d_ino, newdirbp, 1))
+ dp->i_flag |= IN_NEEDSYNC;
+ if (newdirbp)
+ bdwrite(newdirbp);
+ bdwrite(bp);
+ if ((dp->i_flag & IN_NEEDSYNC) == 0)
return (UFS_UPDATE(dvp, 0));
- }
- /* We have just allocated a directory block in an
- * indirect block. Rather than tracking when it gets
- * claimed by the inode, we simply do a VOP_FSYNC
- * now to ensure that it is there (in case the user
- * does a future fsync). Note that we have to unlock
- * the inode for the entry that we just entered, as
- * the VOP_FSYNC may need to lock other inodes which
- * can lead to deadlock if we also hold a lock on
- * the newly entered node.
+ /*
+ * We have just allocated a directory block in an
+ * indirect block. We must prevent holes in the
+ * directory created if directory entries are
+ * written out of order. To accomplish this we
+ * fsync when we extend a directory into indirects.
+ * During rename it's not safe to drop the tvp lock
+ * so sync must be delayed until it is.
+ *
+ * This synchronous step could be removed if fsck and
+ * the kernel were taught to fill in sparse
+ * directories rather than panic.
*/
- if ((error = bwrite(bp)))
- return (error);
+ if (isrename)
+ return (0);
if (tvp != NULL)
VOP_UNLOCK(tvp, 0);
error = VOP_FSYNC(dvp, MNT_WAIT, td);
@@ -1015,7 +1019,7 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
dp->i_offset + ((char *)ep - dirbuf));
#endif
if (DOINGSOFTDEP(dvp))
- softdep_change_directoryentry_offset(dp, dirbuf,
+ softdep_change_directoryentry_offset(bp, dp, dirbuf,
(caddr_t)nep, (caddr_t)ep, dsize);
else
bcopy((caddr_t)nep, (caddr_t)ep, dsize);
@@ -1067,6 +1071,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
(void) softdep_setup_directory_add(bp, dp,
dp->i_offset + (caddr_t)ep - dirbuf,
dirp->d_ino, newdirbp, 0);
+ if (newdirbp != NULL)
+ bdwrite(newdirbp);
bdwrite(bp);
} else {
if (DOINGASYNC(dvp)) {
@@ -1084,7 +1090,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
* lock other inodes which can lead to deadlock if we also hold a
* lock on the newly entered node.
*/
- if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
+ if (isrename == 0 && error == 0 &&
+ dp->i_endoff && dp->i_endoff < dp->i_size) {
if (tvp != NULL)
VOP_UNLOCK(tvp, 0);
#ifdef UFS_DIRHASH
@@ -1125,6 +1132,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
dp = VTOI(dvp);
+ /*
+ * Adjust the link count early so softdep can block if necessary.
+ */
+ if (ip) {
+ ip->i_effnlink--;
+ if (DOINGSOFTDEP(dvp)) {
+ softdep_setup_unlink(dp, ip);
+ } else {
+ ip->i_nlink--;
+ DIP_SET(ip, i_nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ }
+ }
if (flags & DOWHITEOUT) {
/*
* Whiteout entry: set d_ino to WINO.
@@ -1154,6 +1174,9 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, rep, dp->i_offset);
#endif
+ if (ip && rep->d_ino != ip->i_number)
+ panic("ufs_dirremove: ip %d does not match dirent ino %d\n",
+ ip->i_number, rep->d_ino);
if (dp->i_count == 0) {
/*
* First entry in block: set d_ino to zero.
@@ -1172,31 +1195,20 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
dp->i_offset & ~(DIRBLKSIZ - 1));
#endif
out:
+ error = 0;
if (DOINGSOFTDEP(dvp)) {
- if (ip) {
- ip->i_effnlink--;
- softdep_change_linkcnt(ip);
+ if (ip)
softdep_setup_remove(bp, dp, ip, isrmdir);
- }
- if (softdep_slowdown(dvp)) {
+ if (softdep_slowdown(dvp))
error = bwrite(bp);
- } else {
+ else
bdwrite(bp);
- error = 0;
- }
} else {
- if (ip) {
- ip->i_effnlink--;
- ip->i_nlink--;
- DIP_SET(ip, i_nlink, ip->i_nlink);
- ip->i_flag |= IN_CHANGE;
- }
if (flags & DOWHITEOUT)
error = bwrite(bp);
- else if (DOINGASYNC(dvp) && dp->i_count != 0) {
+ else if (DOINGASYNC(dvp) && dp->i_count != 0)
bdwrite(bp);
- error = 0;
- } else
+ else
error = bwrite(bp);
}
dp->i_flag |= IN_CHANGE | IN_UPDATE;
@@ -1229,6 +1241,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
struct vnode *vdp = ITOV(dp);
int error;
+ /*
+ * Drop the link before we lock the buf so softdep can block if
+ * necessary.
+ */
+ oip->i_effnlink--;
+ if (DOINGSOFTDEP(vdp)) {
+ softdep_setup_unlink(dp, oip);
+ } else {
+ oip->i_nlink--;
+ DIP_SET(oip, i_nlink, oip->i_nlink);
+ oip->i_flag |= IN_CHANGE;
+ }
+
error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp);
if (error)
return (error);
@@ -1240,15 +1265,10 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
ep->d_ino = newinum;
if (!OFSFMT(vdp))
ep->d_type = newtype;
- oip->i_effnlink--;
if (DOINGSOFTDEP(vdp)) {
- softdep_change_linkcnt(oip);
softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
bdwrite(bp);
} else {
- oip->i_nlink--;
- DIP_SET(oip, i_nlink, oip->i_nlink);
- oip->i_flag |= IN_CHANGE;
if (DOINGASYNC(vdp)) {
bdwrite(bp);
error = 0;
@@ -1363,25 +1383,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino)
/*
* Check if source directory is in the path of the target directory.
- * Target is supplied locked, source is unlocked.
- * The target is always vput before returning.
*/
int
-ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
+ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino)
{
- struct vnode *vp, *vp1;
+ struct mount *mp;
+ struct vnode *tvp, *vp, *vp1;
int error;
ino_t dd_ino;
- vp = ITOV(target);
- if (target->i_number == source_ino) {
- error = EEXIST;
- got