author     Kirk McKusick <mckusick@FreeBSD.org>    2010-04-28 05:33:59 +0000
committer  Kirk McKusick <mckusick@FreeBSD.org>    2010-04-28 05:33:59 +0000
commit     a4bf5fb987611aeb78c422312b63b185e39982d7 (patch)
tree       a65d36ab57a1e076de7e7a1d78add642fbd7062e /sys/ufs
parent     509210970393a1a8cd8a65d5340dc4bed069fa68 (diff)
parent     b641222476732f1f99d2362f093b79bbe088d764 (diff)
Update to current version of head.

Notes:
    svn path=/projects/quota64/; revision=207307
Diffstat (limited to 'sys/ufs')
-rw-r--r--  sys/ufs/ffs/ffs_alloc.c      252
-rw-r--r--  sys/ufs/ffs/ffs_balloc.c      13
-rw-r--r--  sys/ufs/ffs/ffs_extern.h      24
-rw-r--r--  sys/ufs/ffs/ffs_inode.c      132
-rw-r--r--  sys/ufs/ffs/ffs_snapshot.c    66
-rw-r--r--  sys/ufs/ffs/ffs_softdep.c   7289
-rw-r--r--  sys/ufs/ffs/ffs_subr.c       130
-rw-r--r--  sys/ufs/ffs/ffs_vfsops.c      69
-rw-r--r--  sys/ufs/ffs/ffs_vnops.c        1
-rw-r--r--  sys/ufs/ffs/fs.h             135
-rw-r--r--  sys/ufs/ffs/softdep.h        446
-rw-r--r--  sys/ufs/ufs/dinode.h           9
-rw-r--r--  sys/ufs/ufs/inode.h            3
-rw-r--r--  sys/ufs/ufs/ufs_dirhash.c      2
-rw-r--r--  sys/ufs/ufs/ufs_extern.h      24
-rw-r--r--  sys/ufs/ufs/ufs_lookup.c     192
-rw-r--r--  sys/ufs/ufs/ufs_vnops.c      583
-rw-r--r--  sys/ufs/ufs/ufsmount.h         9
18 files changed, 7576 insertions, 1803 deletions
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index 7bf117719726..b1f7ba0127f7 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -94,24 +94,24 @@ __FBSDID("$FreeBSD$");
#include <ufs/ffs/ffs_extern.h>
typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
- int size);
+ int size, int rsize);
-static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int);
+static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
static ufs2_daddr_t
- ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t);
+ ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
#ifdef INVARIANTS
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
-static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
-static void ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *,
- ufs1_daddr_t, int);
+static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int,
+ int);
static ino_t ffs_dirpref(struct inode *);
static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
int, int);
static void ffs_fserr(struct fs *, ino_t, char *);
static ufs2_daddr_t ffs_hashalloc
- (struct inode *, u_int, ufs2_daddr_t, int, allocfcn_t *);
-static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int);
+ (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
+static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
+ int);
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
@@ -188,7 +188,7 @@ retry:
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
- bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
+ bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
if (bno > 0) {
delta = btodb(size);
if (ip->i_flag & IN_SPACECOUNTED) {
@@ -387,16 +387,12 @@ retry:
panic("ffs_realloccg: bad optim");
/* NOTREACHED */
}
- bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
+ bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize,
- ip->i_number);
- if (nsize < request)
- ffs_blkfree(ump, fs, ip->i_devvp,
- bno + numfrags(fs, nsize),
- (long)(request - nsize), ip->i_number);
+ ip->i_number, NULL);
delta = btodb(nsize - osize);
if (ip->i_flag & IN_SPACECOUNTED) {
UFS_LOCK(ump);
@@ -487,6 +483,14 @@ ffs_reallocblks(ap)
if (doreallocblks == 0)
return (ENOSPC);
+ /*
+ * We can't wait in softdep prealloc as it may fsync and recurse
+ * here. Instead we simply fail to reallocate blocks if this
+ * rare condition arises.
+ */
+ if (DOINGSOFTDEP(ap->a_vp))
+ if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
+ return (ENOSPC);
if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1)
return (ffs_reallocblks_ufs1(ap));
return (ffs_reallocblks_ufs2(ap));
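
The softdep_prealloc() hook used above is new in this patch (its prototype is added to ffs_extern.h further down). Reading the call sites, it gives soft updates a chance to make journal resources available before an operation begins: the ordinary allocation paths may sleep for it, while the reallocation path must not. Condensed from ffs_balloc.c and ffs_alloc.c in this same diff:

    /* Ordinary allocation (ffs_balloc_ufs1/ufs2): safe to wait. */
    if (DOINGSOFTDEP(vp))
        softdep_prealloc(vp, MNT_WAIT);

    /*
     * Reallocation (ffs_reallocblks): waiting could fsync and recurse
     * back into this routine, so the optimization is abandoned instead.
     */
    if (DOINGSOFTDEP(ap->a_vp))
        if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
            return (ENOSPC);
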
@@ -587,7 +591,7 @@ ffs_reallocblks_ufs1(ap)
* Search the block map looking for an allocation of the desired size.
*/
if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
- len, ffs_clusteralloc)) == 0) {
+ len, len, ffs_clusteralloc)) == 0) {
UFS_UNLOCK(ump);
goto fail;
}
@@ -673,7 +677,7 @@ ffs_reallocblks_ufs1(ap)
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
@@ -795,7 +799,7 @@ ffs_reallocblks_ufs2(ap)
* Search the block map looking for an allocation of the desired size.
*/
if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
- len, ffs_clusteralloc)) == 0) {
+ len, len, ffs_clusteralloc)) == 0) {
UFS_UNLOCK(ump);
goto fail;
}
@@ -881,7 +885,7 @@ ffs_reallocblks_ufs2(ap)
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
@@ -969,7 +973,7 @@ ffs_valloc(pvp, mode, cred, vpp)
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
- ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode,
+ ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
(allocfcn_t *)ffs_nodealloccg);
if (ino == 0)
goto noinodes;
@@ -1278,11 +1282,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap)
*/
/*VARARGS5*/
static ufs2_daddr_t
-ffs_hashalloc(ip, cg, pref, size, allocator)
+ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
struct inode *ip;
u_int cg;
ufs2_daddr_t pref;
- int size; /* size for data blocks, mode for inodes */
+ int size; /* Search size for data blocks, mode for inodes */
+ int rsize; /* Real allocated size. */
allocfcn_t *allocator;
{
struct fs *fs;
@@ -1298,7 +1303,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
/*
* 1: preferred cylinder group
*/
- result = (*allocator)(ip, cg, pref, size);
+ result = (*allocator)(ip, cg, pref, size, rsize);
if (result)
return (result);
/*
@@ -1308,7 +1313,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
- result = (*allocator)(ip, cg, 0, size);
+ result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
}
@@ -1319,7 +1324,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
- result = (*allocator)(ip, cg, 0, size);
+ result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
cg++;
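
For context, ffs_hashalloc() probes cylinder groups in three phases. The loop headers are elided from the hunks above; the sketch below assumes they follow the long-standing FFS pattern, with only the rsize plumbing being new in this patch:

    u_int icg = cg;        /* remember the starting group */

    /* 1: the preferred cylinder group. */
    result = (*allocator)(ip, cg, pref, size, rsize);
    if (result)
        return (result);
    /* 2: quadratic rehash -- the probe offset doubles each step. */
    for (i = 1; i < fs->fs_ncg; i *= 2) {
        cg += i;
        if (cg >= fs->fs_ncg)
            cg -= fs->fs_ncg;
        result = (*allocator)(ip, cg, 0, size, rsize);
        if (result)
            return (result);
    }
    /* 3: brute force, starting two groups past the starting group. */
    cg = (icg + 2) % fs->fs_ncg;
    for (i = 2; i < fs->fs_ncg; i++) {
        result = (*allocator)(ip, cg, 0, size, rsize);
        if (result)
            return (result);
        cg++;
        if (cg == fs->fs_ncg)
            cg = 0;
    }
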
@@ -1401,7 +1406,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize)
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
+ frags, numfrags(fs, osize));
bdwrite(bp);
return (bprev);
@@ -1419,11 +1425,12 @@ fail:
* and if it is, allocate it.
*/
static ufs2_daddr_t
-ffs_alloccg(ip, cg, bpref, size)
+ffs_alloccg(ip, cg, bpref, size, rsize)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int size;
+ int rsize;
{
struct fs *fs;
struct cg *cgp;
@@ -1451,7 +1458,7 @@ ffs_alloccg(ip, cg, bpref, size)
cgp->cg_old_time = cgp->cg_time = time_second;
if (size == fs->fs_bsize) {
UFS_LOCK(ump);
- blkno = ffs_alloccgblk(ip, bp, bpref);
+ blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
@@ -1475,21 +1482,14 @@ ffs_alloccg(ip, cg, bpref, size)
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
UFS_LOCK(ump);
- blkno = ffs_alloccgblk(ip, bp, bpref);
- bno = dtogd(fs, blkno);
- for (i = frags; i < fs->fs_frag; i++)
- setbit(blksfree, bno + i);
- i = fs->fs_frag - frags;
- cgp->cg_cs.cs_nffree += i;
- fs->fs_cstotal.cs_nffree += i;
- fs->fs_cs(fs, cg).cs_nffree += i;
- fs->fs_fmod = 1;
- cgp->cg_frsum[i]++;
+ blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (blkno);
}
+ KASSERT(size == rsize,
+ ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
if (bno < 0)
goto fail;
@@ -1507,7 +1507,7 @@ ffs_alloccg(ip, cg, bpref, size)
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
bdwrite(bp);
return (blkno);
@@ -1529,10 +1529,11 @@ fail:
* blocks may be fragmented by the routine that allocates them.
*/
static ufs2_daddr_t
-ffs_alloccgblk(ip, bp, bpref)
+ffs_alloccgblk(ip, bp, bpref, size)
struct inode *ip;
struct buf *bp;
ufs2_daddr_t bpref;
+ int size;
{
struct fs *fs;
struct cg *cgp;
@@ -1540,6 +1541,7 @@ ffs_alloccgblk(ip, bp, bpref)
ufs1_daddr_t bno;
ufs2_daddr_t blkno;
u_int8_t *blksfree;
+ int i;
fs = ip->i_fs;
ump = ip->i_ump;
@@ -1567,16 +1569,32 @@ ffs_alloccgblk(ip, bp, bpref)
gotit:
blkno = fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, (long)blkno);
- ffs_clusteracct(ump, fs, cgp, blkno, -1);
+ ffs_clusteracct(fs, cgp, blkno, -1);
cgp->cg_cs.cs_nbfree--;
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
fs->fs_fmod = 1;
blkno = cgbase(fs, cgp->cg_cgx) + bno;
+ /*
+ * If the caller didn't want the whole block, free the frags here.
+ */
+ size = numfrags(fs, size);
+ if (size != fs->fs_frag) {
+ bno = dtogd(fs, blkno);
+ for (i = size; i < fs->fs_frag; i++)
+ setbit(blksfree, bno + i);
+ i = fs->fs_frag - size;
+ cgp->cg_cs.cs_nffree += i;
+ fs->fs_cstotal.cs_nffree += i;
+ fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
+ fs->fs_fmod = 1;
+ cgp->cg_frsum[i]++;
+ }
/* XXX Fixme. */
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
+ size, 0);
UFS_LOCK(ump);
return (blkno);
}
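
With the new size argument, ffs_alloccgblk() itself hands back the unwanted tail of a full block as free fragments, replacing the identical bookkeeping deleted from ffs_alloccg() above. A worked example, assuming a hypothetical geometry:

    /*
     * fs_bsize = 16384, fs_fsize = 2048, so fs_frag = 8.  A caller
     * requesting rsize = 6144 bytes still takes a whole block, but
     * size = numfrags(fs, 6144) = 3, so frags 3..7 are set free:
     *
     *     cs_nffree += 8 - 3 = 5;    five loose frags returned
     *     cg_frsum[5]++;             one run of exactly 5 free frags
     */
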
@@ -1589,11 +1607,12 @@ gotit:
* take the first one that we find following bpref.
*/
static ufs2_daddr_t
-ffs_clusteralloc(ip, cg, bpref, len)
+ffs_clusteralloc(ip, cg, bpref, len, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int len;
+ int unused;
{
struct fs *fs;
struct cg *cgp;
@@ -1689,7 +1708,7 @@ ffs_clusteralloc(ip, cg, bpref, len)
len = blkstofrags(fs, len);
UFS_LOCK(ump);
for (i = 0; i < len; i += fs->fs_frag)
- if (ffs_alloccgblk(ip, bp, bno + i) != bno + i)
+ if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
panic("ffs_clusteralloc: lost block");
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
@@ -1713,11 +1732,12 @@ fail:
* inode in the specified cylinder group.
*/
static ufs2_daddr_t
-ffs_nodealloccg(ip, cg, ipref, mode)
+ffs_nodealloccg(ip, cg, ipref, mode, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t ipref;
int mode;
+ int unused;
{
struct fs *fs;
struct cg *cgp;
@@ -1820,28 +1840,6 @@ gotit:
}
/*
- * check if a block is free
- */
-static int
-ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
-{
-
- switch ((int)fs->fs_frag) {
- case 8:
- return (cp[h] == 0);
- case 4:
- return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
- case 2:
- return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
- case 1:
- return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
- default:
- panic("ffs_isfreeblock");
- }
- return (0);
-}
-
-/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
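
ffs_isfreeblock() is removed from this file but not from the tree: a prototype for it is added to ffs_extern.h later in this diff, and the diffstat shows ffs_subr.c growing by 130 lines, so the body presumably moves there. For reference, the deleted switch decodes the per-fragment free bitmap, in which one block occupies fs_frag consecutive bits:

    /*
     * Example for fs_frag = 4: block h occupies one nibble of byte
     * h >> 1.  For h = 5 that is byte 2, shift (5 & 0x1) << 2 = 4,
     * mask 0x0f << 4 = 0xf0 -- the block is free only when all four
     * fragment bits under that mask are zero:
     *
     *     (cp[2] & 0xf0) == 0
     */
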
@@ -1849,14 +1847,16 @@ ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
* block reassembly is checked.
*/
void
-ffs_blkfree(ump, fs, devvp, bno, size, inum)
+ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
+ struct workhead *dephd;
{
+ struct mount *mp;
struct cg *cgp;
struct buf *bp;
ufs1_daddr_t fragno, cgbno;
@@ -1923,7 +1923,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
panic("ffs_blkfree: freeing free block");
}
ffs_setblock(fs, blksfree, fragno);
- ffs_clusteracct(ump, fs, cgp, fragno, 1);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -1963,7 +1963,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
cgp->cg_cs.cs_nffree -= fs->fs_frag;
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
- ffs_clusteracct(ump, fs, cgp, fragno, 1);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -1972,6 +1972,10 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
+ mp = UFSTOVFS(ump);
+ if (mp->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+ softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
+ numfrags(fs, size), dephd);
bdwrite(bp);
}
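
ffs_blkfree() now takes a struct workhead pointer so SU+J callers can hand over journal work items that softdep_setup_blkfree() ties to the cylinder-group buffer write; the hook fires only on soft-updates mounts and only when freeing through the device vnode (snapshot vnodes are VREG and bypass it). Nearly every call site in this patch passes NULL; a journaling caller would presumably look like this hypothetical sketch:

    struct workhead wkhd;    /* hypothetical journaling caller */

    LIST_INIT(&wkhd);
    /* ... journal dependencies appended to wkhd elsewhere ... */
    ffs_blkfree(ump, fs, devvp, bno, size, inum, &wkhd);
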
@@ -2042,7 +2046,8 @@ ffs_vfree(pvp, ino, mode)
return (0);
}
ip = VTOI(pvp);
- return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode));
+ return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode,
+ NULL));
}
/*
@@ -2050,12 +2055,13 @@ ffs_vfree(pvp, ino, mode)
* The specified inode is placed back in the free map.
*/
int
-ffs_freefile(ump, fs, devvp, ino, mode)
+ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ino_t ino;
int mode;
+ struct workhead *wkhd;
{
struct cg *cgp;
struct buf *bp;
@@ -2112,6 +2118,9 @@ ffs_freefile(ump, fs, devvp, ino, mode)
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
+ if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+ softdep_setup_inofree(UFSTOVFS(ump), bp,
+ ino + cg * fs->fs_ipg, wkhd);
bdwrite(bp);
return (0);
}
@@ -2226,101 +2235,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz)
}
/*
- * Update the cluster map because of an allocation or free.
- *
- * Cnt == 1 means free; cnt == -1 means allocating.
- */
-void
-ffs_clusteracct(ump, fs, cgp, blkno, cnt)
- struct ufsmount *ump;
- struct fs *fs;
- struct cg *cgp;
- ufs1_daddr_t blkno;
- int cnt;
-{
- int32_t *sump;
- int32_t *lp;
- u_char *freemapp, *mapp;
- int i, start, end, forw, back, map, bit;
-
- mtx_assert(UFS_MTX(ump), MA_OWNED);
-
- if (fs->fs_contigsumsize <= 0)
- return;
- freemapp = cg_clustersfree(cgp);
- sump = cg_clustersum(cgp);
- /*
- * Allocate or clear the actual block.
- */
- if (cnt > 0)
- setbit(freemapp, blkno);
- else
- clrbit(freemapp, blkno);
- /*
- * Find the size of the cluster going forward.
- */
- start = blkno + 1;
- end = start + fs->fs_contigsumsize;
- if (end >= cgp->cg_nclusterblks)
- end = cgp->cg_nclusterblks;
- mapp = &freemapp[start / NBBY];
- map = *mapp++;
- bit = 1 << (start % NBBY);
- for (i = start; i < end; i++) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != (NBBY - 1)) {
- bit <<= 1;
- } else {
- map = *mapp++;
- bit = 1;
- }
- }
- forw = i - start;
- /*
- * Find the size of the cluster going backward.
- */
- start = blkno - 1;
- end = start - fs->fs_contigsumsize;
- if (end < 0)
- end = -1;
- mapp = &freemapp[start / NBBY];
- map = *mapp--;
- bit = 1 << (start % NBBY);
- for (i = start; i > end; i--) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != 0) {
- bit >>= 1;
- } else {
- map = *mapp--;
- bit = 1 << (NBBY - 1);
- }
- }
- back = start - i;
- /*
- * Account for old cluster and the possibly new forward and
- * back clusters.
- */
- i = back + forw + 1;
- if (i > fs->fs_contigsumsize)
- i = fs->fs_contigsumsize;
- sump[i] += cnt;
- if (back > 0)
- sump[back] -= cnt;
- if (forw > 0)
- sump[forw] -= cnt;
- /*
- * Update cluster summary information.
- */
- lp = &sump[fs->fs_contigsumsize];
- for (i = fs->fs_contigsumsize; i > 0; i--)
- if (*lp-- > 0)
- break;
- fs->fs_maxcluster[cgp->cg_cgx] = i;
-}
-
-/*
* Fserr prints the name of a filesystem with an error diagnostic.
*
* The form of the error message is:
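
Like ffs_isfreeblock() above, ffs_clusteracct() leaves this file; the prototype added to ffs_extern.h below drops the struct ufsmount argument, matching the updated callers earlier in this diff. Since the body is deleted here, a worked example of its summary arithmetic may help:

    /*
     * Freeing (cnt = 1) a block whose neighbours already form free
     * clusters of back = 2 and forw = 1 blocks, with
     * fs_contigsumsize >= 4:
     *
     *     i = back + forw + 1 = 4;
     *     sump[4] += 1;        new merged 4-block cluster
     *     sump[2] -= 1;        old backward cluster absorbed
     *     sump[1] -= 1;        old forward cluster absorbed
     *
     * fs_maxcluster[cg] is then recomputed as the largest i for
     * which sump[i] > 0.
     */
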
@@ -2540,7 +2454,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
#endif /* DEBUG */
while (cmd.size > 0) {
if ((error = ffs_freefile(ump, fs, ump->um_devvp,
- cmd.value, filetype)))
+ cmd.value, filetype, NULL)))
break;
cmd.size -= 1;
cmd.value += 1;
@@ -2568,7 +2482,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
if (blksize > blkcnt)
blksize = blkcnt;
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
- blksize * fs->fs_fsize, ROOTINO);
+ blksize * fs->fs_fsize, ROOTINO, NULL);
blkno += blksize;
blkcnt -= blksize;
blksize = fs->fs_frag;
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index a12f96e60d0e..6d5f27c1f306 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -120,6 +120,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
if (lbn < 0)
return (EFBIG);
+ if (DOINGSOFTDEP(vp))
+ softdep_prealloc(vp, MNT_WAIT);
/*
* If the next write will extend the file into a new block,
* and the file is currently composed of a fragment
@@ -418,6 +420,8 @@ fail:
* slow, running out of disk space is not expected to be a common
* occurrence. The error return from fsync is ignored as we already
* have an error to return to the user.
+ *
+ * XXX Still have to journal the free below
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -473,7 +477,7 @@ fail:
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
}
return (error);
}
@@ -515,6 +519,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
if (lbn < 0)
return (EFBIG);
+ if (DOINGSOFTDEP(vp))
+ softdep_prealloc(vp, MNT_WAIT);
+
/*
* Check for allocating external data.
*/
@@ -930,6 +937,8 @@ fail:
* slow, running out of disk space is not expected to be a common
* occurrence. The error return from fsync is ignored as we already
* have an error to return to the user.
+ *
+ * XXX Still have to journal the free below
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -985,7 +994,7 @@ fail:
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
}
return (error);
}
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index 7e32ced2ebe2..7011623749ba 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -47,6 +47,7 @@ struct ucred;
struct vnode;
struct vop_fsync_args;
struct vop_reallocblks_args;
+struct workhead;
int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int,
struct ucred *, ufs2_daddr_t *);
@@ -56,20 +57,23 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size,
struct ucred *a_cred, int a_flags, struct buf **a_bpp);
int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
- ufs2_daddr_t, long, ino_t);
+ ufs2_daddr_t, long, ino_t, struct workhead *);
ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
void ffs_bdflush(struct bufobj *, struct buf *);
int ffs_copyonwrite(struct vnode *, struct buf *);
int ffs_flushfiles(struct mount *, int, struct thread *);
void ffs_fragacct(struct fs *, int, int32_t [], int);
int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t,
- int);
+ int, struct workhead *);
int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
+int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
int ffs_mountroot(void);
+void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
int ffs_reallocblks(struct vop_reallocblks_args *);
int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
ufs2_daddr_t, int, int, int, struct ucred *, struct buf **);
@@ -103,12 +107,14 @@ extern struct vop_vector ffs_fifoops2;
int softdep_check_suspend(struct mount *, struct vnode *,
int, int, int, int);
+int softdep_complete_trunc(struct vnode *, void *);
void softdep_get_depcounts(struct mount *, int *, int *);
void softdep_initialize(void);
void softdep_uninitialize(void);
int softdep_mount(struct vnode *, struct mount *, struct fs *,
struct ucred *);
-void softdep_move_dependencies(struct buf *, struct buf *);
+void softdep_unmount(struct mount *);
+int softdep_move_dependencies(struct buf *, struct buf *);
int softdep_flushworklist(struct mount *, int *, struct thread *);
int softdep_flushfiles(struct mount *, int, struct thread *);
void softdep_update_inodeblock(struct inode *, struct buf *, int);
@@ -117,7 +123,8 @@ void softdep_freefile(struct vnode *, ino_t, int);
int softdep_request_cleanup(struct fs *, struct vnode *);
void softdep_setup_freeblocks(struct inode *, off_t, int);
void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t);
-void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t);
+void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t,
+ int, int);
void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t,
ufs2_daddr_t, long, long, struct buf *);
void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t,
@@ -126,11 +133,20 @@ void softdep_setup_allocindir_meta(struct buf *, struct inode *,
struct buf *, int, ufs2_daddr_t);
void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t,
struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *);
+void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int,
+ struct workhead *);
+void softdep_setup_inofree(struct mount *, struct buf *, ino_t,
+ struct workhead *);
+void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
+void *softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
void softdep_fsync_mountdev(struct vnode *);
int softdep_sync_metadata(struct vnode *);
int softdep_process_worklist(struct mount *, int);
int softdep_fsync(struct vnode *);
int softdep_waitidle(struct mount *);
+int softdep_prealloc(struct vnode *, int);
+int softdep_journal_lookup(struct mount *, struct vnode **);
+
int ffs_rdonly(struct inode *);
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index b2f906730121..3b6983258b93 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -92,15 +92,6 @@ ffs_update(vp, waitfor)
fs = ip->i_fs;
if (fs->fs_ronly)
return (0);
- /*
- * Ensure that uid and gid are correct. This is a temporary
- * fix until fsck has been changed to do the update.
- */
- if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
- fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
- ip->i_din1->di_ouid = ip->i_uid; /* XXX */
- ip->i_din1->di_ogid = ip->i_gid; /* XXX */
- } /* XXX */
error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, NOCRED, &bp);
if (error) {
@@ -160,6 +151,7 @@ ffs_truncate(vp, length, flags, cred, td)
ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
ufs2_daddr_t count, blocksreleased = 0, datablocks;
+ void *cookie;
struct bufobj *bo;
struct fs *fs;
struct buf *bp;
@@ -173,11 +165,14 @@ ffs_truncate(vp, length, flags, cred, td)
fs = ip->i_fs;
ump = ip->i_ump;
bo = &vp->v_bufobj;
+ cookie = NULL;
ASSERT_VOP_LOCKED(vp, "ffs_truncate");
if (length < 0)
return (EINVAL);
+ if (length > fs->fs_maxfilesize)
+ return (EFBIG);
/*
* Historically clients did not have to specify which data
* they were truncating. So, if not specified, we assume
@@ -192,6 +187,7 @@ ffs_truncate(vp, length, flags, cred, td)
* (e.g., the file is being unlinked), then pick it off with
* soft updates below.
*/
+ allerror = 0;
needextclean = 0;
softdepslowdown = DOINGSOFTDEP(vp) && softdep_slowdown(vp);
extblocks = 0;
@@ -212,6 +208,8 @@ ffs_truncate(vp, length, flags, cred, td)
panic("ffs_truncate: partial trunc of extdata");
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
return (error);
+ if (DOINGSUJ(vp))
+ cookie = softdep_setup_trunc(vp, length, flags);
osize = ip->i_din2->di_extsize;
ip->i_din2->di_blocks -= extblocks;
#ifdef QUOTA
@@ -227,19 +225,19 @@ ffs_truncate(vp, length, flags, cred, td)
}
ip->i_flag |= IN_CHANGE;
if ((error = ffs_update(vp, 1)))
- return (error);
+ goto out;
for (i = 0; i < NXADDR; i++) {
if (oldblks[i] == 0)
continue;
ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i],
- sblksize(fs, osize, i), ip->i_number);
+ sblksize(fs, osize, i), ip->i_number, NULL);
}
}
}
- if ((flags & IO_NORMAL) == 0)
- return (0);
- if (length > fs->fs_maxfilesize)
- return (EFBIG);
+ if ((flags & IO_NORMAL) == 0) {
+ error = 0;
+ goto out;
+ }
if (vp->v_type == VLNK &&
(ip->i_size < vp->v_mount->mnt_maxsymlinklen ||
datablocks == 0)) {
@@ -253,24 +251,52 @@ ffs_truncate(vp, length, flags, cred, td)
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
- return (ffs_update(vp, 1));
+ error = ffs_update(vp, 1);
+ goto out;
}
if (ip->i_size == length) {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
- return (ffs_update(vp, 0));
+ error = ffs_update(vp, 0);
+ goto out;
}
if (fs->fs_ronly)
panic("ffs_truncate: read-only filesystem");
#ifdef QUOTA
error = getinoquota(ip);
if (error)
- return (error);
+ goto out;
#endif
if ((ip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(vp);
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+ osize = ip->i_size;
+ /*
+ * Lengthen the size of the file. We must ensure that the
+ * last byte of the file is allocated. Since the smallest
+ * value of osize is 0, length will be at least 1.
+ */
+ if (osize < length) {
+ vnode_pager_setsize(vp, length);
+ flags |= BA_CLRBUF;
+ error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
+ if (error) {
+ vnode_pager_setsize(vp, osize);
+ goto out;
+ }
+ ip->i_size = length;
+ DIP_SET(ip, i_size, length);
+ if (bp->b_bufsize == fs->fs_bsize)
+ bp->b_flags |= B_CLUSTEROK;
+ if (flags & IO_SYNC)
+ bwrite(bp);
+ else
+ bawrite(bp);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ error = ffs_update(vp, 1);
+ goto out;
+ }
if (DOINGSOFTDEP(vp)) {
if (length > 0 || softdepslowdown) {
/*
@@ -283,11 +309,18 @@ ffs_truncate(vp, length, flags, cred, td)
* so that it will have no data structures left.
*/
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
- return (error);
+ goto out;
UFS_LOCK(ump);
if (ip->i_flag & IN_SPACECOUNTED)
fs->fs_pendingblocks -= datablocks;
UFS_UNLOCK(ump);
+ /*
+ * We have to journal the truncation before we change
+ * any blocks so we don't leave the file partially
+ * truncated.
+ */
+ if (DOINGSUJ(vp) && cookie == NULL)
+ cookie = softdep_setup_trunc(vp, length, flags);
} else {
#ifdef QUOTA
(void) chkdq(ip, -datablocks, NOCRED, 0);
@@ -301,33 +334,9 @@ ffs_truncate(vp, length, flags, cred, td)
OFF_TO_IDX(lblktosize(fs, -extblocks)));
vnode_pager_setsize(vp, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
- return (ffs_update(vp, 0));
- }
- }
- osize = ip->i_size;
- /*
- * Lengthen the size of the file. We must ensure that the
- * last byte of the file is allocated. Since the smallest
- * value of osize is 0, length will be at least 1.
- */
- if (osize < length) {
- vnode_pager_setsize(vp, length);
- flags |= BA_CLRBUF;
- error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
- if (error) {
- vnode_pager_setsize(vp, osize);
- return (error);
+ error = ffs_update(vp, 0);
+ goto out;
}
- ip->i_size = length;
- DIP_SET(ip, i_size, length);
- if (bp->b_bufsize == fs->fs_bsize)
- bp->b_flags |= B_CLUSTEROK;
- if (flags & IO_SYNC)
- bwrite(bp);
- else
- bawrite(bp);
- ip->i_flag |= IN_CHANGE | IN_UPDATE;
- return (ffs_update(vp, 1));
}
/*
* Shorten the size of the file. If the file is not being
@@ -345,9 +354,8 @@ ffs_truncate(vp, length, flags, cred, td)
lbn = lblkno(fs, length);
flags |= BA_CLRBUF;
error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
- if (error) {
- return (error);
- }
+ if (error)
+ goto out;
/*
* When we are doing soft updates and the UFS_BALLOC
* above fills in a direct block hole with a full sized
@@ -359,7 +367,7 @@ ffs_truncate(vp, length, flags, cred, td)
if (DOINGSOFTDEP(vp) && lbn < NDADDR &&
fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
(error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
- return (error);
+ goto out;
ip->i_size = length;
DIP_SET(ip, i_size, length);
size = blksize(fs, ip, lbn);
@@ -405,7 +413,13 @@ ffs_truncate(vp, length, flags, cred, td)
DIP_SET(ip, i_db[i], 0);
}
ip->i_flag |= IN_CHANGE | IN_UPDATE;
- allerror = ffs_update(vp, 1);
+ /*
+ * When doing soft updates journaling we must preserve the size along
+ * with the old pointers until they are freed, or we might not
+ * know how many fragments remain.
+ */
+ if (!DOINGSUJ(vp))
+ allerror = ffs_update(vp, 1);
/*
* Having written the new inode to disk, save its new configuration
@@ -445,7 +459,7 @@ ffs_truncate(vp, length, flags, cred, td)
if (lastiblock[level] < 0) {
DIP_SET(ip, i_ib[level], 0);
ffs_blkfree(ump, fs, ip->i_devvp, bn,
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
blocksreleased += nblocks;
}
}
@@ -464,7 +478,8 @@ ffs_truncate(vp, length, flags, cred, td)
continue;
DIP_SET(ip, i_db[i], 0);
bsize = blksize(fs, ip, i);
- ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number);
+ ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number,
+ NULL);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
@@ -496,7 +511,7 @@ ffs_truncate(vp, length, flags, cred, td)
*/
bn += numfrags(fs, newspace);
ffs_blkfree(ump, fs, ip->i_devvp, bn,
- oldspace - newspace, ip->i_number);
+ oldspace - newspace, ip->i_number, NULL);
blocksreleased += btodb(oldspace - newspace);
}
}
@@ -528,7 +543,14 @@ done:
#ifdef QUOTA
(void) chkdq(ip, -blocksreleased, NOCRED, 0);
#endif
- return (allerror);
+ error = allerror;
+out:
+ if (cookie) {
+ allerror = softdep_complete_trunc(vp, cookie);
+ if (allerror != 0 && error == 0)
+ error = allerror;
+ }
+ return (error);
}
/*
@@ -638,7 +660,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
blocksreleased += blkcount;
}
ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
blocksreleased += nblocks;
}
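
The ffs_truncate() changes above follow a cookie protocol for SU+J: the truncation must be journaled before any block pointers change, and every return is rewritten to funnel through the out: label so the cookie is always completed. Condensed from the hunks above:

    void *cookie = NULL;

    if (DOINGSUJ(vp))
        cookie = softdep_setup_trunc(vp, length, flags);
    /* ... shrink or grow the file, freeing blocks as needed ... */
    error = allerror;
out:
    if (cookie) {
        allerror = softdep_complete_trunc(vp, cookie);
        if (allerror != 0 && error == 0)
            error = allerror;
    }
    return (error);
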
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index b36cb58808bd..11362cfbc755 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -142,7 +142,7 @@ MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
- ufs_lbn_t, int), int);
+ ufs_lbn_t, int), int, int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
@@ -155,7 +155,7 @@ static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
- ufs_lbn_t, int), int);
+ ufs_lbn_t, int), int, int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
@@ -582,7 +582,8 @@ loop:
len = fragroundup(fs, blkoff(fs, xp->i_size));
if (len != 0 && len < fs->fs_bsize) {
ffs_blkfree(ump, copy_fs, vp,
- DIP(xp, i_db[loc]), len, xp->i_number);
+ DIP(xp, i_db[loc]), len, xp->i_number,
+ NULL);
blkno = DIP(xp, i_db[loc]);
DIP_SET(xp, i_db[loc], 0);
}
@@ -590,15 +591,15 @@ loop:
snaplistsize += 1;
if (xp->i_ump->um_fstype == UFS1)
error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
- BLK_NOCOPY);
+ BLK_NOCOPY, 1);
else
error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
- BLK_NOCOPY);
+ BLK_NOCOPY, 1);
if (blkno)
DIP_SET(xp, i_db[loc], blkno);
if (!error)
error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
- xp->i_mode);
+ xp->i_mode, NULL);
VOP_UNLOCK(xvp, 0);
vdrop(xvp);
if (error) {
@@ -612,6 +613,26 @@ loop:
}
MNT_IUNLOCK(mp);
/*
+ * Erase the journal file from the snapshot.
+ */
+ if (fs->fs_flags & FS_SUJ) {
+ error = softdep_journal_lookup(mp, &xvp);
+ if (error) {
+ free(copy_fs->fs_csp, M_UFSMNT);
+ bawrite(sbp);
+ sbp = NULL;
+ goto out1;
+ }
+ xp = VTOI(xvp);
+ if (xp->i_ump->um_fstype == UFS1)
+ error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
+ BLK_NOCOPY, 0);
+ else
+ error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
+ BLK_NOCOPY, 0);
+ vput(xvp);
+ }
+ /*
* Acquire a lock on the snapdata structure, creating it if necessary.
*/
sn = ffs_snapdata_acquire(devvp);
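
The expunge_ufs1()/expunge_ufs2() routines gain a clearmode flag so the caller, rather than the expunge type, decides whether di_mode is zeroed in the snapshot's copy of the inode (the UFS2 path also now honors i_effnlink, matching UFS1). The call patterns in this diff, summarized for the UFS2 case:

    /* Active files captured by a new snapshot: clear the mode. */
    expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 1);

    /*
     * The SU+J journal file: release its blocks but presumably keep
     * di_mode so the inode remains valid within the snapshot.
     */
    expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 0);

    /* Earlier snapshots, and this snapshot's own block map: */
    expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP, 0);
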
@@ -691,16 +712,16 @@ out1:
break;
if (xp->i_ump->um_fstype == UFS1)
error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
- BLK_SNAP);
+ BLK_SNAP, 0);
else
error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
- BLK_SNAP);
+ BLK_SNAP, 0);
if (error == 0 && xp->i_effnlink == 0) {
error = ffs_freefile(ump,
copy_fs,
vp,
xp->i_number,
- xp->i_mode);
+ xp->i_mode, NULL);
}
if (error) {
fs->fs_snapinum[snaploc] = 0;
@@ -719,9 +740,11 @@ out1:
* the list of allocated blocks in i_snapblklist.
*/
if (ip->i_ump->um_fstype == UFS1)
- error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
+ error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
+ BLK_SNAP, 0);
else
- error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
+ error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
+ BLK_SNAP, 0);
if (error) {
fs->fs_snapinum[snaploc] = 0;
free(snapblklist, M_UFSMNT);
@@ -954,13 +977,14 @@ cgaccount(cg, vp, nbp, passno)
* is reproduced once each for UFS1 and UFS2.
*/
static int
-expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
struct vnode *snapvp;
struct inode *cancelip;
struct fs *fs;
int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
struct fs *, ufs_lbn_t, int);
int expungetype;
+ int clearmode;
{
int i, error, indiroff;
ufs_lbn_t lbn, rlbn;
@@ -1005,7 +1029,7 @@ expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
*/
dip = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
- if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
+ if (clearmode || cancelip->i_effnlink == 0)
dip->di_mode = 0;
dip->di_size = 0;
dip->di_blocks = 0;
@@ -1220,7 +1244,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
- ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+ ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
}
return (0);
}
@@ -1234,13 +1258,14 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
* is reproduced once each for UFS1 and UFS2.
*/
static int
-expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
struct vnode *snapvp;
struct inode *cancelip;
struct fs *fs;
int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
struct fs *, ufs_lbn_t, int);
int expungetype;
+ int clearmode;
{
int i, error, indiroff;
ufs_lbn_t lbn, rlbn;
@@ -1285,7 +1310,7 @@ expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
*/
dip = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
- if (expungetype == BLK_NOCOPY)
+ if (clearmode || cancelip->i_effnlink == 0)
dip->di_mode = 0;
dip->di_size = 0;
dip->di_blocks = 0;
@@ -1500,7 +1525,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
- ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+ ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
}
return (0);
}
@@ -1657,6 +1682,13 @@ ffs_snapremove(vp)
ip->i_flags &= ~SF_SNAPSHOT;
DIP_SET(ip, i_flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ /*
+ * The dirtied indirects must be written out before
+ * softdep_setup_freeblocks() is called. Otherwise indir_trunc()
+ * may find indirect pointers using the magic BLK_* values.
+ */
+ if (DOINGSOFTDEP(vp))
+ ffs_syncvnode(vp, MNT_WAIT);
#ifdef QUOTA
/*
* Reenable disk quotas for ex-snapshot file.
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 4d652c114dd1..4a659f9de7ba 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -1,5 +1,7 @@
/*-
- * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
+ * Copyright 1998, 2000 Marshall Kirk McKusick.
+ * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
*
* The soft updates code is derived from the appendix of a University
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
@@ -23,17 +25,16 @@
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
- * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
*/
@@ -50,6 +51,7 @@ __FBSDID("$FreeBSD$");
#ifndef DEBUG
#define DEBUG
#endif
+#define SUJ_DEBUG
#include <sys/param.h>
#include <sys/kernel.h>
@@ -62,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
+#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
@@ -130,10 +133,12 @@ softdep_setup_inomapdep(bp, ip, newinum)
}
void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
struct buf *bp;
struct mount *mp;
ufs2_daddr_t newblkno;
+ int frags;
+ int oldfrags;
{
panic("softdep_setup_blkmapdep called");
@@ -227,7 +232,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
}
void
-softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
+softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
+ struct buf *bp;
struct inode *dp;
caddr_t base;
caddr_t oldloc;
@@ -403,31 +409,13 @@ softdep_get_depcounts(struct mount *mp,
* These definitions need to be adapted to the system to which
* this file is being ported.
*/
-/*
- * malloc types defined for the softdep system.
- */
-static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
-static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
-static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
-static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
-static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
-static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
-static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
-static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
-static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
-static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
-static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
-static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
-static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
-static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
-static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
#define D_PAGEDEP 0
#define D_INODEDEP 1
-#define D_NEWBLK 2
-#define D_BMSAFEMAP 3
+#define D_BMSAFEMAP 2
+#define D_NEWBLK 3
#define D_ALLOCDIRECT 4
#define D_INDIRDEP 5
#define D_ALLOCINDIR 6
@@ -438,7 +426,67 @@ static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
#define D_MKDIR 11
#define D_DIRREM 12
#define D_NEWDIRBLK 13
-#define D_LAST D_NEWDIRBLK
+#define D_FREEWORK 14
+#define D_FREEDEP 15
+#define D_JADDREF 16
+#define D_JREMREF 17
+#define D_JMVREF 18
+#define D_JNEWBLK 19
+#define D_JFREEBLK 20
+#define D_JFREEFRAG 21
+#define D_JSEG 22
+#define D_JSEGDEP 23
+#define D_SBDEP 24
+#define D_JTRUNC 25
+#define D_LAST D_JTRUNC
+
+unsigned long dep_current[D_LAST + 1];
+unsigned long dep_total[D_LAST + 1];
+
+
+SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
+SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
+ "total dependencies allocated");
+SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
+ "current dependencies allocated");
+
+#define SOFTDEP_TYPE(type, str, long) \
+ static MALLOC_DEFINE(M_ ## type, #str, long); \
+ SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \
+ &dep_total[D_ ## type], 0, ""); \
+ SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \
+ &dep_current[D_ ## type], 0, "");
+
+SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
+SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
+SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
+ "Block or frag allocated from cyl group map");
+SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
+SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
+SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
+SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
+SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
+SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
+SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
+SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
+SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
+SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
+SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
+SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
+SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
+SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
+SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
+SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
+SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
+SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
+SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
+SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
+SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
+SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
+SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
+
+static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
+static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
/*
* translate from workitem type to memory type
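
The SOFTDEP_TYPE() macro introduced above folds the old per-type MALLOC_DEFINE lines into one definition and pairs each dependency type with two read-only sysctl counters; for instance, SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies") expands to roughly:

    static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
    SYSCTL_LONG(_debug_softdep_total, OID_AUTO, pagedep, CTLFLAG_RD,
        &dep_total[D_PAGEDEP], 0, "");
    SYSCTL_LONG(_debug_softdep_current, OID_AUTO, pagedep, CTLFLAG_RD,
        &dep_current[D_PAGEDEP], 0, "");
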
@@ -447,8 +495,8 @@ static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
static struct malloc_type *memtype[] = {
M_PAGEDEP,
M_INODEDEP,
- M_NEWBLK,
M_BMSAFEMAP,
+ M_NEWBLK,
M_ALLOCDIRECT,
M_INDIRDEP,
M_ALLOCINDIR,
@@ -458,7 +506,19 @@ static struct malloc_type *memtype[] = {
M_DIRADD,
M_MKDIR,
M_DIRREM,
- M_NEWDIRBLK
+ M_NEWDIRBLK,
+ M_FREEWORK,
+ M_FREEDEP,
+ M_JADDREF,
+ M_JREMREF,
+ M_JMVREF,
+ M_JNEWBLK,
+ M_JFREEBLK,
+ M_JFREEFRAG,
+ M_JSEG,
+ M_JSEGDEP,
+ M_SBDEP,
+ M_JTRUNC
};
#define DtoM(type) (memtype[type])
@@ -467,17 +527,21 @@ static struct malloc_type *memtype[] = {
* Names of malloc types.
*/
#define TYPENAME(type) \
- ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
+ ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
* End system adaptation definitions.
*/
+#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino)
+#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino)
+
/*
* Forward declarations.
*/
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;
+struct bmsafemap_hashhead;
/*
* Internal function prototypes.
@@ -487,59 +551,172 @@ static void drain_output(struct vnode *);
static struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static void clear_remove(struct thread *);
static void clear_inodedeps(struct thread *);
+static void unlinked_inodedep(struct mount *, struct inodedep *);
+static void clear_unlinked_inodedep(struct inodedep *);
+static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
struct diraddhd *);
+static void free_pagedep(struct pagedep *);
+static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
static int flush_inodedep_deps(struct mount *, ino_t);
static int flush_deplist(struct allocdirectlst *, int, int *);
static int handle_written_filepage(struct pagedep *, struct buf *);
+static int handle_written_sbdep(struct sbdep *, struct buf *);
+static void initiate_write_sbdep(struct sbdep *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
+static int handle_written_indirdep(struct indirdep *, struct buf *,
+ struct buf**);
static int handle_written_inodeblock(struct inodedep *, struct buf *);
-static void handle_allocdirect_partdone(struct allocdirect *);
+static int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
+static void handle_written_jaddref(struct jaddref *);
+static void handle_written_jremref(struct jremref *);
+static void handle_written_jseg(struct jseg *, struct buf *);
+static void handle_written_jnewblk(struct jnewblk *);
+static void handle_written_jfreeblk(struct jfreeblk *);
+static void handle_written_jfreefrag(struct jfreefrag *);
+static void complete_jseg(struct jseg *);
+static void jseg_write(struct fs *, struct jblocks *, struct jseg *,
+ uint8_t *);
+static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
+static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
+static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
+static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
+static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
+static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
+static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
+static inline void inoref_write(struct inoref *, struct jseg *,
+ struct jrefrec *);
+static void handle_allocdirect_partdone(struct allocdirect *,
+ struct workhead *);
+static void cancel_newblk(struct newblk *, struct workhead *);
+static void indirdep_complete(struct indirdep *);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
+static void initiate_write_indirdep(struct indirdep*, struct buf *);
static void handle_written_mkdir(struct mkdir *, int);
+static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static void handle_workitem_freefile(struct freefile *);
static void handle_workitem_remove(struct dirrem *, struct vnode *);
static struct dirrem *newdirrem(struct buf *, struct inode *,
struct inode *, int, struct dirrem **);
-static void free_diradd(struct diradd *);
-static void free_allocindir(struct allocindir *, struct inodedep *);
+static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
+ struct freeblks *);
+static void free_indirdep(struct indirdep *);
+static void free_diradd(struct diradd *, struct workhead *);
+static void merge_diradd(struct inodedep *, struct diradd *);
+static void complete_diradd(struct diradd *);
+static struct diradd *diradd_lookup(struct pagedep *, int);
+static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
+ struct jremref *);
+static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
+ struct jremref *);
+static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
+ struct jremref *, struct jremref *);
+static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
+ struct jremref *);
+static void cancel_allocindir(struct allocindir *, struct inodedep *,
+ struct freeblks *);
+static void complete_mkdir(struct mkdir *);
static void free_newdirblk(struct newdirblk *);
-static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
- ufs2_daddr_t *);
-static void deallocate_dependencies(struct buf *, struct inodedep *);
-static void free_allocdirect(struct allocdirectlst *,
- struct allocdirect *, int);
+static void free_jremref(struct jremref *);
+static void free_jaddref(struct jaddref *);
+static void free_jsegdep(struct jsegdep *);
+static void free_jseg(struct jseg *);
+static void free_jnewblk(struct jnewblk *);
+static void free_jfreeblk(struct jfreeblk *);
+static void free_jfreefrag(struct jfreefrag *);
+static void free_freedep(struct freedep *);
+static void journal_jremref(struct dirrem *, struct jremref *,
+ struct inodedep *);
+static void cancel_jnewblk(struct jnewblk *, struct workhead *);
+static int cancel_jaddref(struct jaddref *, struct inodedep *,
+ struct workhead *);
+static void cancel_jfreefrag(struct jfreefrag *);
+static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
+static int deallocate_dependencies(struct buf *, struct inodedep *,
+ struct freeblks *);
+static void free_newblk(struct newblk *);
+static void cancel_allocdirect(struct allocdirectlst *,
+ struct allocdirect *, struct freeblks *, int);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
+static void freework_freeblock(struct freework *);
static void handle_workitem_freeblocks(struct freeblks *, int);
+static void handle_complete_freeblocks(struct freeblks *);
+static void handle_workitem_indirblk(struct freework *);
+static void handle_written_freework(struct freework *);
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static void setup_allocindir_phase2(struct buf *, struct inode *,
- struct allocindir *);
+ struct inodedep *, struct allocindir *, ufs_lbn_t);
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
- ufs2_daddr_t);
+ ufs2_daddr_t, ufs_lbn_t);
static void handle_workitem_freefrag(struct freefrag *);
-static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
+static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
+ ufs_lbn_t);
static void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
-static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
-static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
- struct newblk **);
-static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
+static struct freefrag *allocindir_merge(struct allocindir *,
+ struct allocindir *);
+static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
+ struct bmsafemap **);
+static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
+ int cg);
+static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
+ int, struct newblk **);
+static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
struct inodedep **);
static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
-static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
+static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
+ struct pagedep **);
static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
struct mount *mp, int, struct pagedep **);
static void pause_timer(void *);
static int request_cleanup(struct mount *, int);
static int process_worklist_item(struct mount *, int);
-static void add_to_worklist(struct worklist *);
+static void process_removes(struct vnode *);
+static void jwork_move(struct workhead *, struct workhead *);
+static void add_to_worklist(struct worklist *, int);
+static void remove_from_worklist(struct worklist *);
static void softdep_flush(void);
static int softdep_speedup(void);
+static void worklist_speedup(void);
+static int journal_mount(struct mount *, struct fs *, struct ucred *);
+static void journal_unmount(struct mount *);
+static int journal_space(struct ufsmount *, int);
+static void journal_suspend(struct ufsmount *);
+static void softdep_prelink(struct vnode *, struct vnode *);
+static void add_to_journal(struct worklist *);
+static void remove_from_journal(struct worklist *);
+static void softdep_process_journal(struct mount *, int);
+static struct jremref *newjremref(struct dirrem *, struct inode *,
+ struct inode *ip, off_t, nlink_t);
+static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
+ uint16_t);
+static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
+ uint16_t);
+static inline struct jsegdep *inoref_jseg(struct inoref *);
+static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
+static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
+ ufs2_daddr_t, int);
+static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
+ ufs2_daddr_t, long, ufs_lbn_t);
+static struct freework *newfreework(struct freeblks *, struct freework *,
+ ufs_lbn_t, ufs2_daddr_t, int, int);
+static void jwait(struct worklist *wk);
+static struct inodedep *inodedep_lookup_ip(struct inode *);
+static int bmsafemap_rollbacks(struct bmsafemap *);
+static struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
+static void handle_jwork(struct workhead *);
+static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
+ struct mkdir **);
+static struct jblocks *jblocks_create(void);
+static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
+static void jblocks_free(struct jblocks *, struct mount *, int);
+static void jblocks_destroy(struct jblocks *);
+static void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
/*
* Exported softdep operations.
@@ -572,40 +749,128 @@ MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
(item)->wk_state &= ~ONWORKLIST; \
LIST_REMOVE(item, wk_list); \
} while (0)
+#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
+#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE
+
#else /* DEBUG */
-static void worklist_insert(struct workhead *, struct worklist *);
-static void worklist_remove(struct worklist *);
+static void worklist_insert(struct workhead *, struct worklist *, int);
+static void worklist_remove(struct worklist *, int);
-#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
-#define WORKLIST_REMOVE(item) worklist_remove(item)
+#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
+#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
+#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
+#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
static void
-worklist_insert(head, item)
+worklist_insert(head, item, locked)
struct workhead *head;
struct worklist *item;
+ int locked;
{
- mtx_assert(&lk, MA_OWNED);
+ if (locked)
+ mtx_assert(&lk, MA_OWNED);
if (item->wk_state & ONWORKLIST)
- panic("worklist_insert: already on list");
+ panic("worklist_insert: %p %s(0x%X) already on list",
+ item, TYPENAME(item->wk_type), item->wk_state);
item->wk_state |= ONWORKLIST;
LIST_INSERT_HEAD(head, item, wk_list);
}
static void
-worklist_remove(item)
+worklist_remove(item, locked)
struct worklist *item;
+ int locked;
{
- mtx_assert(&lk, MA_OWNED);
+ if (locked)
+ mtx_assert(&lk, MA_OWNED);
if ((item->wk_state & ONWORKLIST) == 0)
- panic("worklist_remove: not on list");
+ panic("worklist_remove: %p %s(0x%X) not on list",
+ item, TYPENAME(item->wk_type), item->wk_state);
item->wk_state &= ~ONWORKLIST;
LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */
/*
+ * Merge two jsegdeps, keeping only the oldest one, as newer references
+ * cannot be discarded until after older ones.
+ */
+static inline struct jsegdep *
+jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
+{
+ struct jsegdep *swp;
+
+ if (two == NULL)
+ return (one);
+
+ if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
+ swp = one;
+ one = two;
+ two = swp;
+ }
+ WORKLIST_REMOVE(&two->jd_list);
+ free_jsegdep(two);
+
+ return (one);
+}
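
A minimal standalone sketch (not part of the patch) of the merge rule above; struct seg and merge_oldest are simplified stand-ins for the kernel types, keeping the jsegdep with the lower js_seq because journal segments are retired in sequence order:

#include <stdio.h>

struct seg { unsigned long seq; };	/* stand-in for jseg/js_seq */

static struct seg *
merge_oldest(struct seg *one, struct seg *two)
{
	struct seg *swp;

	if (two == NULL)
		return (one);
	if (one->seq > two->seq) {	/* keep the lower (older) sequence */
		swp = one;
		one = two;
		two = swp;
	}
	/* The newer reference ('two') would be freed here. */
	return (one);
}

int
main(void)
{
	struct seg a = { 5 }, b = { 9 };

	printf("kept seq %lu\n", merge_oldest(&a, &b)->seq);	/* 5 */
	return (0);
}
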
+
+/*
+ * If two freedeps are compatible, free one to reduce list size.
+ */
+static inline struct freedep *
+freedep_merge(struct freedep *one, struct freedep *two)
+{
+ if (two == NULL)
+ return (one);
+
+ if (one->fd_freework == two->fd_freework) {
+ WORKLIST_REMOVE(&two->fd_list);
+ free_freedep(two);
+ }
+ return (one);
+}
+
+/*
+ * Move journal work from one list to another. Duplicate freedeps and
+ * jsegdeps are coalesced to keep the lists as small as possible.
+ */
+static void
+jwork_move(dst, src)
+ struct workhead *dst;
+ struct workhead *src;
+{
+ struct freedep *freedep;
+ struct jsegdep *jsegdep;
+ struct worklist *wkn;
+ struct worklist *wk;
+
+ KASSERT(dst != src,
+ ("jwork_move: dst == src"));
+ freedep = NULL;
+ jsegdep = NULL;
+ LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
+ if (wk->wk_type == D_JSEGDEP)
+ jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
+ if (wk->wk_type == D_FREEDEP)
+ freedep = freedep_merge(WK_FREEDEP(wk), freedep);
+ }
+
+ mtx_assert(&lk, MA_OWNED);
+ while ((wk = LIST_FIRST(src)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ WORKLIST_INSERT(dst, wk);
+ if (wk->wk_type == D_JSEGDEP) {
+ jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
+ continue;
+ }
+ if (wk->wk_type == D_FREEDEP)
+ freedep = freedep_merge(WK_FREEDEP(wk), freedep);
+ }
+}
+
+/*
* Routines for tracking and managing workitems.
*/
static void workitem_free(struct worklist *, int);
@@ -623,13 +888,16 @@ workitem_free(item, type)
#ifdef DEBUG
if (item->wk_state & ONWORKLIST)
- panic("workitem_free: still on list");
+ panic("workitem_free: %s(0x%X) still on list",
+ TYPENAME(item->wk_type), item->wk_state);
if (item->wk_type != type)
- panic("workitem_free: type mismatch");
+ panic("workitem_free: type mismatch %s != %s",
+ TYPENAME(item->wk_type), TYPENAME(type));
#endif
ump = VFSTOUFS(item->wk_mp);
if (--ump->softdep_deps == 0 && ump->softdep_req)
wakeup(&ump->softdep_deps);
+ dep_current[type]--;
free(item, DtoM(type));
}
@@ -643,6 +911,8 @@ workitem_alloc(item, type, mp)
item->wk_mp = mp;
item->wk_state = 0;
ACQUIRE_LOCK(&lk);
+ dep_current[type]++;
+ dep_total[type]++;
VFSTOUFS(mp)->softdep_deps++;
VFSTOUFS(mp)->softdep_accdeps++;
FREE_LOCK(&lk);
@@ -678,24 +948,66 @@ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
-
-SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
-/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
+static int stat_jaddref; /* bufs redirtied as ino bitmap cannot write */
+static int stat_jnewblk; /* bufs redirtied as blk bitmap cannot write */
+static int stat_journal_min; /* Times hit journal min threshold */
+static int stat_journal_low; /* Times hit journal low threshold */
+static int stat_journal_wait; /* Times blocked in jwait(). */
+static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
+static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
+static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */
+static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */
+
+SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
+ &max_softdeps, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
+ &tickdelay, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
+ &maxindirdeps, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
+ &stat_worklist_push, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
+ &stat_blk_limit_push, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
+ &stat_ino_limit_push, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
+ &stat_blk_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
+ &stat_ino_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
+ &stat_sync_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
+ &stat_indir_blk_ptrs, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
+ &stat_inode_bitmap, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
+ &stat_direct_blk_ptrs, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
+ &stat_dir_entry, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
+ &stat_jaddref, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
+ &stat_jnewblk, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
+ &stat_journal_low, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
+ &stat_journal_min, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
+ &stat_journal_wait, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
+ &stat_jwait_filepage, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
+ &stat_jwait_freeblks, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
+ &stat_jwait_inode, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
+ &stat_jwait_newblk, 0, "");
SYSCTL_DECL(_vfs_ffs);
+LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
+static u_long bmsafemap_hash; /* size of hash table - 1 */
+
static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
&compute_summary_at_mount, 0, "Recompute summary at mount");
@@ -770,16 +1082,22 @@ softdep_flush(void)
}
}
-static int
-softdep_speedup(void)
+static void
+worklist_speedup(void)
{
-
mtx_assert(&lk, MA_OWNED);
if (req_pending == 0) {
req_pending = 1;
wakeup(&req_pending);
}
+}
+static int
+softdep_speedup(void)
+{
+
+ worklist_speedup();
+ bd_speedup();
return speedup_syncer();
}
@@ -791,15 +1109,17 @@ softdep_speedup(void)
* and does so in order from first to last.
*/
static void
-add_to_worklist(wk)
+add_to_worklist(wk, nodelay)
struct worklist *wk;
+ int nodelay;
{
struct ufsmount *ump;
mtx_assert(&lk, MA_OWNED);
ump = VFSTOUFS(wk->wk_mp);
if (wk->wk_state & ONWORKLIST)
- panic("add_to_worklist: already on list");
+ panic("add_to_worklist: %s(0x%X) already on list",
+ TYPENAME(wk->wk_type), wk->wk_state);
wk->wk_state |= ONWORKLIST;
if (LIST_EMPTY(&ump->softdep_workitem_pending))
LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
@@ -807,6 +1127,30 @@ add_to_worklist(wk)
LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
ump->softdep_worklist_tail = wk;
ump->softdep_on_worklist += 1;
+ if (nodelay)
+ worklist_speedup();
+}
+
+/*
+ * Remove the item to be processed. If we are removing the last
+ * item on the list, we need to recalculate the tail pointer.
+ */
+static void
+remove_from_worklist(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+ struct worklist *wkend;
+
+ ump = VFSTOUFS(wk->wk_mp);
+ WORKLIST_REMOVE(wk);
+ if (wk == ump->softdep_worklist_tail) {
+ LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
+ if (LIST_NEXT(wkend, wk_list) == NULL)
+ break;
+ ump->softdep_worklist_tail = wkend;
+ }
+ ump->softdep_on_worklist -= 1;
}
/*
@@ -838,8 +1182,9 @@ softdep_process_worklist(mp, full)
ACQUIRE_LOCK(&lk);
loopcount = 1;
starttime = time_second;
+ softdep_process_journal(mp, full ? MNT_WAIT : 0);
while (ump->softdep_on_worklist > 0) {
- if ((cnt = process_worklist_item(mp, 0)) == -1)
+ if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
break;
else
matchcnt += cnt;
@@ -871,16 +1216,61 @@ softdep_process_worklist(mp, full)
* second. Otherwise the other mountpoints may get
* excessively backlogged.
*/
- if (!full && starttime != time_second) {
- matchcnt = -1;
+ if (!full && starttime != time_second)
break;
- }
}
FREE_LOCK(&lk);
return (matchcnt);
}
/*
+ * Process all removes associated with a vnode if we are running out of
+ * journal space. Any other process that attempts to flush these will
+ * be unable to do so because we hold the vnodes locked.
+ */
+static void
+process_removes(vp)
+ struct vnode *vp;
+{
+ struct inodedep *inodedep;
+ struct dirrem *dirrem;
+ struct mount *mp;
+ ino_t inum;
+
+ mtx_assert(&lk, MA_OWNED);
+
+ mp = vp->v_mount;
+ inum = VTOI(vp)->i_number;
+ for (;;) {
+ if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
+ return;
+ LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
+ if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
+ (COMPLETE | ONWORKLIST))
+ break;
+ if (dirrem == NULL)
+ return;
+ /*
+ * If another thread is trying to lock this vnode it will
+ * fail but we must wait for it to do so before we can
+ * proceed.
+ */
+ if (dirrem->dm_state & INPROGRESS) {
+ dirrem->dm_state |= IOWAITING;
+ msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
+ continue;
+ }
+ remove_from_worklist(&dirrem->dm_list);
+ FREE_LOCK(&lk);
+ if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
+ panic("process_removes: suspended filesystem");
+ handle_workitem_remove(dirrem, vp);
+ vn_finished_secondary_write(mp);
+ ACQUIRE_LOCK(&lk);
+ }
+}
+
+/*
* Process one item on the worklist.
*/
static int
@@ -888,7 +1278,7 @@ process_worklist_item(mp, flags)
struct mount *mp;
int flags;
{
- struct worklist *wk, *wkend;
+ struct worklist *wk, *wklast;
struct ufsmount *ump;
struct vnode *vp;
int matchcnt = 0;
@@ -908,11 +1298,14 @@ process_worklist_item(mp, flags)
* inodes, we have to skip over any dirrem requests whose
* vnodes are resident and locked.
*/
- ump = VFSTOUFS(mp);
vp = NULL;
+ ump = VFSTOUFS(mp);
LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
- if (wk->wk_state & INPROGRESS)
+ if (wk->wk_state & INPROGRESS) {
+ wklast = wk;
continue;
+ }
+ wklast = wk; /* Record the last valid wk pointer. */
if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
break;
wk->wk_state |= INPROGRESS;
@@ -921,6 +1314,10 @@ process_worklist_item(mp, flags)
ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
ACQUIRE_LOCK(&lk);
+ if (wk->wk_state & IOWAITING) {
+ wk->wk_state &= ~IOWAITING;
+ wakeup(wk);
+ }
wk->wk_state &= ~INPROGRESS;
ump->softdep_on_worklist_inprogress--;
if (vp != NULL)
@@ -928,21 +1325,7 @@ process_worklist_item(mp, flags)
}
if (wk == 0)
return (-1);
- /*
- * Remove the item to be processed. If we are removing the last
- * item on the list, we need to recalculate the tail pointer.
- * As this happens rarely and usually when the list is short,
- * we just run down the list to find it rather than tracking it
- * in the above loop.
- */
- WORKLIST_REMOVE(wk);
- if (wk == ump->softdep_worklist_tail) {
- LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
- if (LIST_NEXT(wkend, wk_list) == NULL)
- break;
- ump->softdep_worklist_tail = wkend;
- }
- ump->softdep_on_worklist -= 1;
+ remove_from_worklist(wk);
FREE_LOCK(&lk);
if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
panic("process_worklist_item: suspended filesystem");
@@ -952,6 +1335,8 @@ process_worklist_item(mp, flags)
case D_DIRREM:
/* removal of a directory entry */
handle_workitem_remove(WK_DIRREM(wk), vp);
+ if (vp)
+ vput(vp);
break;
case D_FREEBLKS:
@@ -969,6 +1354,11 @@ process_worklist_item(mp, flags)
handle_workitem_freefile(WK_FREEFILE(wk));
break;
+ case D_FREEWORK:
+ /* Final block in an indirect was freed. */
+ handle_workitem_indirblk(WK_FREEWORK(wk));
+ break;
+
default:
panic("%s_process_worklist: Unknown type %s",
"softdep", TYPENAME(wk->wk_type));
@@ -982,19 +1372,22 @@ process_worklist_item(mp, flags)
/*
* Move dependencies from one buffer to another.
*/
-void
+int
softdep_move_dependencies(oldbp, newbp)
struct buf *oldbp;
struct buf *newbp;
{
struct worklist *wk, *wktail;
+ int dirty;
- if (!LIST_EMPTY(&newbp->b_dep))
- panic("softdep_move_dependencies: need merge code");
- wktail = 0;
+ dirty = 0;
+ wktail = NULL;
ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
LIST_REMOVE(wk, wk_list);
+ if (wk->wk_type == D_BMSAFEMAP &&
+ bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
+ dirty = 1;
if (wktail == 0)
LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
else
@@ -1002,6 +1395,8 @@ softdep_move_dependencies(oldbp, newbp)
wktail = wk;
}
FREE_LOCK(&lk);
+
+ return (dirty);
}
/*
@@ -1198,23 +1593,22 @@ pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
* This routine must be called with splbio interrupts blocked.
*/
static int
-pagedep_lookup(ip, lbn, flags, pagedeppp)
- struct inode *ip;
+pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
+ struct mount *mp;
+ ino_t ino;
ufs_lbn_t lbn;
int flags;
struct pagedep **pagedeppp;
{
struct pagedep *pagedep;
struct pagedep_hashhead *pagedephd;
- struct mount *mp;
int ret;
int i;
mtx_assert(&lk, MA_OWNED);
- mp = ITOV(ip)->v_mount;
- pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
+ pagedephd = PAGEDEP_HASH(mp, ino, lbn);
- ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
+ ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
if (*pagedeppp || (flags & DEPALLOC) == 0)
return (ret);
FREE_LOCK(&lk);
@@ -1222,12 +1616,12 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
ACQUIRE_LOCK(&lk);
- ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
+ ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
if (*pagedeppp) {
WORKITEM_FREE(pagedep, D_PAGEDEP);
return (ret);
}
- pagedep->pd_ino = ip->i_number;
+ pagedep->pd_ino = ino;
pagedep->pd_lbn = lbn;
LIST_INIT(&pagedep->pd_dirremhd);
LIST_INIT(&pagedep->pd_pendinghd);
@@ -1314,10 +1708,14 @@ inodedep_lookup(mp, inum, flags, inodedeppp)
inodedep->id_savedino1 = NULL;
inodedep->id_savedsize = -1;
inodedep->id_savedextsize = -1;
- inodedep->id_buf = NULL;
+ inodedep->id_savednlink = -1;
+ inodedep->id_bmsafemap = NULL;
+ inodedep->id_mkdiradd = NULL;
+ LIST_INIT(&inodedep->id_dirremhd);
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
+ TAILQ_INIT(&inodedep->id_inoreflst);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
TAILQ_INIT(&inodedep->id_extupdt);
@@ -1336,17 +1734,29 @@ u_long newblk_hash; /* size of hash table - 1 */
(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static int
-newblk_find(newblkhd, fs, newblkno, newblkpp)
+newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
struct newblk_hashhead *newblkhd;
- struct fs *fs;
+ struct mount *mp;
ufs2_daddr_t newblkno;
+ int flags;
struct newblk **newblkpp;
{
struct newblk *newblk;
- LIST_FOREACH(newblk, newblkhd, nb_hash)
- if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
- break;
+ LIST_FOREACH(newblk, newblkhd, nb_hash) {
+ if (newblkno != newblk->nb_newblkno)
+ continue;
+ if (mp != newblk->nb_list.wk_mp)
+ continue;
+ /*
+ * If we're creating a new dependency, don't match those that
+ * have already been converted to allocdirects. This is for
+ * a frag extend.
+ */
+ if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
+ continue;
+ break;
+ }
if (newblk) {
*newblkpp = newblk;
return (1);
@@ -1361,8 +1771,8 @@ newblk_find(newblkhd, fs, newblkno, newblkpp)
* Found or allocated entry is returned in newblkpp.
*/
static int
-newblk_lookup(fs, newblkno, flags, newblkpp)
- struct fs *fs;
+newblk_lookup(mp, newblkno, flags, newblkpp)
+ struct mount *mp;
ufs2_daddr_t newblkno;
int flags;
struct newblk **newblkpp;
@@ -1370,21 +1780,25 @@ newblk_lookup(fs, newblkno, flags, newblkpp)
struct newblk *newblk;
struct newblk_hashhead *newblkhd;
- newblkhd = NEWBLK_HASH(fs, newblkno);
- if (newblk_find(newblkhd, fs, newblkno, newblkpp))
+ newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
+ if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
return (1);
if ((flags & DEPALLOC) == 0)
return (0);
FREE_LOCK(&lk);
- newblk = malloc(sizeof(struct newblk),
- M_NEWBLK, M_SOFTDEP_FLAGS);
+ newblk = malloc(sizeof(union allblk), M_NEWBLK,
+ M_SOFTDEP_FLAGS | M_ZERO);
+ workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
ACQUIRE_LOCK(&lk);
- if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
- free(newblk, M_NEWBLK);
+ if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
+ WORKITEM_FREE(newblk, D_NEWBLK);
return (1);
}
- newblk->nb_state = 0;
- newblk->nb_fs = fs;
+ newblk->nb_freefrag = NULL;
+ LIST_INIT(&newblk->nb_indirdeps);
+ LIST_INIT(&newblk->nb_newdirblk);
+ LIST_INIT(&newblk->nb_jwork);
+ newblk->nb_state = ATTACHED;
newblk->nb_newblkno = newblkno;
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
*newblkpp = newblk;
@@ -1401,10 +1815,10 @@ softdep_initialize()
LIST_INIT(&mkdirlisthd);
max_softdeps = desiredvnodes * 4;
- pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
- &pagedep_hash);
+ pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
- newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
+ newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash);
+ bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
/* initialise bioops hack */
bioops.io_start = softdep_disk_io_initiation;
@@ -1428,6 +1842,7 @@ softdep_uninitialize()
hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
+ hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
}
/*
@@ -1457,9 +1872,16 @@ softdep_mount(devvp, mp, fs, cred)
MNT_IUNLOCK(mp);
ump = VFSTOUFS(mp);
LIST_INIT(&ump->softdep_workitem_pending);
+ LIST_INIT(&ump->softdep_journal_pending);
+ TAILQ_INIT(&ump->softdep_unlinked);
ump->softdep_worklist_tail = NULL;
ump->softdep_on_worklist = 0;
ump->softdep_deps = 0;
+ if ((fs->fs_flags & FS_SUJ) &&
+ (error = journal_mount(mp, fs, cred)) != 0) {
+ printf("Failed to start journal: %d\n", error);
+ return (error);
+ }
/*
* When doing soft updates, the counters in the
* superblock may have gotten out of sync. Recomputation
@@ -1493,6 +1915,2019 @@ softdep_mount(devvp, mp, fs, cred)
return (0);
}
+void
+softdep_unmount(mp)
+ struct mount *mp;
+{
+
+ if (mp->mnt_kern_flag & MNTK_SUJ)
+ journal_unmount(mp);
+}
+
+struct jblocks {
+ struct jseglst jb_segs; /* TAILQ of current segments. */
+ struct jseg *jb_writeseg; /* Next write to complete. */
+ struct jextent *jb_extent; /* Extent array. */
+ uint64_t jb_nextseq; /* Next sequence number. */
+ uint64_t jb_oldestseq; /* Oldest active sequence number. */
+ int jb_avail; /* Available extents. */
+ int jb_used; /* Last used extent. */
+ int jb_head; /* Allocator head. */
+ int jb_off; /* Allocator extent offset. */
+ int jb_blocks; /* Total disk blocks covered. */
+ int jb_free; /* Total disk blocks free. */
+ int jb_min; /* Minimum free space. */
+ int jb_low; /* Low on space. */
+ int jb_age; /* Insertion time of oldest rec. */
+ int jb_suspended; /* Did journal suspend writes? */
+};
+
+struct jextent {
+ ufs2_daddr_t je_daddr; /* Disk block address. */
+ int je_blocks; /* Disk block count. */
+};
+
+static struct jblocks *
+jblocks_create(void)
+{
+ struct jblocks *jblocks;
+
+ jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&jblocks->jb_segs);
+ jblocks->jb_avail = 10;
+ jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
+ M_JBLOCKS, M_WAITOK | M_ZERO);
+
+ return (jblocks);
+}
+
+static ufs2_daddr_t
+jblocks_alloc(jblocks, bytes, actual)
+ struct jblocks *jblocks;
+ int bytes;
+ int *actual;
+{
+ ufs2_daddr_t daddr;
+ struct jextent *jext;
+ int freecnt;
+ int blocks;
+
+ blocks = bytes / DEV_BSIZE;
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks - jblocks->jb_off;
+ if (freecnt == 0) {
+ jblocks->jb_off = 0;
+ if (++jblocks->jb_head > jblocks->jb_used)
+ jblocks->jb_head = 0;
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks;
+ }
+ if (freecnt > blocks)
+ freecnt = blocks;
+ *actual = freecnt * DEV_BSIZE;
+ daddr = jext->je_daddr + jblocks->jb_off;
+ jblocks->jb_off += freecnt;
+ jblocks->jb_free -= freecnt;
+
+ return (daddr);
+}
+
+static void
+jblocks_free(jblocks, mp, bytes)
+ struct jblocks *jblocks;
+ struct mount *mp;
+ int bytes;
+{
+
+ jblocks->jb_free += bytes / DEV_BSIZE;
+ if (jblocks->jb_suspended)
+ worklist_speedup();
+ wakeup(jblocks);
+}
+
+static void
+jblocks_destroy(jblocks)
+ struct jblocks *jblocks;
+{
+
+ if (jblocks->jb_extent)
+ free(jblocks->jb_extent, M_JBLOCKS);
+ free(jblocks, M_JBLOCKS);
+}
+
+static void
+jblocks_add(jblocks, daddr, blocks)
+ struct jblocks *jblocks;
+ ufs2_daddr_t daddr;
+ int blocks;
+{
+ struct jextent *jext;
+
+ jblocks->jb_blocks += blocks;
+ jblocks->jb_free += blocks;
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ /* Adding the first block. */
+ if (jext->je_daddr == 0) {
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+ return;
+ }
+ /* Extending the last extent. */
+ if (jext->je_daddr + jext->je_blocks == daddr) {
+ jext->je_blocks += blocks;
+ return;
+ }
+ /* Adding a new extent. */
+ if (++jblocks->jb_used == jblocks->jb_avail) {
+ jblocks->jb_avail *= 2;
+ jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
+ M_JBLOCKS, M_WAITOK | M_ZERO);
+ memcpy(jext, jblocks->jb_extent,
+ sizeof(struct jextent) * jblocks->jb_used);
+ free(jblocks->jb_extent, M_JBLOCKS);
+ jblocks->jb_extent = jext;
+ }
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+ return;
+}
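
A standalone sketch (not part of the patch) of the three cases jblocks_add() handles: the first extent, a contiguous extension of the last extent, and a new discontiguous extent. The types and names are simplified stand-ins, and the array-doubling growth path is omitted:

#include <stdio.h>

struct ext {			/* stand-in for struct jextent */
	long daddr;		/* disk block address */
	int blocks;		/* disk block count */
};

static int
ext_add(struct ext *tbl, int used, long daddr, int blocks)
{
	struct ext *jext = &tbl[used];

	if (jext->daddr == 0) {		/* adding the first block */
		jext->daddr = daddr;
		jext->blocks = blocks;
		return (used);
	}
	if (jext->daddr + jext->blocks == daddr) {	/* extend last extent */
		jext->blocks += blocks;
		return (used);
	}
	jext = &tbl[++used];		/* start a new extent */
	jext->daddr = daddr;
	jext->blocks = blocks;
	return (used);
}

int
main(void)
{
	struct ext tbl[4] = { { 0, 0 } };
	int used = 0;

	used = ext_add(tbl, used, 100, 8);	/* first extent */
	used = ext_add(tbl, used, 108, 8);	/* contiguous: coalesces */
	used = ext_add(tbl, used, 200, 8);	/* discontiguous: new extent */
	printf("%d extents; first covers %d blocks\n", used + 1,
	    tbl[0].blocks);		/* 2 extents; first covers 16 blocks */
	return (0);
}
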
+
+int
+softdep_journal_lookup(mp, vpp)
+ struct mount *mp;
+ struct vnode **vpp;
+{
+ struct componentname cnp;
+ struct vnode *dvp;
+ ino_t sujournal;
+ int error;
+
+ error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
+ if (error)
+ return (error);
+ bzero(&cnp, sizeof(cnp));
+ cnp.cn_nameiop = LOOKUP;
+ cnp.cn_flags = ISLASTCN;
+ cnp.cn_thread = curthread;
+ cnp.cn_cred = curthread->td_ucred;
+ cnp.cn_pnbuf = SUJ_FILE;
+ cnp.cn_nameptr = SUJ_FILE;
+ cnp.cn_namelen = strlen(SUJ_FILE);
+ error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
+ vput(dvp);
+ if (error != 0)
+ return (error);
+ error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
+ return (error);
+}
+
+/*
+ * Open and verify the journal file.
+ */
+static int
+journal_mount(mp, fs, cred)
+ struct mount *mp;
+ struct fs *fs;
+ struct ucred *cred;
+{
+ struct jblocks *jblocks;
+ struct vnode *vp;
+ struct inode *ip;
+ ufs2_daddr_t blkno;
+ int bcount;
+ int error;
+ int i;
+
+ mp->mnt_kern_flag |= MNTK_SUJ;
+ error = softdep_journal_lookup(mp, &vp);
+ if (error != 0) {
+ printf("Failed to find journal. Use tunefs to create one\n");
+ return (error);
+ }
+ ip = VTOI(vp);
+ if (ip->i_size < SUJ_MIN) {
+ error = ENOSPC;
+ goto out;
+ }
+ bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */
+ jblocks = jblocks_create();
+ for (i = 0; i < bcount; i++) {
+ error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
+ if (error)
+ break;
+ jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
+ }
+ if (error) {
+ jblocks_destroy(jblocks);
+ goto out;
+ }
+ jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
+ jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
+ /*
+ * Only validate the journal contents if the filesystem is clean,
+ * otherwise we write the logs but they'll never be used. If the
+ * filesystem was still dirty when we mounted it the journal is
+ * invalid and a new journal can only be valid if it starts from a
+ * clean mount.
+ */
+ if (fs->fs_clean) {
+ DIP_SET(ip, i_modrev, fs->fs_mtime);
+ ip->i_flags |= IN_MODIFIED;
+ ffs_update(vp, 1);
+ }
+ VFSTOUFS(mp)->softdep_jblocks = jblocks;
+out:
+ vput(vp);
+ return (error);
+}
+
+static void
+journal_unmount(mp)
+ struct mount *mp;
+{
+ struct ufsmount *ump;
+
+ ump = VFSTOUFS(mp);
+ if (ump->softdep_jblocks)
+ jblocks_destroy(ump->softdep_jblocks);
+ ump->softdep_jblocks = NULL;
+}
+
+/*
+ * Called when a journal record is ready to be written. Space is allocated
+ * and the journal entry is created when the journal is flushed to stable
+ * store.
+ */
+static void
+add_to_journal(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+
+ mtx_assert(&lk, MA_OWNED);
+ ump = VFSTOUFS(wk->wk_mp);
+ if (wk->wk_state & ONWORKLIST)
+ panic("add_to_journal: %s(0x%X) already on list",
+ TYPENAME(wk->wk_type), wk->wk_state);
+ wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
+ if (LIST_EMPTY(&ump->softdep_journal_pending)) {
+ ump->softdep_jblocks->jb_age = ticks;
+ LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
+ } else
+ LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
+ ump->softdep_journal_tail = wk;
+ ump->softdep_on_journal += 1;
+}
+
+/*
+ * Remove an arbitrary item from the journal worklist, maintaining the
+ * tail pointer. This happens when a new operation obviates the need to
+ * journal an old operation.
+ */
+static void
+remove_from_journal(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+
+ mtx_assert(&lk, MA_OWNED);
+ ump = VFSTOUFS(wk->wk_mp);
+#ifdef DEBUG /* XXX Expensive, temporary. */
+ {
+ struct worklist *wkn;
+
+ LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
+ if (wkn == wk)
+ break;
+ if (wkn == NULL)
+ panic("remove_from_journal: %p is not in journal", wk);
+ }
+#endif
+ /*
+ * We emulate a TAILQ to save space in most structures which do not
+ * require TAILQ semantics. Here we must update the tail position
+ * when removing the entry the tail points to.
+ */
+ if (ump->softdep_journal_tail == wk)
+ ump->softdep_journal_tail =
+ (struct worklist *)wk->wk_list.le_prev;
+
+ WORKLIST_REMOVE(wk);
+ ump->softdep_on_journal -= 1;
+}
+
+/*
+ * Check for journal space as well as dependency limits so the prelink
+ * code can throttle both journaled and non-journaled filesystems.
+ * Threshold is 0 for low and 1 for min.
+ */
+static int
+journal_space(ump, thresh)
+ struct ufsmount *ump;
+ int thresh;
+{
+ struct jblocks *jblocks;
+ int avail;
+
+ /*
+ * We use a tighter restriction here to prevent request_cleanup()
+ * in other threads from running into locks we currently hold.
+ */
+ if (num_inodedep > (max_softdeps / 10) * 9)
+ return (0);
+
+ jblocks = ump->softdep_jblocks;
+ if (jblocks == NULL)
+ return (1);
+ if (thresh)
+ thresh = jblocks->jb_min;
+ else
+ thresh = jblocks->jb_low;
+ avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
+ avail = jblocks->jb_free - avail;
+
+ return (avail > thresh);
+}
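
A worked example (not part of the patch) of the space check performed by journal_space() above; the 32-byte record size is the one implied by the 6553-record arithmetic quoted later in this file, and the sample numbers are illustrative:

#include <stdio.h>

#define DEV_BSIZE	512
#define JREC_SIZE	32	/* implied by the 6553-record figure below */

int
main(void)
{
	int jb_free = 2048;		/* free journal disk blocks (1MB) */
	int on_journal = 1000;		/* records queued but not yet written */
	int thresh = jb_free / 3;	/* jb_low: reserve 33% */
	int avail;

	/* Queued records will consume space when flushed; charge for them. */
	avail = jb_free - (on_journal * JREC_SIZE) / DEV_BSIZE;
	printf("avail %d, thresh %d: %s\n", avail, thresh,
	    avail > thresh ? "ok" : "throttle");	/* avail 1986: ok */
	return (0);
}
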
+
+static void
+journal_suspend(ump)
+ struct ufsmount *ump;
+{
+ struct jblocks *jblocks;
+ struct mount *mp;
+
+ mp = UFSTOVFS(ump);
+ jblocks = ump->softdep_jblocks;
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
+ stat_journal_min++;
+ mp->mnt_kern_flag |= MNTK_SUSPEND;
+ mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
+ }
+ jblocks->jb_suspended = 1;
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Called before any allocation function to be certain that there is
+ * sufficient space in the journal prior to creating any new records.
+ * Since in the case of block allocation we may have multiple locked
+ * buffers at the time of the actual allocation, we cannot block
+ * when the journal records are created. Doing so would create a deadlock
+ * if any of these buffers needed to be flushed to reclaim space. Instead
+ * we require a sufficiently large amount of available space such that
+ * each thread in the system could have passed this allocation check and
+ * still have sufficient free space. With 20% of a minimum journal size
+ * of 1MB we have 6553 records available.
+ */
+int
+softdep_prealloc(vp, waitok)
+ struct vnode *vp;
+ int waitok;
+{
+ struct ufsmount *ump;
+
+ if (DOINGSUJ(vp) == 0)
+ return (0);
+ ump = VFSTOUFS(vp->v_mount);
+ ACQUIRE_LOCK(&lk);
+ if (journal_space(ump, 0)) {
+ FREE_LOCK(&lk);
+ return (0);
+ }
+ stat_journal_low++;
+ FREE_LOCK(&lk);
+ if (waitok == MNT_NOWAIT)
+ return (ENOSPC);
+ /*
+ * Attempt to sync this vnode once to flush any journal
+ * work attached to it.
+ */
+ ffs_syncvnode(vp, waitok);
+ ACQUIRE_LOCK(&lk);
+ process_removes(vp);
+ if (journal_space(ump, 0) == 0) {
+ softdep_speedup();
+ if (journal_space(ump, 1) == 0)
+ journal_suspend(ump);
+ }
+ FREE_LOCK(&lk);
+
+ return (0);
+}
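
A standalone check (not part of the patch) of the 6553-record figure quoted in the comment above, using the 1MB minimum journal size the comment names:

#include <stdio.h>

int
main(void)
{
	long journal = 1024 * 1024;	/* minimum journal size (1MB) */
	long reserve = journal / 5;	/* 20% kept available */

	printf("%ld records\n", reserve / 32);	/* 32-byte records: 6553 */
	return (0);
}
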
+
+/*
+ * Before adjusting a link count on a vnode verify that we have sufficient
+ * journal space. If not, process operations that depend on the currently
+ * locked pair of vnodes to try to flush space, since the syncer, buf daemon,
+ * and softdep flush threads cannot acquire these locks to reclaim space.
+ */
+static void
+softdep_prelink(dvp, vp)
+ struct vnode *dvp;
+ struct vnode *vp;
+{
+ struct ufsmount *ump;
+
+ ump = VFSTOUFS(dvp->v_mount);
+ mtx_assert(&lk, MA_OWNED);
+ if (journal_space(ump, 0))
+ return;
+ stat_journal_low++;
+ FREE_LOCK(&lk);
+ if (vp)
+ ffs_syncvnode(vp, MNT_NOWAIT);
+ ffs_syncvnode(dvp, MNT_WAIT);
+ ACQUIRE_LOCK(&lk);
+ /* Process vp before dvp as it may create .. removes. */
+ if (vp)
+ process_removes(vp);
+ process_removes(dvp);
+ softdep_speedup();
+ process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
+ process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
+ if (journal_space(ump, 0) == 0) {
+ softdep_speedup();
+ if (journal_space(ump, 1) == 0)
+ journal_suspend(ump);
+ }
+}
+
+static void
+jseg_write(fs, jblocks, jseg, data)
+ struct fs *fs;
+ struct jblocks *jblocks;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jsegrec *rec;
+
+ rec = (struct jsegrec *)data;
+ rec->jsr_seq = jseg->js_seq;
+ rec->jsr_oldest = jblocks->jb_oldestseq;
+ rec->jsr_cnt = jseg->js_cnt;
+ rec->jsr_blocks = jseg->js_size / DEV_BSIZE;
+ rec->jsr_crc = 0;
+ rec->jsr_time = fs->fs_mtime;
+}
+
+static inline void
+inoref_write(inoref, jseg, rec)
+ struct inoref *inoref;
+ struct jseg *jseg;
+ struct jrefrec *rec;
+{
+
+ inoref->if_jsegdep->jd_seg = jseg;
+ rec->jr_ino = inoref->if_ino;
+ rec->jr_parent = inoref->if_parent;
+ rec->jr_nlink = inoref->if_nlink;
+ rec->jr_mode = inoref->if_mode;
+ rec->jr_diroff = inoref->if_diroff;
+}
+
+static void
+jaddref_write(jaddref, jseg, data)
+ struct jaddref *jaddref;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jrefrec *rec;
+
+ rec = (struct jrefrec *)data;
+ rec->jr_op = JOP_ADDREF;
+ inoref_write(&jaddref->ja_ref, jseg, rec);
+}
+
+static void
+jremref_write(jremref, jseg, data)
+ struct jremref *jremref;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jrefrec *rec;
+
+ rec = (struct jrefrec *)data;
+ rec->jr_op = JOP_REMREF;
+ inoref_write(&jremref->jr_ref, jseg, rec);
+}
+
+static void
+jmvref_write(jmvref, jseg, data)
+ struct jmvref *jmvref;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jmvrec *rec;
+
+ rec = (struct jmvrec *)data;
+ rec->jm_op = JOP_MVREF;
+ rec->jm_ino = jmvref->jm_ino;
+ rec->jm_parent = jmvref->jm_parent;
+ rec->jm_oldoff = jmvref->jm_oldoff;
+ rec->jm_newoff = jmvref->jm_newoff;
+}
+
+static void
+jnewblk_write(jnewblk, jseg, data)
+ struct jnewblk *jnewblk;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jblkrec *rec;
+
+ jnewblk->jn_jsegdep->jd_seg = jseg;
+ rec = (struct jblkrec *)data;
+ rec->jb_op = JOP_NEWBLK;
+ rec->jb_ino = jnewblk->jn_ino;
+ rec->jb_blkno = jnewblk->jn_blkno;
+ rec->jb_lbn = jnewblk->jn_lbn;
+ rec->jb_frags = jnewblk->jn_frags;
+ rec->jb_oldfrags = jnewblk->jn_oldfrags;
+}
+
+static void
+jfreeblk_write(jfreeblk, jseg, data)
+ struct jfreeblk *jfreeblk;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jblkrec *rec;
+
+ jfreeblk->jf_jsegdep->jd_seg = jseg;
+ rec = (struct jblkrec *)data;
+ rec->jb_op = JOP_FREEBLK;
+ rec->jb_ino = jfreeblk->jf_ino;
+ rec->jb_blkno = jfreeblk->jf_blkno;
+ rec->jb_lbn = jfreeblk->jf_lbn;
+ rec->jb_frags = jfreeblk->jf_frags;
+ rec->jb_oldfrags = 0;
+}
+
+static void
+jfreefrag_write(jfreefrag, jseg, data)
+ struct jfreefrag *jfreefrag;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jblkrec *rec;
+
+ jfreefrag->fr_jsegdep->jd_seg = jseg;
+ rec = (struct jblkrec *)data;
+ rec->jb_op = JOP_FREEBLK;
+ rec->jb_ino = jfreefrag->fr_ino;
+ rec->jb_blkno = jfreefrag->fr_blkno;
+ rec->jb_lbn = jfreefrag->fr_lbn;
+ rec->jb_frags = jfreefrag->fr_frags;
+ rec->jb_oldfrags = 0;
+}
+
+static void
+jtrunc_write(jtrunc, jseg, data)
+ struct jtrunc *jtrunc;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jtrncrec *rec;
+
+ rec = (struct jtrncrec *)data;
+ rec->jt_op = JOP_TRUNC;
+ rec->jt_ino = jtrunc->jt_ino;
+ rec->jt_size = jtrunc->jt_size;
+ rec->jt_extsize = jtrunc->jt_extsize;
+}
+
+/*
+ * Flush some journal records to disk.
+ */
+static void
+softdep_process_journal(mp, flags)
+ struct mount *mp;
+ int flags;
+{
+ struct jblocks *jblocks;
+ struct ufsmount *ump;
+ struct worklist *wk;
+ struct jseg *jseg;
+ struct buf *bp;
+ uint8_t *data;
+ struct fs *fs;
+ int segwritten;
+ int jrecmin; /* Minimum records per block. */
+ int jrecmax; /* Maximum records per block. */
+ int size;
+ int cnt;
+ int off;
+
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
+ return;
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ jblocks = ump->softdep_jblocks;
+ /*
+ * We write anywhere between a disk block and an fs block. The upper
+ * bound is picked to prevent buffer cache fragmentation and limit
+ * processing time per I/O.
+ */
+ jrecmin = (DEV_BSIZE / JREC_SIZE) - 1; /* -1 for seg header */
+ jrecmax = (fs->fs_bsize / DEV_BSIZE) * jrecmin;
+ segwritten = 0;
+ while ((cnt = ump->softdep_on_journal) != 0) {
+ /*
+ * Create a new segment to hold as many as 'cnt' journal
+ * entries and add them to the segment. Notice cnt is
+ * off by one to account for the space required by the
+ * jsegrec. If we don't have a full block to log, skip it
+ * unless we haven't written anything.
+ */
+ cnt++;
+ if (cnt < jrecmax && segwritten)
+ break;
+ /*
+ * Verify some free journal space. softdep_prealloc() should
+ * guarantee that we don't run out, so this is indicative of
+ * a problem with the flow control. Try to recover
+ * gracefully in any event.
+ */
+ while (jblocks->jb_free == 0) {
+ if (flags != MNT_WAIT)
+ break;
+ printf("softdep: Out of journal space!\n");
+ softdep_speedup();
+ msleep(jblocks, &lk, PRIBIO, "jblocks", 1);
+ }
+ FREE_LOCK(&lk);
+ jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jseg->js_list, D_JSEG, mp);
+ LIST_INIT(&jseg->js_entries);
+ jseg->js_state = ATTACHED;
+ jseg->js_jblocks = jblocks;
+ bp = geteblk(fs->fs_bsize, 0);
+ ACQUIRE_LOCK(&lk);
+ /*
+ * If there was a race while we were allocating the block
+ * and jseg, the entry we care about was likely written.
+ * We bail out in both the WAIT and NOWAIT case and assume
+ * the caller will loop if the entry it cares about is
+ * not written.
+ */
+ if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
+ bp->b_flags |= B_INVAL | B_NOCACHE;
+ WORKITEM_FREE(jseg, D_JSEG);
+ FREE_LOCK(&lk);
+ brelse(bp);
+ ACQUIRE_LOCK(&lk);
+ break;
+ }
+ /*
+ * Calculate the disk block size required for the available
+ * records rounded to the min size.
+ */
+ cnt = ump->softdep_on_journal;
+ if (cnt < jrecmax)
+ size = howmany(cnt, jrecmin) * DEV_BSIZE;
+ else
+ size = fs->fs_bsize;
+ /*
+ * Allocate a disk block for this journal data and account
+ * for truncation of the requested size if enough contiguous
+ * space was not available.
+ */
+ bp->b_blkno = jblocks_alloc(jblocks, size, &size);
+ bp->b_lblkno = bp->b_blkno;
+ bp->b_offset = bp->b_blkno * DEV_BSIZE;
+ bp->b_bcount = size;
+ bp->b_bufobj = &ump->um_devvp->v_bufobj;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
+ /*
+ * Initialize our jseg with cnt records. Assign the next
+ * sequence number to it and link it in-order.
+ */
+ cnt = MIN(ump->softdep_on_journal,
+ (size / DEV_BSIZE) * jrecmin);
+ jseg->js_buf = bp;
+ jseg->js_cnt = cnt;
+ jseg->js_refs = cnt + 1; /* Self ref. */
+ jseg->js_size = size;
+ jseg->js_seq = jblocks->jb_nextseq++;
+ if (TAILQ_EMPTY(&jblocks->jb_segs))
+ jblocks->jb_oldestseq = jseg->js_seq;
+ TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
+ if (jblocks->jb_writeseg == NULL)
+ jblocks->jb_writeseg = jseg;
+ /*
+ * Start filling in records from the pending list.
+ */
+ data = bp->b_data;
+ off = 0;
+ while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
+ != NULL) {
+ /* Place a segment header on every device block. */
+ if ((off % DEV_BSIZE) == 0) {
+ jseg_write(fs, jblocks, jseg, data);
+ off += JREC_SIZE;
+ data = bp->b_data + off;
+ }
+ remove_from_journal(wk);
+ wk->wk_state |= IOSTARTED;
+ WORKLIST_INSERT(&jseg->js_entries, wk);
+ switch (wk->wk_type) {
+ case D_JADDREF:
+ jaddref_write(WK_JADDREF(wk), jseg, data);
+ break;
+ case D_JREMREF:
+ jremref_write(WK_JREMREF(wk), jseg, data);
+ break;
+ case D_JMVREF:
+ jmvref_write(WK_JMVREF(wk), jseg, data);
+ break;
+ case D_JNEWBLK:
+ jnewblk_write(WK_JNEWBLK(wk), jseg, data);
+ break;
+ case D_JFREEBLK:
+ jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
+ break;
+ case D_JFREEFRAG:
+ jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
+ break;
+ case D_JTRUNC:
+ jtrunc_write(WK_JTRUNC(wk), jseg, data);
+ break;
+ default:
+ panic("process_journal: Unknown type %s",
+ TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ if (--cnt == 0)
+ break;
+ off += JREC_SIZE;
+ data = bp->b_data + off;
+ }
+ /*
+ * Write this one buffer and continue.
+ */
+ WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
+ FREE_LOCK(&lk);
+ BO_LOCK(bp->b_bufobj);
+ bgetvp(ump->um_devvp, bp);
+ BO_UNLOCK(bp->b_bufobj);
+ if (flags == MNT_NOWAIT)
+ bawrite(bp);
+ else
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ }
+ /*
+ * If we've suspended the filesystem because we ran out of journal
+ * space, either try to sync it here to make some progress or
+ * unsuspend it if enough space has become available.
+ */
+ if (flags == 0 && jblocks && jblocks->jb_suspended) {
+ if (journal_space(ump, jblocks->jb_min)) {
+ FREE_LOCK(&lk);
+ jblocks->jb_suspended = 0;
+ mp->mnt_susp_owner = curthread;
+ vfs_write_resume(mp);
+ ACQUIRE_LOCK(&lk);
+ return;
+ }
+ FREE_LOCK(&lk);
+ VFS_SYNC(mp, MNT_NOWAIT);
+ ffs_sbupdate(ump, MNT_WAIT, 0);
+ ACQUIRE_LOCK(&lk);
+ }
+}
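
A standalone sketch (not part of the patch) of the per-write record bounds computed at the top of softdep_process_journal() above; the 16K fs block size is an assumed example:

#include <stdio.h>

#define DEV_BSIZE	512
#define JREC_SIZE	32

int
main(void)
{
	int fs_bsize = 16384;	/* assumed example fs block size */
	int jrecmin, jrecmax;

	jrecmin = (DEV_BSIZE / JREC_SIZE) - 1;		/* 15; -1 for header */
	jrecmax = (fs_bsize / DEV_BSIZE) * jrecmin;	/* 32 * 15 = 480 */
	printf("between %d and %d records per write\n", jrecmin, jrecmax);
	return (0);
}
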
+
+/*
+ * Complete a jseg, allowing all dependencies awaiting journal writes
+ * to proceed. Each journal dependency also attaches a jsegdep to dependent
+ * structures so that the journal segment can be freed to reclaim space.
+ */
+static void
+complete_jseg(jseg)
+ struct jseg *jseg;
+{
+ struct worklist *wk;
+ struct jmvref *jmvref;
+ int waiting;
+ int i;
+
+ i = 0;
+ while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ waiting = wk->wk_state & IOWAITING;
+ wk->wk_state &= ~(IOSTARTED | IOWAITING);
+ wk->wk_state |= COMPLETE;
+ KASSERT(i < jseg->js_cnt,
+ ("handle_written_jseg: overflow %d >= %d",
+ i, jseg->js_cnt));
+ switch (wk->wk_type) {
+ case D_JADDREF:
+ handle_written_jaddref(WK_JADDREF(wk));
+ break;
+ case D_JREMREF:
+ handle_written_jremref(WK_JREMREF(wk));
+ break;
+ case D_JMVREF:
+ /* No jsegdep here. */
+ free_jseg(jseg);
+ jmvref = WK_JMVREF(wk);
+ LIST_REMOVE(jmvref, jm_deps);
+ free_pagedep(jmvref->jm_pagedep);
+ WORKITEM_FREE(jmvref, D_JMVREF);
+ break;
+ case D_JNEWBLK:
+ handle_written_jnewblk(WK_JNEWBLK(wk));
+ break;
+ case D_JFREEBLK:
+ handle_written_jfreeblk(WK_JFREEBLK(wk));
+ break;
+ case D_JFREEFRAG:
+ handle_written_jfreefrag(WK_JFREEFRAG(wk));
+ break;
+ case D_JTRUNC:
+ WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
+ WORKITEM_FREE(wk, D_JTRUNC);
+ break;
+ default:
+ panic("handle_written_jseg: Unknown type %s",
+ TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ if (waiting)
+ wakeup(wk);
+ }
+ /* Release the self reference so the structure may be freed. */
+ free_jseg(jseg);
+}
+
+/*
+ * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg
+ * completions in order only.
+ */
+static void
+handle_written_jseg(jseg, bp)
+ struct jseg *jseg;
+ struct buf *bp;
+{
+ struct jblocks *jblocks;
+ struct jseg *jsegn;
+
+ if (jseg->js_refs == 0)
+ panic("handle_written_jseg: No self-reference on %p", jseg);
+ jseg->js_state |= DEPCOMPLETE;
+ /*
+ * We'll never need this buffer again, set flags so it will be
+ * discarded.
+ */
+ bp->b_flags |= B_INVAL | B_NOCACHE;
+ jblocks = jseg->js_jblocks;
+ /*
+ * Don't allow out of order completions. If this isn't the first
+ * block, wait for it to write before we're done.
+ */
+ if (jseg != jblocks->jb_writeseg)
+ return;
+ /* Iterate through available jsegs processing their entries. */
+ do {
+ jsegn = TAILQ_NEXT(jseg, js_next);
+ complete_jseg(jseg);
+ jseg = jsegn;
+ } while (jseg && jseg->js_state & DEPCOMPLETE);
+ jblocks->jb_writeseg = jseg;
+}
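
A standalone sketch (not part of the patch) of the in-order drain above: a segment that completes out of order is held until every earlier segment has completed, and then the loop drains all consecutively complete segments. The array and index are simplified stand-ins for the jseg list and jb_writeseg:

#include <stdio.h>

int
main(void)
{
	int complete[5] = { 1, 1, 0, 1, 0 };	/* per-segment DEPCOMPLETE */
	int writeseg = 0;			/* analogue of jb_writeseg */

	while (writeseg < 5 && complete[writeseg]) {
		printf("processing seg %d\n", writeseg);	/* 0, then 1 */
		writeseg++;
	}
	/* Seg 3 is written but must wait until seg 2 completes. */
	return (0);
}
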
+
+static inline struct jsegdep *
+inoref_jseg(inoref)
+ struct inoref *inoref;
+{
+ struct jsegdep *jsegdep;
+
+ jsegdep = inoref->if_jsegdep;
+ inoref->if_jsegdep = NULL;
+
+ return (jsegdep);
+}
+
+/*
+ * Called once a jremref has made it to stable store. The jremref is marked
+ * complete and we attempt to free it. Any pagedep writes sleeping while
+ * waiting for the jremref to complete will be awoken by free_jremref.
+ */
+static void
+handle_written_jremref(jremref)
+ struct jremref *jremref;
+{
+ struct inodedep *inodedep;
+ struct jsegdep *jsegdep;
+ struct dirrem *dirrem;
+
+ /* Grab the jsegdep. */
+ jsegdep = inoref_jseg(&jremref->jr_ref);
+ /*
+ * Remove us from the inoref list.
+ */
+ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
+ 0, &inodedep) == 0)
+ panic("handle_written_jremref: Lost inodedep");
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+ /*
+ * Complete the dirrem.
+ */
+ dirrem = jremref->jr_dirrem;
+ jremref->jr_dirrem = NULL;
+ LIST_REMOVE(jremref, jr_deps);
+ jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
+ WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
+ (dirrem->dm_state & COMPLETE) != 0)
+ add_to_worklist(&dirrem->dm_list, 0);
+ free_jremref(jremref);
+}
+
+/*
+ * Called once a jaddref has made it to stable store. The dependency is
+ * marked complete and any dependent structures are added to the inode
+ * bufwait list to be completed as soon as it is written. If a bitmap write
+ * depends on this entry we move the inode into the inodedephd of the
+ * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
+ */
+static void
+handle_written_jaddref(jaddref)
+ struct jaddref *jaddref;
+{
+ struct jsegdep *jsegdep;
+ struct inodedep *inodedep;
+ struct diradd *diradd;
+ struct mkdir *mkdir;
+
+ /* Grab the jsegdep. */
+ jsegdep = inoref_jseg(&jaddref->ja_ref);
+ mkdir = NULL;
+ diradd = NULL;
+ if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
+ 0, &inodedep) == 0)
+ panic("handle_written_jaddref: Lost inodedep.");
+ if (jaddref->ja_diradd == NULL)
+ panic("handle_written_jaddref: No dependency");
+ if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
+ diradd = jaddref->ja_diradd;
+ WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
+ } else if (jaddref->ja_state & MKDIR_PARENT) {
+ mkdir = jaddref->ja_mkdir;
+ WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
+ } else if (jaddref->ja_state & MKDIR_BODY)
+ mkdir = jaddref->ja_mkdir;
+ else
+ panic("handle_written_jaddref: Unknown dependency %p",
+ jaddref->ja_diradd);
+ jaddref->ja_diradd = NULL; /* also clears ja_mkdir */
+ /*
+ * Remove us from the inode list.
+ */
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
+ /*
+ * The mkdir may be waiting on the jaddref to clear before freeing.
+ */
+ if (mkdir) {
+ KASSERT(mkdir->md_list.wk_type == D_MKDIR,
+ ("handle_written_jaddref: Incorrect type for mkdir %s",
+ TYPENAME(mkdir->md_list.wk_type)));
+ mkdir->md_jaddref = NULL;
+ diradd = mkdir->md_diradd;
+ mkdir->md_state |= DEPCOMPLETE;
+ complete_mkdir(mkdir);
+ }
+ WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
+ if (jaddref->ja_state & NEWBLOCK) {
+ inodedep->id_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
+ inodedep, id_deps);
+ }
+ free_jaddref(jaddref);
+}
+
+/*
+ * Called once a jnewblk journal is written. The allocdirect or allocindir
+ * is placed in the bmsafemap to await notification of a written bitmap.
+ */
+static void
+handle_written_jnewblk(jnewblk)
+ struct jnewblk *jnewblk;
+{
+ struct bmsafemap *bmsafemap;
+ struct jsegdep *jsegdep;
+ struct newblk *newblk;
+
+ /* Grab the jsegdep. */
+ jsegdep = jnewblk->jn_jsegdep;
+ jnewblk->jn_jsegdep = NULL;
+ /*
+ * Add the written block to the bmsafemap so it can be notified when
+ * the bitmap is on disk.
+ */
+ newblk = jnewblk->jn_newblk;
+ jnewblk->jn_newblk = NULL;
+ if (newblk == NULL)
+ panic("handle_written_jnewblk: No dependency for the segdep.");
+
+ newblk->nb_jnewblk = NULL;
+ bmsafemap = newblk->nb_bmsafemap;
+ WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
+ newblk->nb_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ free_jnewblk(jnewblk);
+}
+
+/*
+ * Cancel a jfreefrag that won't be needed, probably due to colliding with
+ * an in-flight allocation that has not yet been committed. Divorce us
+ * from the freefrag and mark it DEPCOMPLETE so that it may be added
+ * to the worklist.
+ */
+static void
+cancel_jfreefrag(jfreefrag)
+ struct jfreefrag *jfreefrag;
+{
+ struct freefrag *freefrag;
+
+ if (jfreefrag->fr_jsegdep) {
+ free_jsegdep(jfreefrag->fr_jsegdep);
+ jfreefrag->fr_jsegdep = NULL;
+ }
+ freefrag = jfreefrag->fr_freefrag;
+ jfreefrag->fr_freefrag = NULL;
+ freefrag->ff_jfreefrag = NULL;
+ free_jfreefrag(jfreefrag);
+ freefrag->ff_state |= DEPCOMPLETE;
+}
+
+/*
+ * Free a jfreefrag when the parent freefrag is rendered obsolete.
+ */
+static void
+free_jfreefrag(jfreefrag)
+ struct jfreefrag *jfreefrag;
+{
+
+ if (jfreefrag->fr_state & IOSTARTED)
+ WORKLIST_REMOVE(&jfreefrag->fr_list);
+ else if (jfreefrag->fr_state & ONWORKLIST)
+ remove_from_journal(&jfreefrag->fr_list);
+ if (jfreefrag->fr_freefrag != NULL)
+ panic("free_jfreefrag: Still attached to a freefrag.");
+ WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
+}
+
+/*
+ * Called when the journal write for a jfreefrag completes. The parent
+ * freefrag is added to the worklist if this completes its dependencies.
+ */
+static void
+handle_written_jfreefrag(jfreefrag)
+ struct jfreefrag *jfreefrag;
+{
+ struct jsegdep *jsegdep;
+ struct freefrag *freefrag;
+
+ /* Grab the jsegdep. */
+ jsegdep = jfreefrag->fr_jsegdep;
+ jfreefrag->fr_jsegdep = NULL;
+ freefrag = jfreefrag->fr_freefrag;
+ if (freefrag == NULL)
+ panic("handle_written_jfreefrag: No freefrag.");
+ freefrag->ff_state |= DEPCOMPLETE;
+ freefrag->ff_jfreefrag = NULL;
+ WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
+ if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(&freefrag->ff_list, 0);
+ jfreefrag->fr_freefrag = NULL;
+ free_jfreefrag(jfreefrag);
+}
+
+/*
+ * Called when the journal write for a jfreeblk completes. The jfreeblk
+ * is removed from the freeblks list of pending journal writes and the
+ * jsegdep is moved to the freeblks jwork to be completed when all blocks
+ * have been reclaimed.
+ */
+static void
+handle_written_jfreeblk(jfreeblk)
+ struct jfreeblk *jfreeblk;
+{
+ struct freeblks *freeblks;
+ struct jsegdep *jsegdep;
+
+ /* Grab the jsegdep. */
+ jsegdep = jfreeblk->jf_jsegdep;
+ jfreeblk->jf_jsegdep = NULL;
+ freeblks = jfreeblk->jf_freeblks;
+ LIST_REMOVE(jfreeblk, jf_deps);
+ WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
+ /*
+ * If the freeblks is all journaled, we can add it to the worklist.
+ */
+ if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
+ (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ /* Remove from the b_dep that is waiting on this write. */
+ if (freeblks->fb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&freeblks->fb_list);
+ add_to_worklist(&freeblks->fb_list, 1);
+ }
+
+ free_jfreeblk(jfreeblk);
+}
+
+static struct jsegdep *
+newjsegdep(struct worklist *wk)
+{
+ struct jsegdep *jsegdep;
+
+ jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
+ jsegdep->jd_seg = NULL;
+
+ return (jsegdep);
+}
+
+static struct jmvref *
+newjmvref(dp, ino, oldoff, newoff)
+ struct inode *dp;
+ ino_t ino;
+ off_t oldoff;
+ off_t newoff;
+{
+ struct jmvref *jmvref;
+
+ jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
+ jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
+ jmvref->jm_parent = dp->i_number;
+ jmvref->jm_ino = ino;
+ jmvref->jm_oldoff = oldoff;
+ jmvref->jm_newoff = newoff;
+
+ return (jmvref);
+}
+
+/*
+ * Allocate a new jremref that tracks the removal of ip from dp with the
+ * directory entry offset of diroff. Mark the entry as ATTACHED and
+ * DEPCOMPLETE as we have all the information required for the journal write
+ * and the directory entry has already been removed from the buffer. The caller
+ * is responsible for linking the jremref into the pagedep and adding it
+ * to the journal to write. The MKDIR_PARENT flag is set if we're doing
+ * a DOTDOT addition so handle_workitem_remove() can properly assign
+ * the jsegdep when we're done.
+ */
+static struct jremref *
+newjremref(dirrem, dp, ip, diroff, nlink)
+ struct dirrem *dirrem;
+ struct inode *dp;
+ struct inode *ip;
+ off_t diroff;
+ nlink_t nlink;
+{
+ struct jremref *jremref;
+
+ jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
+ jremref->jr_state = ATTACHED;
+ newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
+ nlink, ip->i_mode);
+ jremref->jr_dirrem = dirrem;
+
+ return (jremref);
+}
+
+static inline void
+newinoref(inoref, ino, parent, diroff, nlink, mode)
+ struct inoref *inoref;
+ ino_t ino;
+ ino_t parent;
+ off_t diroff;
+ nlink_t nlink;
+ uint16_t mode;
+{
+
+ inoref->if_jsegdep = newjsegdep(&inoref->if_list);
+ inoref->if_diroff = diroff;
+ inoref->if_ino = ino;
+ inoref->if_parent = parent;
+ inoref->if_nlink = nlink;
+ inoref->if_mode = mode;
+}
+
+/*
+ * Allocate a new jaddref to track the addition of ino to dp at diroff. The
+ * directory offset may not be known until later. The caller is
+ * responsible for adding the entry to the journal when this information
+ * is available. nlink should be the link count prior to the addition and
+ * mode is only required to have the correct FMT.
+ */
+static struct jaddref *
+newjaddref(dp, ino, diroff, nlink, mode)
+ struct inode *dp;
+ ino_t ino;
+ off_t diroff;
+ int16_t nlink;
+ uint16_t mode;
+{
+ struct jaddref *jaddref;
+
+ jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
+ jaddref->ja_state = ATTACHED;
+ jaddref->ja_mkdir = NULL;
+ newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
+
+ return (jaddref);
+}
+
+/*
+ * Create a new free dependency for a freework. The caller is responsible
+ * for adjusting the reference count when it has the lock held. The freedep
+ * will track an outstanding bitmap write that will ultimately clear the
+ * freework to continue.
+ */
+static struct freedep *
+newfreedep(struct freework *freework)
+{
+ struct freedep *freedep;
+
+ freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
+ workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
+ freedep->fd_freework = freework;
+
+ return (freedep);
+}
+
+/*
+ * Free a freedep structure once the buffer it is linked to is written. If
+ * this is the last reference to the freework schedule it for completion.
+ */
+static void
+free_freedep(freedep)
+ struct freedep *freedep;
+{
+
+ if (--freedep->fd_freework->fw_ref == 0)
+ add_to_worklist(&freedep->fd_freework->fw_list, 1);
+ WORKITEM_FREE(freedep, D_FREEDEP);
+}
+
+/*
+ * Allocate a new freework structure that may be a level in an indirect
+ * when parent is not NULL or a top level block when it is NULL. The top
+ * level freework structures are allocated without lk held and before the
+ * freeblks is visible outside of softdep_setup_freeblocks().
+ */
+static struct freework *
+newfreework(freeblks, parent, lbn, nb, frags, journal)
+ struct freeblks *freeblks;
+ struct freework *parent;
+ ufs_lbn_t lbn;
+ ufs2_daddr_t nb;
+ int frags;
+ int journal;
+{
+ struct freework *freework;
+
+ freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
+ workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
+ freework->fw_freeblks = freeblks;
+ freework->fw_parent = parent;
+ freework->fw_lbn = lbn;
+ freework->fw_blkno = nb;
+ freework->fw_frags = frags;
+ freework->fw_ref = 0;
+ freework->fw_off = 0;
+ LIST_INIT(&freework->fw_jwork);
+
+ if (parent == NULL) {
+ WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
+ &freework->fw_list);
+ freeblks->fb_ref++;
+ }
+ if (journal)
+ newjfreeblk(freeblks, lbn, nb, frags);
+
+ return (freework);
+}
+
+/*
+ * Allocate a new jfreeblk to journal the release of a top level block
+ * pointer when truncating a file. The caller must add this to the
+ * worklist when lk is held.
+ */
+static struct jfreeblk *
+newjfreeblk(freeblks, lbn, blkno, frags)
+ struct freeblks *freeblks;
+ ufs_lbn_t lbn;
+ ufs2_daddr_t blkno;
+ int frags;
+{
+ struct jfreeblk *jfreeblk;
+
+ jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
+ jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
+ jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
+ jfreeblk->jf_ino = freeblks->fb_previousinum;
+ jfreeblk->jf_lbn = lbn;
+ jfreeblk->jf_blkno = blkno;
+ jfreeblk->jf_frags = frags;
+ jfreeblk->jf_freeblks = freeblks;
+ LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
+
+ return (jfreeblk);
+}
+
+static void move_newblock_dep(struct jaddref *, struct inodedep *);
+/*
+ * If we're canceling a new bitmap we have to search for another ref
+ * to move into the bmsafemap dep. This might be better expressed
+ * with another structure.
+ */
+static void
+move_newblock_dep(jaddref, inodedep)
+ struct jaddref *jaddref;
+ struct inodedep *inodedep;
+{
+ struct inoref *inoref;
+ struct jaddref *jaddrefn;
+
+ jaddrefn = NULL;
+ for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
+ inoref = TAILQ_NEXT(inoref, if_deps)) {
+ if ((jaddref->ja_state & NEWBLOCK) &&
+ inoref->if_list.wk_type == D_JADDREF) {
+ jaddrefn = (struct jaddref *)inoref;
+ break;
+ }
+ }
+ if (jaddrefn == NULL)
+ return;
+ jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
+ jaddrefn->ja_state |= jaddref->ja_state &
+ (ATTACHED | UNDONE | NEWBLOCK);
+ jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
+ jaddref->ja_state |= ATTACHED;
+ LIST_REMOVE(jaddref, ja_bmdeps);
+ LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
+ ja_bmdeps);
+}
+
+/*
+ * Cancel a jaddref either before it has been written or while it is being
+ * written. This happens when a link is removed before the add reaches
+ * the disk. The jaddref dependency is kept linked into the bmsafemap
+ * and inode to prevent the link count or bitmap from reaching the disk
+ * until handle_workitem_remove() re-adjusts the counts and bitmaps as
+ * required.
+ *
+ * Returns 1 if the canceled addref requires journaling of the remove and
+ * 0 otherwise.
+ */
+static int
+cancel_jaddref(jaddref, inodedep, wkhd)
+ struct jaddref *jaddref;
+ struct inodedep *inodedep;
+ struct workhead *wkhd;
+{
+ struct inoref *inoref;
+ struct jsegdep *jsegdep;
+ int needsj;
+
+ KASSERT((jaddref->ja_state & COMPLETE) == 0,
+ ("cancel_jaddref: Canceling complete jaddref"));
+ if (jaddref->ja_state & (IOSTARTED | COMPLETE))
+ needsj = 1;
+ else
+ needsj = 0;
+ if (inodedep == NULL)
+ if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
+ 0, &inodedep) == 0)
+ panic("cancel_jaddref: Lost inodedep");
+ /*
+ * We must adjust the nlink of any reference operation that follows
+ * us so that it is consistent with the in-memory reference. This
+ * ensures that inode nlink rollbacks always have the correct link.
+ */
+ if (needsj == 0)
+ for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
+ inoref = TAILQ_NEXT(inoref, if_deps))
+ inoref->if_nlink--;
+ jsegdep = inoref_jseg(&jaddref->ja_ref);
+ if (jaddref->ja_state & NEWBLOCK)
+ move_newblock_dep(jaddref, inodedep);
+ if (jaddref->ja_state & IOWAITING) {
+ jaddref->ja_state &= ~IOWAITING;
+ wakeup(&jaddref->ja_list);
+ }
+ jaddref->ja_mkdir = NULL;
+ if (jaddref->ja_state & IOSTARTED) {
+ jaddref->ja_state &= ~IOSTARTED;
+ WORKLIST_REMOVE(&jaddref->ja_list);
+ WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
+ } else {
+ free_jsegdep(jsegdep);
+ remove_from_journal(&jaddref->ja_list);
+ }
+ /*
+ * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
+ * can arrange for them to be freed with the bitmap. Otherwise we
+ * no longer need this addref attached to the inoreflst and it
+ * will incorrectly adjust nlink if we leave it.
+ */
+ if ((jaddref->ja_state & NEWBLOCK) == 0) {
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ jaddref->ja_state |= COMPLETE;
+ free_jaddref(jaddref);
+ return (needsj);
+ }
+ jaddref->ja_state |= GOINGAWAY;
+ /*
+ * Leave the head of the list for jsegdeps for fast merging.
+ */
+ if (LIST_FIRST(wkhd) != NULL) {
+ jaddref->ja_state |= ONWORKLIST;
+ LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
+ } else
+ WORKLIST_INSERT(wkhd, &jaddref->ja_list);
+
+ return (needsj);
+}
+
+/*
+ * Attempt to free a jaddref structure when some work completes. This
+ * should only succeed once the entry is written and all dependencies have
+ * been notified.
+ */
+static void
+free_jaddref(jaddref)
+ struct jaddref *jaddref;
+{
+
+ if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
+ return;
+ if (jaddref->ja_ref.if_jsegdep)
+ panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
+ jaddref, jaddref->ja_state);
+ if (jaddref->ja_state & NEWBLOCK)
+ LIST_REMOVE(jaddref, ja_bmdeps);
+ if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
+ panic("free_jaddref: Bad state %p(0x%X)",
+ jaddref, jaddref->ja_state);
+ if (jaddref->ja_mkdir != NULL)
+ panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
+ WORKITEM_FREE(jaddref, D_JADDREF);
+}
+
+/*
+ * Free a jremref structure once it has been written or discarded.
+ */
+static void
+free_jremref(jremref)
+ struct jremref *jremref;
+{
+
+ if (jremref->jr_ref.if_jsegdep)
+ free_jsegdep(jremref->jr_ref.if_jsegdep);
+ if (jremref->jr_state & IOSTARTED)
+ panic("free_jremref: IO still pending");
+ WORKITEM_FREE(jremref, D_JREMREF);
+}
+
+/*
+ * Free a jnewblk structure.
+ */
+static void
+free_jnewblk(jnewblk)
+ struct jnewblk *jnewblk;
+{
+
+ if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
+ return;
+ LIST_REMOVE(jnewblk, jn_deps);
+ if (jnewblk->jn_newblk != NULL)
+ panic("free_jnewblk: Dependency still attached.");
+ WORKITEM_FREE(jnewblk, D_JNEWBLK);
+}
+
+/*
+ * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk
+ * is kept linked into the bmsafemap until the free completes, thus
+ * preventing the modified state from ever reaching disk. The free
+ * routine must pass this structure via ffs_blkfree() to
+ * softdep_setup_freeblks() so there is no race in releasing the space.
+ */
+static void
+cancel_jnewblk(jnewblk, wkhd)
+ struct jnewblk *jnewblk;
+ struct workhead *wkhd;
+{
+ struct jsegdep *jsegdep;
+
+ jsegdep = jnewblk->jn_jsegdep;
+ jnewblk->jn_jsegdep = NULL;
+ free_jsegdep(jsegdep);
+ jnewblk->jn_newblk = NULL;
+ jnewblk->jn_state |= GOINGAWAY;
+ if (jnewblk->jn_state & IOSTARTED) {
+ jnewblk->jn_state &= ~IOSTARTED;
+ WORKLIST_REMOVE(&jnewblk->jn_list);
+ } else
+ remove_from_journal(&jnewblk->jn_list);
+ /*
+ * Leave the head of the list for jsegdeps for fast merging.
+ */
+ if (LIST_FIRST(wkhd) != NULL) {
+ jnewblk->jn_state |= ONWORKLIST;
+ LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
+ } else
+ WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
+ if (jnewblk->jn_state & IOWAITING) {
+ jnewblk->jn_state &= ~IOWAITING;
+ wakeup(&jnewblk->jn_list);
+ }
+}
+
+static void
+free_jfreeblk(jfreeblk)
+ struct jfreeblk *jfreeblk;
+{
+
+ WORKITEM_FREE(jfreeblk, D_JFREEBLK);
+}
+
+/*
+ * Release one reference to a jseg and free it if the count reaches 0. This
+ * should eventually reclaim journal space as well.
+ */
+static void
+free_jseg(jseg)
+ struct jseg *jseg;
+{
+ struct jblocks *jblocks;
+
+ KASSERT(jseg->js_refs > 0,
+ ("free_jseg: Invalid refcnt %d", jseg->js_refs));
+ if (--jseg->js_refs != 0)
+ return;
+ /*
+ * Free only those jsegs that have no segments allocated before them,
+ * to preserve the journal space ordering.
+ */
+ jblocks = jseg->js_jblocks;
+ while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
+ jblocks->jb_oldestseq = jseg->js_seq;
+ if (jseg->js_refs != 0)
+ break;
+ TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
+ jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
+ KASSERT(LIST_EMPTY(&jseg->js_entries),
+ ("free_jseg: Freed jseg has valid entries."));
+ WORKITEM_FREE(jseg, D_JSEG);
+ }
+}
+
+/*
+ * Release a jsegdep and decrement the jseg count.
+ */
+static void
+free_jsegdep(jsegdep)
+ struct jsegdep *jsegdep;
+{
+
+ if (jsegdep->jd_seg)
+ free_jseg(jsegdep->jd_seg);
+ WORKITEM_FREE(jsegdep, D_JSEGDEP);
+}
+
+/*
+ * Wait for a journal item to make it to disk. Initiate journal processing
+ * if required.
+ */
+static void
+jwait(wk)
+ struct worklist *wk;
+{
+
+ stat_journal_wait++;
+ /*
+ * If IO has not started, we process the journal. We can't mark the
+ * worklist item as IOWAITING because we drop the lock while
+ * processing the journal and the worklist entry may be freed after
+ * this point. The caller may call back in and re-issue the request.
+ */
+ if ((wk->wk_state & IOSTARTED) == 0) {
+ softdep_process_journal(wk->wk_mp, MNT_WAIT);
+ return;
+ }
+ wk->wk_state |= IOWAITING;
+ msleep(wk, &lk, PRIBIO, "jwait", 0);
+}
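
Since jwait() may return after merely initiating journal processing, and since it drops lk while sleeping, callers re-test their wait condition in a loop. softdep_setup_trunc() below uses exactly this idiom:

	while (jsegdep->jd_seg == NULL) {
		stat_jwait_freeblks++;
		jwait(&jtrunc->jt_list);
	}
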
+
+/*
+ * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
+ * appropriate. This is a convenience function to reduce duplicate code
+ * for the setup and revert functions below.
+ */
+static struct inodedep *
+inodedep_lookup_ip(ip)
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+
+ KASSERT(ip->i_nlink >= ip->i_effnlink,
+ ("inodedep_lookup_ip: bad delta"));
+ (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
+ DEPALLOC, &inodedep);
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+
+ return (inodedep);
+}
+
+/*
+ * Create a journal entry that describes a truncate that we're about to
+ * perform. The inode allocations and frees between here and the completion
+ * of the operation are done asynchronously and without journaling. At
+ * the end of the operation the vnode is sync'd and the journal space
+ * is released. Recovery will discover the partially completed truncate
+ * and complete it.
+ */
+void *
+softdep_setup_trunc(vp, length, flags)
+ struct vnode *vp;
+ off_t length;
+ int flags;
+{
+ struct jsegdep *jsegdep;
+ struct jtrunc *jtrunc;
+ struct ufsmount *ump;
+ struct inode *ip;
+
+ softdep_prealloc(vp, MNT_WAIT);
+ ip = VTOI(vp);
+ ump = VFSTOUFS(vp->v_mount);
+ jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
+ jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
+ jtrunc->jt_ino = ip->i_number;
+ jtrunc->jt_extsize = 0;
+ jtrunc->jt_size = length;
+ if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
+ jtrunc->jt_extsize = ip->i_din2->di_extsize;
+ if ((flags & IO_NORMAL) == 0)
+ jtrunc->jt_size = DIP(ip, i_size);
+ ACQUIRE_LOCK(&lk);
+ add_to_journal(&jtrunc->jt_list);
+ while (jsegdep->jd_seg == NULL) {
+ stat_jwait_freeblks++;
+ jwait(&jtrunc->jt_list);
+ }
+ FREE_LOCK(&lk);
+
+ return (jsegdep);
+}
+
+/*
+ * After synchronous truncation is complete, we fsync the vnode and
+ * release the jsegdep so the journal space can be freed.
+ */
+int
+softdep_complete_trunc(vp, cookie)
+ struct vnode *vp;
+ void *cookie;
+{
+ int error;
+
+ error = ffs_syncvnode(vp, MNT_WAIT);
+ ACQUIRE_LOCK(&lk);
+ free_jsegdep((struct jsegdep *)cookie);
+ FREE_LOCK(&lk);
+
+ return (error);
+}
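
Taken together, these two routines bracket a synchronous truncate. A minimal caller sketch assembled from the comments and signatures above; the middle step stands in for whatever unjournaled block frees the caller performs:

	void *cookie;
	int error;

	cookie = softdep_setup_trunc(vp, length, flags);
	/* ... free the file's blocks; these frees are not journaled ... */
	error = softdep_complete_trunc(vp, cookie);
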
+
+/*
+ * Called prior to creating a new inode and linking it to a directory. The
+ * jaddref structure must already be allocated by softdep_setup_inomapdep
+ * and it is discovered here so we can initialize the mode and update
+ * nlinkdelta.
+ */
+void
+softdep_setup_create(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ KASSERT(ip->i_nlink == 1,
+ ("softdep_setup_create: Invalid link count."));
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_create: No addref structure present."));
+ jaddref->ja_mode = ip->i_mode;
+ }
+ softdep_prelink(dvp, NULL);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Create a jaddref structure to track the addition of a DOTDOT link when
+ * we are reparenting an inode as part of a rename. This jaddref will be
+ * found by softdep_setup_directory_change. Adjusts nlinkdelta for
+ * non-journaling softdep.
+ */
+void
+softdep_setup_dotdot_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+ struct vnode *vp;
+
+ dvp = ITOV(dp);
+ vp = ITOV(ip);
+ jaddref = NULL;
+ /*
+ * We don't set MKDIR_PARENT as this is not tied to a mkdir and
+ * is used as a normal link would be.
+ */
+ if (DOINGSUJ(dvp))
+ jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
+ dp->i_effnlink - 1, dp->i_mode);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(dp);
+ if (jaddref)
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Create a jaddref structure to track a new link to an inode. The directory
+ * offset is not known until softdep_setup_directory_add or
+ * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling
+ * softdep.
+ */
+void
+softdep_setup_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ jaddref = NULL;
+ if (DOINGSUJ(dvp))
+ jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
+ ip->i_mode);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (jaddref)
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to create the jaddref structures to track . and .. references as
+ * well as to look up and further initialize the incomplete jaddref created
+ * by softdep_setup_inomapdep when the inode was allocated. Adjusts
+ * nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_setup_mkdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *dotdotaddref;
+ struct jaddref *dotaddref;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ dotaddref = dotdotaddref = NULL;
+ if (DOINGSUJ(dvp)) {
+ dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
+ ip->i_mode);
+ dotaddref->ja_state |= MKDIR_BODY;
+ dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
+ dp->i_effnlink - 1, dp->i_mode);
+ dotdotaddref->ja_state |= MKDIR_PARENT;
+ }
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL,
+ ("softdep_setup_mkdir: No addref structure present."));
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_mkdir: bad parent %d",
+ jaddref->ja_parent));
+ jaddref->ja_mode = ip->i_mode;
+ TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
+ if_deps);
+ }
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp))
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
+ &dotdotaddref->ja_ref, if_deps);
+ softdep_prelink(ITOV(dp), NULL);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to track nlinkdelta of the inode and parent directories prior to
+ * unlinking a directory.
+ */
+void
+softdep_setup_rmdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to track nlinkdelta of the inode and parent directories prior to
+ * unlink.
+ */
+void
+softdep_setup_unlink(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed non-directory
+ * creation. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_create(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_create: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed dotdot link
+ * creation. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_dotdot_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_dotdot_link: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed link
+ * addition. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_link: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed mkdir
+ * attempt. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_mkdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_mkdir: dotdot addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_mkdir: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_mkdir: dot addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to correct nlinkdelta after a failed rmdir.
+ */
+void
+softdep_revert_rmdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+
+ ACQUIRE_LOCK(&lk);
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ FREE_LOCK(&lk);
+}
+
/*
* Protecting the freemaps (or bitmaps).
*
@@ -1536,6 +3971,22 @@ softdep_setup_inomapdep(bp, ip, newinum)
{
struct inodedep *inodedep;
struct bmsafemap *bmsafemap;
+ struct jaddref *jaddref;
+ struct mount *mp;
+ struct fs *fs;
+
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_ump->um_fs;
+ jaddref = NULL;
+
+ /*
+ * Allocate the journal reference add structure so that the bitmap
+ * can be dependent on it.
+ */
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = newjaddref(ip, newinum, 0, 0, 0);
+ jaddref->ja_state |= NEWBLOCK;
+ }
/*
* Create a dependency for the newly allocated inode.
@@ -1544,14 +3995,20 @@ softdep_setup_inomapdep(bp, ip, newinum)
* the cylinder group map from which it was allocated.
*/
ACQUIRE_LOCK(&lk);
- if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
- &inodedep)))
- panic("softdep_setup_inomapdep: dependency for new inode "
- "already exists");
- inodedep->id_buf = bp;
+ if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
+ panic("softdep_setup_inomapdep: dependency %p for new"
+ "inode already exists", inodedep);
+ bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
+ if (jaddref) {
+ LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ } else {
+ inodedep->id_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
+ }
+ inodedep->id_bmsafemap = bmsafemap;
inodedep->id_state &= ~DEPCOMPLETE;
- bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
- LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
FREE_LOCK(&lk);
}
@@ -1560,29 +4017,98 @@ softdep_setup_inomapdep(bp, ip, newinum)
* allocate block or fragment.
*/
void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
struct buf *bp; /* buffer for cylgroup block with block map */
struct mount *mp; /* filesystem doing allocation */
ufs2_daddr_t newblkno; /* number of newly allocated block */
+ int frags; /* Number of fragments. */
+ int oldfrags; /* Previous number of fragments for extend. */
{
struct newblk *newblk;
struct bmsafemap *bmsafemap;
+ struct jnewblk *jnewblk;
struct fs *fs;
fs = VFSTOUFS(mp)->um_fs;
+ jnewblk = NULL;
/*
* Create a dependency for the newly allocated block.
* Add it to the dependency list for the buffer holding
* the cylinder group map from which it was allocated.
*/
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
+ jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
+ jnewblk->jn_state = ATTACHED;
+ jnewblk->jn_blkno = newblkno;
+ jnewblk->jn_frags = frags;
+ jnewblk->jn_oldfrags = oldfrags;
+#ifdef SUJ_DEBUG
+ {
+ struct cg *cgp;
+ uint8_t *blksfree;
+ long bno;
+ int i;
+
+ cgp = (struct cg *)bp->b_data;
+ blksfree = cg_blksfree(cgp);
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (isset(blksfree, bno + i))
+ panic("softdep_setup_blkmapdep: "
+ "free fragment %d from %d-%d "
+ "state 0x%X dep %p", i,
+ jnewblk->jn_oldfrags,
+ jnewblk->jn_frags,
+ jnewblk->jn_state,
+ jnewblk->jn_newblk);
+ }
+ }
+#endif
+ }
ACQUIRE_LOCK(&lk);
- if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
+ if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
panic("softdep_setup_blkmapdep: found block");
- newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
- LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
+ dtog(fs, newblkno));
+ if (jnewblk) {
+ jnewblk->jn_newblk = newblk;
+ LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
+ } else {
+ newblk->nb_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ }
+ newblk->nb_bmsafemap = bmsafemap;
+ newblk->nb_jnewblk = jnewblk;
FREE_LOCK(&lk);
}
+#define BMSAFEMAP_HASH(fs, cg) \
+ (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
+
+static int
+bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
+ struct bmsafemap_hashhead *bmsafemaphd;
+ struct mount *mp;
+ int cg;
+ struct bmsafemap **bmsafemapp;
+{
+ struct bmsafemap *bmsafemap;
+
+ LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
+ if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
+ break;
+ if (bmsafemap) {
+ *bmsafemapp = bmsafemap;
+ return (1);
+ }
+ *bmsafemapp = NULL;
+
+ return (0);
+}
+
/*
* Find the bmsafemap associated with a cylinder group buffer.
* If none exists, create one. The buffer must be locked when
@@ -1590,27 +4116,43 @@ softdep_setup_blkmapdep(bp, mp, newblkno)
* splbio interrupts blocked.
*/
static struct bmsafemap *
-bmsafemap_lookup(mp, bp)
+bmsafemap_lookup(mp, bp, cg)
struct mount *mp;
struct buf *bp;
+ int cg;
{
- struct bmsafemap *bmsafemap;
+ struct bmsafemap_hashhead *bmsafemaphd;
+ struct bmsafemap *bmsafemap, *collision;
struct worklist *wk;
+ struct fs *fs;
mtx_assert(&lk, MA_OWNED);
- LIST_FOREACH(wk, &bp->b_dep, wk_list)
- if (wk->wk_type == D_BMSAFEMAP)
- return (WK_BMSAFEMAP(wk));
+ if (bp)
+ LIST_FOREACH(wk, &bp->b_dep, wk_list)
+ if (wk->wk_type == D_BMSAFEMAP)
+ return (WK_BMSAFEMAP(wk));
+ fs = VFSTOUFS(mp)->um_fs;
+ bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
+ if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
+ return (bmsafemap);
FREE_LOCK(&lk);
bmsafemap = malloc(sizeof(struct bmsafemap),
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
bmsafemap->sm_buf = bp;
- LIST_INIT(&bmsafemap->sm_allocdirecthd);
- LIST_INIT(&bmsafemap->sm_allocindirhd);
LIST_INIT(&bmsafemap->sm_inodedephd);
+ LIST_INIT(&bmsafemap->sm_inodedepwr);
LIST_INIT(&bmsafemap->sm_newblkhd);
+ LIST_INIT(&bmsafemap->sm_newblkwr);
+ LIST_INIT(&bmsafemap->sm_jaddrefhd);
+ LIST_INIT(&bmsafemap->sm_jnewblkhd);
ACQUIRE_LOCK(&lk);
+ if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
+ WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ return (collision);
+ }
+ bmsafemap->sm_cg = cg;
+ LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
return (bmsafemap);
}
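
Note the unlock/allocate/relock pattern above: lk cannot be held across malloc(), so after the lock is reacquired the lookup is repeated, and a loser of the race frees its copy in favor of the collision. A generic sketch of the idiom with hypothetical names (find, M_TYPE, D_TYPE):

	if (find(head, key, &item) == 1)
		return (item);
	FREE_LOCK(&lk);
	item = malloc(sizeof(*item), M_TYPE, M_SOFTDEP_FLAGS);
	/* ... initialize item while unlocked ... */
	ACQUIRE_LOCK(&lk);
	if (find(head, key, &collision) == 1) {
		WORKITEM_FREE(item, D_TYPE);	/* lost the race */
		return (collision);
	}
	LIST_INSERT_HEAD(head, item, hash_entry);
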
@@ -1645,9 +4187,9 @@ bmsafemap_lookup(mp, bp)
* unreferenced fragments.
*/
void
-softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
+softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip; /* inode to which block is being added */
- ufs_lbn_t lbn; /* block pointer within inode */
+ ufs_lbn_t off; /* block pointer within inode */
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
long newsize; /* size of new block */
@@ -1656,34 +4198,33 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
- struct bmsafemap *bmsafemap;
+ struct freefrag *freefrag;
struct inodedep *inodedep;
struct pagedep *pagedep;
+ struct jnewblk *jnewblk;
struct newblk *newblk;
struct mount *mp;
+ ufs_lbn_t lbn;
+ lbn = bp->b_lblkno;
mp = UFSTOVFS(ip->i_ump);
- adp = malloc(sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
- adp->ad_lbn = lbn;
- adp->ad_newblkno = newblkno;
- adp->ad_oldblkno = oldblkno;
- adp->ad_newsize = newsize;
- adp->ad_oldsize = oldsize;
- adp->ad_state = ATTACHED;
- LIST_INIT(&adp->ad_newdirblk);
- if (newblkno == oldblkno)
- adp->ad_freefrag = NULL;
+ if (oldblkno && oldblkno != newblkno)
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
else
- adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
+ freefrag = NULL;
ACQUIRE_LOCK(&lk);
- if (lbn >= NDADDR) {
+ if (off >= NDADDR) {
+ if (lbn > 0)
+ panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
+ lbn, off);
/* allocating an indirect block */
if (oldblkno != 0)
panic("softdep_setup_allocdirect: non-zero indir");
} else {
+ if (off != lbn)
+ panic("softdep_setup_allocdirect: lbn %jd != off %jd",
+ lbn, off);
/*
* Allocating a direct block.
*
@@ -1692,26 +4233,39 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
- pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
+ pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
+ &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
}
- if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
+ if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocdirect: lost block");
- if (newblk->nb_state == DEPCOMPLETE) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- adp->ad_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("softdep_setup_allocdirect: newblk already initialized"));
+ /*
+ * Convert the newblk to an allocdirect.
+ */
+ newblk->nb_list.wk_type = D_ALLOCDIRECT;
+ adp = (struct allocdirect *)newblk;
+ newblk->nb_freefrag = freefrag;
+ adp->ad_offset = off;
+ adp->ad_oldblkno = oldblkno;
+ adp->ad_newsize = newsize;
+ adp->ad_oldsize = oldsize;
+ /*
+ * Finish initializing the journal.
+ */
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
- WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
+
+ WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
@@ -1726,24 +4280,25 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
*/
adphead = &inodedep->id_newinoupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
- if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
+ if (oldadp == NULL || oldadp->ad_offset <= off) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
- if (oldadp != NULL && oldadp->ad_lbn == lbn)
+ if (oldadp != NULL && oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
- if (oldadp->ad_lbn >= lbn)
+ if (oldadp->ad_offset >= off)
break;
}
if (oldadp == NULL)
panic("softdep_setup_allocdirect: lost entry");
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
- if (oldadp->ad_lbn == lbn)
+ if (oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
+
FREE_LOCK(&lk);
}
@@ -1761,10 +4316,11 @@ allocdirect_merge(adphead, newadp, oldadp)
struct freefrag *freefrag;
struct newdirblk *newdirblk;
+ freefrag = NULL;
mtx_assert(&lk, MA_OWNED);
if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
newadp->ad_oldsize != oldadp->ad_newsize ||
- newadp->ad_lbn >= NDADDR)
+ newadp->ad_offset >= NDADDR)
panic("%s %jd != new %jd || old size %ld != new %ld",
"allocdirect_merge: old blkno",
(intmax_t)newadp->ad_oldblkno,
@@ -1779,7 +4335,7 @@ allocdirect_merge(adphead, newadp, oldadp)
* This action is done by swapping the freefrag dependencies.
* The new dependency gains the old one's freefrag, and the
* old one gets the new one and then immediately puts it on
- * the worklist when it is freed by free_allocdirect. It is
+ * the worklist when it is freed by free_newblk. It is
* not possible to do this swap when the old dependency had a
* non-zero size but no previous fragment to free. This condition
* arises when the new block is an extension of the old block.
@@ -1788,8 +4344,8 @@ allocdirect_merge(adphead, newadp, oldadp)
* the old dependency, so cannot legitimately be freed until the
* conditions for the new dependency are fulfilled.
*/
+ freefrag = newadp->ad_freefrag;
if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
- freefrag = newadp->ad_freefrag;
newadp->ad_freefrag = oldadp->ad_freefrag;
oldadp->ad_freefrag = freefrag;
}
@@ -1804,32 +4360,118 @@ allocdirect_merge(adphead, newadp, oldadp)
panic("allocdirect_merge: extra newdirblk");
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
}
- free_allocdirect(adphead, oldadp, 0);
+ TAILQ_REMOVE(adphead, oldadp, ad_next);
+ /*
+ * We need to move any journal dependencies over to the freefrag
+ * that releases this block, if one exists. Otherwise we are
+ * extending an existing block, and we'll wait until that is
+ * complete to release the journal space and extend the
+ * new journal to cover this old space as well.
+ */
+ if (freefrag == NULL) {
+ struct jnewblk *jnewblk;
+ struct jnewblk *njnewblk;
+
+ if (oldadp->ad_newblkno != newadp->ad_newblkno)
+ panic("allocdirect_merge: %jd != %jd",
+ oldadp->ad_newblkno, newadp->ad_newblkno);
+ jnewblk = oldadp->ad_block.nb_jnewblk;
+ cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
+ /*
+ * We have an unwritten jnewblk; we need to merge the
+ * frag bits with our own. The newer adp's journal cannot
+ * be written prior to the old one, so there is no need to
+ * check for it here.
+ */
+ if (jnewblk) {
+ njnewblk = newadp->ad_block.nb_jnewblk;
+ if (njnewblk == NULL)
+ panic("allocdirect_merge: No jnewblk");
+ if (jnewblk->jn_state & UNDONE) {
+ njnewblk->jn_state |= UNDONE | NEWBLOCK;
+ njnewblk->jn_state &= ~ATTACHED;
+ jnewblk->jn_state &= ~UNDONE;
+ }
+ njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
+ WORKLIST_REMOVE(&jnewblk->jn_list);
+ jnewblk->jn_state |= ATTACHED | COMPLETE;
+ free_jnewblk(jnewblk);
+ }
+ } else {
+ /*
+ * We can skip journaling for this freefrag and just complete
+ * any pending journal work for the allocdirect that is being
+ * removed after the freefrag completes.
+ */
+ if (freefrag->ff_jfreefrag)
+ cancel_jfreefrag(freefrag->ff_jfreefrag);
+ cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
+ }
+ free_newblk(&oldadp->ad_block);
}
-
+
/*
- * Allocate a new freefrag structure if needed.
+ * Allocate a jfreefrag structure to journal a single block free.
+ */
+static struct jfreefrag *
+newjfreefrag(freefrag, ip, blkno, size, lbn)
+ struct freefrag *freefrag;
+ struct inode *ip;
+ ufs2_daddr_t blkno;
+ long size;
+ ufs_lbn_t lbn;
+{
+ struct jfreefrag *jfreefrag;
+ struct fs *fs;
+
+ fs = ip->i_fs;
+ jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
+ M_SOFTDEP_FLAGS);
+ workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
+ jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
+ jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
+ jfreefrag->fr_ino = ip->i_number;
+ jfreefrag->fr_lbn = lbn;
+ jfreefrag->fr_blkno = blkno;
+ jfreefrag->fr_frags = numfrags(fs, size);
+ jfreefrag->fr_freefrag = freefrag;
+
+ return (jfreefrag);
+}
+
+/*
+ * Allocate a new freefrag structure.
*/
static struct freefrag *
-newfreefrag(ip, blkno, size)
+newfreefrag(ip, blkno, size, lbn)
struct inode *ip;
ufs2_daddr_t blkno;
long size;
+ ufs_lbn_t lbn;
{
struct freefrag *freefrag;
struct fs *fs;
- if (blkno == 0)
- return (NULL);
fs = ip->i_fs;
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
freefrag = malloc(sizeof(struct freefrag),
- M_FREEFRAG, M_SOFTDEP_FLAGS);
+ M_FREEFRAG, M_SOFTDEP_FLAGS);
workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
+ freefrag->ff_state = ATTACHED;
+ LIST_INIT(&freefrag->ff_jwork);
freefrag->ff_inum = ip->i_number;
freefrag->ff_blkno = blkno;
freefrag->ff_fragsize = size;
+
+ if (fs->fs_flags & FS_SUJ) {
+ freefrag->ff_jfreefrag =
+ newjfreefrag(freefrag, ip, blkno, size, lbn);
+ } else {
+ freefrag->ff_state |= DEPCOMPLETE;
+ freefrag->ff_jfreefrag = NULL;
+ }
+
return (freefrag);
}
@@ -1842,9 +4484,17 @@ handle_workitem_freefrag(freefrag)
struct freefrag *freefrag;
{
struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
+ struct workhead wkhd;
+ /*
+ * It would be illegal to add new completion items to the
+ * freefrag after it was scheduled to be done, so it must be
+ * safe to modify the list head here.
+ */
+ LIST_INIT(&wkhd);
+ LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
- freefrag->ff_fragsize, freefrag->ff_inum);
+ freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
ACQUIRE_LOCK(&lk);
WORKITEM_FREE(freefrag, D_FREEFRAG);
FREE_LOCK(&lk);
@@ -1856,9 +4506,9 @@ handle_workitem_freefrag(freefrag)
* See the description of softdep_setup_allocdirect above for details.
*/
void
-softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
+softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip;
- ufs_lbn_t lbn;
+ ufs_lbn_t off;
ufs2_daddr_t newblkno;
ufs2_daddr_t oldblkno;
long newsize;
@@ -1867,50 +4517,55 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
- struct bmsafemap *bmsafemap;
+ struct freefrag *freefrag;
struct inodedep *inodedep;
+ struct jnewblk *jnewblk;
struct newblk *newblk;
struct mount *mp;
+ ufs_lbn_t lbn;
+
+ if (off >= NXADDR)
+ panic("softdep_setup_allocext: lbn %lld > NXADDR",
+ (long long)off);
+ lbn = bp->b_lblkno;
mp = UFSTOVFS(ip->i_ump);
- adp = malloc(sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
- adp->ad_lbn = lbn;
- adp->ad_newblkno = newblkno;
- adp->ad_oldblkno = oldblkno;
- adp->ad_newsize = newsize;
- adp->ad_oldsize = oldsize;
- adp->ad_state = ATTACHED | EXTDATA;
- LIST_INIT(&adp->ad_newdirblk);
- if (newblkno == oldblkno)
- adp->ad_freefrag = NULL;
+ if (oldblkno && oldblkno != newblkno)
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
else
- adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
+ freefrag = NULL;
ACQUIRE_LOCK(&lk);
- if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
+ if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocext: lost block");
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("softdep_setup_allocext: newblk already initialized"));
+ /*
+ * Convert the newblk to an allocdirect.
+ */
+ newblk->nb_list.wk_type = D_ALLOCDIRECT;
+ adp = (struct allocdirect *)newblk;
+ newblk->nb_freefrag = freefrag;
+ adp->ad_offset = off;
+ adp->ad_oldblkno = oldblkno;
+ adp->ad_newsize = newsize;
+ adp->ad_oldsize = oldsize;
+ adp->ad_state |= EXTDATA;
+ /*
+ * Finish initializing the journal.
+ */
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
- if (newblk->nb_state == DEPCOMPLETE) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- adp->ad_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
-
- WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
- if (lbn >= NXADDR)
- panic("softdep_setup_allocext: lbn %lld > NXADDR",
- (long long)lbn);
+ WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
@@ -1925,23 +4580,23 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
*/
adphead = &inodedep->id_newextupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
- if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
+ if (oldadp == NULL || oldadp->ad_offset <= off) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
- if (oldadp != NULL && oldadp->ad_lbn == lbn)
+ if (oldadp != NULL && oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
- if (oldadp->ad_lbn >= lbn)
+ if (oldadp->ad_offset >= off)
break;
}
if (oldadp == NULL)
panic("softdep_setup_allocext: lost entry");
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
- if (oldadp->ad_lbn == lbn)
+ if (oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
}
@@ -1975,22 +4630,39 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
* Allocate a new allocindir structure.
*/
static struct allocindir *
-newallocindir(ip, ptrno, newblkno, oldblkno)
+newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
struct inode *ip; /* inode for file being extended */
int ptrno; /* offset of pointer in indirect block */
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
+ ufs_lbn_t lbn;
{
+ struct newblk *newblk;
struct allocindir *aip;
+ struct freefrag *freefrag;
+ struct jnewblk *jnewblk;
- aip = malloc(sizeof(struct allocindir),
- M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
- aip->ai_state = ATTACHED;
+ if (oldblkno)
+ freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
+ else
+ freefrag = NULL;
+ ACQUIRE_LOCK(&lk);
+ if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
+ panic("new_allocindir: lost block");
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("newallocindir: newblk already initialized"));
+ newblk->nb_list.wk_type = D_ALLOCINDIR;
+ newblk->nb_freefrag = freefrag;
+ aip = (struct allocindir *)newblk;
aip->ai_offset = ptrno;
- aip->ai_newblkno = newblkno;
aip->ai_oldblkno = oldblkno;
- aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
return (aip);
}
@@ -2008,22 +4680,28 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
struct buf *nbp; /* buffer holding allocated page */
{
+ struct inodedep *inodedep;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct mount *mp;
+ if (lbn != nbp->b_lblkno)
+ panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
+ lbn, bp->b_lblkno);
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
- aip = newallocindir(ip, ptrno, newblkno, oldblkno);
- ACQUIRE_LOCK(&lk);
+ mp = UFSTOVFS(ip->i_ump);
+ aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
+ (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
/*
* If we are allocating a directory page, then we must
* allocate an associated pagedep to track additions and
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
- pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
+ pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
- WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
- setup_allocindir_phase2(bp, ip, aip);
+ WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
+ setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
FREE_LOCK(&lk);
}
@@ -2039,38 +4717,68 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
int ptrno; /* offset of pointer in indirect block */
ufs2_daddr_t newblkno; /* disk block number being added */
{
+ struct inodedep *inodedep;
struct allocindir *aip;
+ ufs_lbn_t lbn;
+ lbn = nbp->b_lblkno;
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
- aip = newallocindir(ip, ptrno, newblkno, 0);
- ACQUIRE_LOCK(&lk);
- WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
- setup_allocindir_phase2(bp, ip, aip);
+ aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
+ inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
+ WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
+ setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
FREE_LOCK(&lk);
}
+static void
+indirdep_complete(indirdep)
+ struct indirdep *indirdep;
+{
+ struct allocindir *aip;
+
+ LIST_REMOVE(indirdep, ir_next);
+ indirdep->ir_state &= ~ONDEPLIST;
+
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
+ LIST_REMOVE(aip, ai_next);
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * If this indirdep is not attached to a buf it was simply waiting
+ * on completion to clear completehd. free_indirdep() asserts
+ * that nothing is dangling.
+ */
+ if ((indirdep->ir_state & ONWORKLIST) == 0)
+ free_indirdep(indirdep);
+}
+
/*
* Called to finish the allocation of the "aip" allocated
* by one of the two routines above.
*/
static void
-setup_allocindir_phase2(bp, ip, aip)
+setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
struct buf *bp; /* in-memory copy of the indirect block */
struct inode *ip; /* inode for file being extended */
+ struct inodedep *inodedep; /* Inodedep for ip */
struct allocindir *aip; /* allocindir allocated by the above routines */
+ ufs_lbn_t lbn; /* Logical block number for this block. */
{
struct worklist *wk;
+ struct fs *fs;
+ struct newblk *newblk;
struct indirdep *indirdep, *newindirdep;
- struct bmsafemap *bmsafemap;
struct allocindir *oldaip;
struct freefrag *freefrag;
- struct newblk *newblk;
+ struct mount *mp;
ufs2_daddr_t blkno;
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_fs;
mtx_assert(&lk, MA_OWNED);
if (bp->b_lblkno >= 0)
panic("setup_allocindir_phase2: not indir blk");
- for (indirdep = NULL, newindirdep = NULL; ; ) {
+ for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
if (wk->wk_type != D_INDIRDEP)
continue;
@@ -2079,49 +4787,41 @@ setup_allocindir_phase2(bp, ip, aip)
}
if (indirdep == NULL && newindirdep) {
indirdep = newindirdep;
- WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
newindirdep = NULL;
+ WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
+ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
+ &newblk)) {
+ indirdep->ir_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&newblk->nb_indirdeps,
+ indirdep, ir_next);
+ } else
+ indirdep->ir_state |= DEPCOMPLETE;
}
if (indirdep) {
- if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
- &newblk) == 0)
- panic("setup_allocindir: lost block");
- if (newblk->nb_state == DEPCOMPLETE) {
- aip->ai_state |= DEPCOMPLETE;
- aip->ai_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- aip->ai_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
- aip, ai_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
aip->ai_indirdep = indirdep;
/*
* Check to see if there is an existing dependency
* for this block. If there is, merge the old
- * dependency into the new one.
+ * dependency into the new one. This happens
+ * as a result of reallocblks only.
*/
if (aip->ai_oldblkno == 0)
oldaip = NULL;
else
- LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
+ LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
+ ai_next)
if (oldaip->ai_offset == aip->ai_offset)
break;
- freefrag = NULL;
- if (oldaip != NULL) {
- if (oldaip->ai_newblkno != aip->ai_oldblkno)
- panic("setup_allocindir_phase2: blkno");
- aip->ai_oldblkno = oldaip->ai_oldblkno;
- freefrag = aip->ai_freefrag;
- aip->ai_freefrag = oldaip->ai_freefrag;
- oldaip->ai_freefrag = NULL;
- free_allocindir(oldaip, NULL);
- }
+ if (oldaip != NULL)
+ freefrag = allocindir_merge(aip, oldaip);
LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
+ KASSERT(aip->ai_offset >= 0 &&
+ aip->ai_offset < NINDIR(ip->i_ump->um_fs),
+ ("setup_allocindir_phase2: Bad offset %d",
+ aip->ai_offset));
+ KASSERT(indirdep->ir_savebp != NULL,
+ ("setup_allocindir_phase2 NULL ir_savebp"));
if (ip->i_ump->um_fstype == UFS1)
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
[aip->ai_offset] = aip->ai_oldblkno;
@@ -2148,13 +4848,16 @@ setup_allocindir_phase2(bp, ip, aip)
}
newindirdep = malloc(sizeof(struct indirdep),
M_INDIRDEP, M_SOFTDEP_FLAGS);
- workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
- UFSTOVFS(ip->i_ump));
+ workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
newindirdep->ir_state = ATTACHED;
if (ip->i_ump->um_fstype == UFS1)
newindirdep->ir_state |= UFS1FMT;
+ newindirdep->ir_saveddata = NULL;
LIST_INIT(&newindirdep->ir_deplisthd);
LIST_INIT(&newindirdep->ir_donehd);
+ LIST_INIT(&newindirdep->ir_writehd);
+ LIST_INIT(&newindirdep->ir_completehd);
+ LIST_INIT(&newindirdep->ir_jwork);
if (bp->b_blkno == bp->b_lblkno) {
ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
NULL, NULL);
@@ -2169,6 +4872,51 @@ setup_allocindir_phase2(bp, ip, aip)
}
/*
+ * Merge two allocindirs which refer to the same block. Move newblock
+ * dependencies and setup the freefrags appropriately.
+ */
+static struct freefrag *
+allocindir_merge(aip, oldaip)
+ struct allocindir *aip;
+ struct allocindir *oldaip;
+{
+ struct newdirblk *newdirblk;
+ struct freefrag *freefrag;
+ struct worklist *wk;
+
+ if (oldaip->ai_newblkno != aip->ai_oldblkno)
+ panic("allocindir_merge: blkno");
+ aip->ai_oldblkno = oldaip->ai_oldblkno;
+ freefrag = aip->ai_freefrag;
+ aip->ai_freefrag = oldaip->ai_freefrag;
+ oldaip->ai_freefrag = NULL;
+ KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
+ /*
+ * If we are tracking a new directory-block allocation,
+ * move it from the old allocindir to the new allocindir.
+ */
+ if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
+ newdirblk = WK_NEWDIRBLK(wk);
+ WORKLIST_REMOVE(&newdirblk->db_list);
+ if (!LIST_EMPTY(&oldaip->ai_newdirblk))
+ panic("allocindir_merge: extra newdirblk");
+ WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
+ }
+ /*
+ * We can skip journaling for this freefrag and just complete
+ * any pending journal work for the allocindir that is being
+ * removed after the freefrag completes.
+ */
+ if (freefrag->ff_jfreefrag)
+ cancel_jfreefrag(freefrag->ff_jfreefrag);
+ LIST_REMOVE(oldaip, ai_next);
+ cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
+ free_newblk(&oldaip->ai_block);
+
+ return (freefrag);
+}
+
+/*
* Block de-allocation dependencies.
*
* When blocks are de-allocated, the on-disk pointers must be nullified before
@@ -2203,9 +4951,12 @@ softdep_setup_freeblocks(ip, length, flags)
off_t length; /* The new length for the file */
int flags; /* IO_EXT and/or IO_NORMAL */
{
+ struct ufs1_dinode *dp1;
+ struct ufs2_dinode *dp2;
struct freeblks *freeblks;
struct inodedep *inodedep;
struct allocdirect *adp;
+ struct jfreeblk *jfreeblk;
struct bufobj *bo;
struct vnode *vp;
struct buf *bp;
@@ -2213,6 +4964,13 @@ softdep_setup_freeblocks(ip, length, flags)
ufs2_daddr_t extblocks, datablocks;
struct mount *mp;
int i, delay, error;
+ ufs2_daddr_t blkno;
+ ufs_lbn_t tmpval;
+ ufs_lbn_t lbn;
+ long oldextsize;
+ long oldsize;
+ int frags;
+ int needj;
fs = ip->i_fs;
mp = UFSTOVFS(ip->i_ump);
@@ -2221,32 +4979,53 @@ softdep_setup_freeblocks(ip, length, flags)
freeblks = malloc(sizeof(struct freeblks),
M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
+ LIST_INIT(&freeblks->fb_jfreeblkhd);
+ LIST_INIT(&freeblks->fb_jwork);
freeblks->fb_state = ATTACHED;
freeblks->fb_uid = ip->i_uid;
freeblks->fb_previousinum = ip->i_number;
freeblks->fb_devvp = ip->i_devvp;
+ freeblks->fb_chkcnt = 0;
ACQUIRE_LOCK(&lk);
+ /*
+ * If we're truncating a removed file that will never be written,
+ * we don't need to journal the block frees. The canceled journals
+ * for the allocations will suffice.
+ */
+ inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+ if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
+ (fs->fs_flags & FS_SUJ) == 0)
+ needj = 0;
+ else
+ needj = 1;
num_freeblkdep++;
FREE_LOCK(&lk);
extblocks = 0;
if (fs->fs_magic == FS_UFS2_MAGIC)
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
datablocks = DIP(ip, i_blocks) - extblocks;
- if ((flags & IO_NORMAL) == 0) {
- freeblks->fb_oldsize = 0;
- freeblks->fb_chkcnt = 0;
- } else {
- freeblks->fb_oldsize = ip->i_size;
+ if ((flags & IO_NORMAL) != 0) {
+ oldsize = ip->i_size;
ip->i_size = 0;
DIP_SET(ip, i_size, 0);
freeblks->fb_chkcnt = datablocks;
for (i = 0; i < NDADDR; i++) {
- freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
+ blkno = DIP(ip, i_db[i]);
DIP_SET(ip, i_db[i], 0);
+ if (blkno == 0)
+ continue;
+ frags = sblksize(fs, oldsize, i);
+ frags = numfrags(fs, frags);
+ newfreework(freeblks, NULL, i, blkno, frags, needj);
}
- for (i = 0; i < NIADDR; i++) {
- freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
+ for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
+ i++, tmpval *= NINDIR(fs)) {
+ blkno = DIP(ip, i_ib[i]);
DIP_SET(ip, i_ib[i], 0);
+ if (blkno)
+ newfreework(freeblks, NULL, -lbn - i, blkno,
+ fs->fs_frag, needj);
+ lbn += tmpval;
}
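+ /*
+ * Note: each indirect root freed above is recorded under a negative
+ * logical block number, -(lbn + i), where lbn is the first file
+ * block mapped by indirect level i; e.g. with NINDIR(fs) == 4096
+ * the single indirect is entered as -12 and the double indirect
+ * as -4109.
+ */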
/*
* If the file was removed, then the space being freed was
@@ -2259,17 +5038,23 @@ softdep_setup_freeblocks(ip, length, flags)
UFS_UNLOCK(ip->i_ump);
}
}
- if ((flags & IO_EXT) == 0) {
- freeblks->fb_oldextsize = 0;
- } else {
- freeblks->fb_oldextsize = ip->i_din2->di_extsize;
+ if ((flags & IO_EXT) != 0) {
+ oldextsize = ip->i_din2->di_extsize;
ip->i_din2->di_extsize = 0;
freeblks->fb_chkcnt += extblocks;
for (i = 0; i < NXADDR; i++) {
- freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
+ blkno = ip->i_din2->di_extb[i];
ip->i_din2->di_extb[i] = 0;
+ if (blkno == 0)
+ continue;
+ frags = sblksize(fs, oldextsize, i);
+ frags = numfrags(fs, frags);
+ newfreework(freeblks, NULL, -1 - i, blkno, frags,
+ needj);
}
}
+ if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
+ needj = 0;
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
/*
 * Push the zero'ed inode to its disk buffer so that we are free
@@ -2282,12 +5067,17 @@ softdep_setup_freeblocks(ip, length, flags)
brelse(bp);
softdep_error("softdep_setup_freeblocks", error);
}
- if (ip->i_ump->um_fstype == UFS1)
- *((struct ufs1_dinode *)bp->b_data +
- ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
- else
- *((struct ufs2_dinode *)bp->b_data +
- ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
+ if (ip->i_ump->um_fstype == UFS1) {
+ dp1 = ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number));
+ ip->i_din1->di_freelink = dp1->di_freelink;
+ *dp1 = *ip->i_din1;
+ } else {
+ dp2 = ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number));
+ ip->i_din2->di_freelink = dp2->di_freelink;
+ *dp2 = *ip->i_din2;
+ }
/*
* Find and eliminate any inode dependencies.
*/
@@ -2304,7 +5094,9 @@ softdep_setup_freeblocks(ip, length, flags)
*/
delay = (inodedep->id_state & DEPCOMPLETE);
if (delay)
- WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
+ WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
+ else if (needj)
+ freeblks->fb_state |= DEPCOMPLETE | COMPLETE;
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
@@ -2318,14 +5110,19 @@ softdep_setup_freeblocks(ip, length, flags)
merge_inode_lists(&inodedep->id_newinoupdt,
&inodedep->id_inoupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
- free_allocdirect(&inodedep->id_inoupdt, adp, delay);
+ cancel_allocdirect(&inodedep->id_inoupdt, adp,
+ freeblks, delay);
}
if (flags & IO_EXT) {
merge_inode_lists(&inodedep->id_newextupdt,
&inodedep->id_extupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
- free_allocdirect(&inodedep->id_extupdt, adp, delay);
+ cancel_allocdirect(&inodedep->id_extupdt, adp,
+ freeblks, delay);
}
+ LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
+ add_to_journal(&jfreeblk->jf_list);
+
FREE_LOCK(&lk);
bdwrite(bp);
/*
@@ -2349,9 +5146,9 @@ restart:
BO_UNLOCK(bo);
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
- deallocate_dependencies(bp, inodedep);
+ if (deallocate_dependencies(bp, inodedep, freeblks))
+ bp->b_flags |= B_INVAL | B_NOCACHE;
FREE_LOCK(&lk);
- bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
BO_LOCK(bo);
goto restart;
@@ -2361,7 +5158,7 @@ restart:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
(void) free_inodedep(inodedep);
- if(delay) {
+ if (delay) {
freeblks->fb_state |= DEPCOMPLETE;
/*
* If the inode with zeroed block pointers is now on disk
@@ -2371,16 +5168,16 @@ restart:
* the request here than in the !delay case.
*/
if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
- add_to_worklist(&freeblks->fb_list);
+ add_to_worklist(&freeblks->fb_list, 1);
}
FREE_LOCK(&lk);
/*
- * If the inode has never been written to disk (delay == 0),
- * then we can process the freeblks now that we have deleted
- * the dependencies.
+ * If the inode has never been written to disk (delay == 0) and
+ * we're not waiting on any journal writes, then we can process the
+ * freeblks now that we have deleted the dependencies.
*/
- if (!delay)
+ if (!delay && !needj)
handle_workitem_freeblocks(freeblks, 0);
}
@@ -2389,19 +5186,23 @@ restart:
* be reallocated to a new vnode. The buffer must be locked, thus,
* no I/O completion operations can occur while we are manipulating
* its associated dependencies. The mutex is held so that other I/O's
- * associated with related dependencies do not occur.
+ * associated with related dependencies do not occur. Returns 1 if
+ * all dependencies were cleared, 0 otherwise.
*/
-static void
-deallocate_dependencies(bp, inodedep)
+static int
+deallocate_dependencies(bp, inodedep, freeblks)
struct buf *bp;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
{
struct worklist *wk;
struct indirdep *indirdep;
+ struct newdirblk *newdirblk;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct jremref *jremref;
+ struct jmvref *jmvref;
struct dirrem *dirrem;
- struct diradd *dap;
int i;
mtx_assert(&lk, MA_OWNED);
@@ -2410,47 +5211,24 @@ deallocate_dependencies(bp, inodedep)
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
- /*
- * None of the indirect pointers will ever be visible,
- * so they can simply be tossed. GOINGAWAY ensures
- * that allocated pointers will be saved in the buffer
- * cache until they are freed. Note that they will
- * only be able to be found by their physical address
- * since the inode mapping the logical address will
- * be gone. The save buffer used for the safe copy
- * was allocated in setup_allocindir_phase2 using
- * the physical address so it could be used for this
- * purpose. Hence we swap the safe copy with the real
- * copy, allowing the safe copy to be freed and holding
- * on to the real copy for later use in indir_trunc.
- */
- if (indirdep->ir_state & GOINGAWAY)
- panic("deallocate_dependencies: already gone");
- indirdep->ir_state |= GOINGAWAY;
- VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
- while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
- free_allocindir(aip, inodedep);
if (bp->b_lblkno >= 0 ||
bp->b_blkno != indirdep->ir_savebp->b_lblkno)
panic("deallocate_dependencies: not indir");
- bcopy(bp->b_data, indirdep->ir_savebp->b_data,
- bp->b_bcount);
- WORKLIST_REMOVE(wk);
- WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
+ cancel_indirdep(indirdep, bp, inodedep, freeblks);
continue;
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
/*
- * None of the directory additions will ever be
- * visible, so they can simply be tossed.
+ * There should be no directory add dependencies present
+ * as the directory could not be truncated until all
+ * children were removed.
*/
+ KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
+ ("deallocate_dependencies: pendinghd != NULL"));
for (i = 0; i < DAHASHSZ; i++)
- while ((dap =
- LIST_FIRST(&pagedep->pd_diraddhd[i])))
- free_diradd(dap);
- while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
- free_diradd(dap);
+ KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
+ ("deallocate_dependencies: diraddhd != NULL"));
/*
* Copy any directory remove dependencies to the list
* to be processed after the zero'ed inode is written.
@@ -2458,28 +5236,40 @@ deallocate_dependencies(bp, inodedep)
* can be dumped directly onto the work list.
*/
LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
+ /*
+ * If there are any dirrems we wait for
+ * the journal write to complete and
+ * then restart the buf scan as the lock
+ * has been dropped.
+ */
+ while ((jremref =
+ LIST_FIRST(&dirrem->dm_jremrefhd))
+ != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jremref->jr_list);
+ return (0);
+ }
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
if (inodedep == NULL ||
(inodedep->id_state & ALLCOMPLETE) ==
- ALLCOMPLETE)
- add_to_worklist(&dirrem->dm_list);
- else
+ ALLCOMPLETE) {
+ dirrem->dm_state |= COMPLETE;
+ add_to_worklist(&dirrem->dm_list, 0);
+ } else
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
if ((pagedep->pd_state & NEWBLOCK) != 0) {
- LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
- if (wk->wk_type == D_NEWDIRBLK &&
- WK_NEWDIRBLK(wk)->db_pagedep ==
- pagedep)
- break;
- if (wk != NULL) {
- WORKLIST_REMOVE(wk);
- free_newdirblk(WK_NEWDIRBLK(wk));
- } else
- panic("deallocate_dependencies: "
- "lost pagedep");
+ newdirblk = pagedep->pd_newdirblk;
+ WORKLIST_REMOVE(&newdirblk->db_list);
+ free_newdirblk(newdirblk);
+ }
+ while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
+ != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jmvref->jm_list);
+ return (0);
}
WORKLIST_REMOVE(&pagedep->pd_list);
LIST_REMOVE(pagedep, pd_hash);
@@ -2487,7 +5277,8 @@ deallocate_dependencies(bp, inodedep)
continue;
case D_ALLOCINDIR:
- free_allocindir(WK_ALLOCINDIR(wk), inodedep);
+ aip = WK_ALLOCINDIR(wk);
+ cancel_allocindir(aip, inodedep, freeblks);
continue;
case D_ALLOCDIRECT:
@@ -2502,46 +5293,155 @@ deallocate_dependencies(bp, inodedep)
/* NOTREACHED */
}
}
+
+ return (1);
}
/*
- * Free an allocdirect. Generate a new freefrag work request if appropriate.
- * This routine must be called with splbio interrupts blocked.
+ * An allocdirect is being canceled due to a truncate. We must make sure
+ * the journal entry is released in concert with the blkfree that releases
+ * the storage. Completed journal entries must not be released until the
+ * space is no longer pointed to by the inode or in the bitmap.
*/
static void
-free_allocdirect(adphead, adp, delay)
+cancel_allocdirect(adphead, adp, freeblks, delay)
struct allocdirectlst *adphead;
struct allocdirect *adp;
+ struct freeblks *freeblks;
int delay;
{
+ struct freework *freework;
+ struct newblk *newblk;
+ struct worklist *wk;
+ ufs_lbn_t lbn;
+
+ TAILQ_REMOVE(adphead, adp, ad_next);
+ newblk = (struct newblk *)adp;
+ /*
+	 * If the journal hasn't been written, the jnewblk must be passed
+	 * to the call to ffs_blkfree() that reclaims the space.  We accomplish
+ * this by linking the journal dependency into the freework to be
+ * freed when freework_freeblock() is called. If the journal has
+ * been written we can simply reclaim the journal space when the
+ * freeblks work is complete.
+ */
+ if (newblk->nb_jnewblk == NULL) {
+ cancel_newblk(newblk, &freeblks->fb_jwork);
+ goto found;
+ }
+ lbn = newblk->nb_jnewblk->jn_lbn;
+ /*
+ * Find the correct freework structure so it releases the canceled
+ * journal when the bitmap is cleared. This preserves rollback
+ * until the allocation is reverted.
+ */
+ LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
+ freework = WK_FREEWORK(wk);
+ if (freework->fw_lbn != lbn)
+ continue;
+ cancel_newblk(newblk, &freework->fw_jwork);
+ goto found;
+ }
+ panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
+found:
+ if (delay)
+ WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
+ &newblk->nb_list);
+ else
+ free_newblk(newblk);
+ return;
+}
+
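+/*
+ * Cancel a newblk whose allocation is being reverted by a truncate.
+ * Unwritten journal dependencies are moved to wkhd so they are freed
+ * in concert with the bitmap write, and the newblk is stripped from
+ * its dependency and work lists.
+ */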
+static void
+cancel_newblk(newblk, wkhd)
+ struct newblk *newblk;
+ struct workhead *wkhd;
+{
+ struct indirdep *indirdep;
+ struct allocindir *aip;
+
+ while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
+ indirdep->ir_state &= ~ONDEPLIST;
+ LIST_REMOVE(indirdep, ir_next);
+ /*
+ * If an indirdep is not on the buf worklist we need to
+ * free it here as deallocate_dependencies() will never
+ * find it. These pointers were never visible on disk and
+ * can be discarded immediately.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
+ LIST_REMOVE(aip, ai_next);
+ cancel_newblk(&aip->ai_block, wkhd);
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * If this indirdep is not attached to a buf it was simply
+ * waiting on completion to clear completehd. free_indirdep()
+ * asserts that nothing is dangling.
+ */
+ if ((indirdep->ir_state & ONWORKLIST) == 0)
+ free_indirdep(indirdep);
+ }
+ if (newblk->nb_state & ONDEPLIST) {
+ newblk->nb_state &= ~ONDEPLIST;
+ LIST_REMOVE(newblk, nb_deps);
+ }
+ if (newblk->nb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&newblk->nb_list);
+ /*
+ * If the journal entry hasn't been written we hold onto the dep
+ * until it is safe to free along with the other journal work.
+ */
+ if (newblk->nb_jnewblk != NULL) {
+ cancel_jnewblk(newblk->nb_jnewblk, wkhd);
+ newblk->nb_jnewblk = NULL;
+ }
+ if (!LIST_EMPTY(&newblk->nb_jwork))
+ jwork_move(wkhd, &newblk->nb_jwork);
+}
+
+/*
+ * Free a newblk. Generate a new freefrag work request if appropriate.
+ * This must be called after the inode pointer and any direct block pointers
+ * are valid or fully removed via truncate or frag extension.
+ */
+static void
+free_newblk(newblk)
+ struct newblk *newblk;
+{
+ struct indirdep *indirdep;
struct newdirblk *newdirblk;
+ struct freefrag *freefrag;
struct worklist *wk;
mtx_assert(&lk, MA_OWNED);
- if ((adp->ad_state & DEPCOMPLETE) == 0)
- LIST_REMOVE(adp, ad_deps);
- TAILQ_REMOVE(adphead, adp, ad_next);
- if ((adp->ad_state & COMPLETE) == 0)
- WORKLIST_REMOVE(&adp->ad_list);
- if (adp->ad_freefrag != NULL) {
- if (delay)
- WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
- &adp->ad_freefrag->ff_list);
- else
- add_to_worklist(&adp->ad_freefrag->ff_list);
+ if (newblk->nb_state & ONDEPLIST)
+ LIST_REMOVE(newblk, nb_deps);
+ if (newblk->nb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&newblk->nb_list);
+ LIST_REMOVE(newblk, nb_hash);
+ if ((freefrag = newblk->nb_freefrag) != NULL) {
+ freefrag->ff_state |= COMPLETE;
+ if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(&freefrag->ff_list, 0);
}
- if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
+ if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
- if (!LIST_EMPTY(&adp->ad_newdirblk))
- panic("free_allocdirect: extra newdirblk");
- if (delay)
- WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
- &newdirblk->db_list);
- else
- free_newdirblk(newdirblk);
- }
- WORKITEM_FREE(adp, D_ALLOCDIRECT);
+ if (!LIST_EMPTY(&newblk->nb_newdirblk))
+ panic("free_newblk: extra newdirblk");
+ free_newdirblk(newdirblk);
+ }
+ while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
+ indirdep->ir_state |= DEPCOMPLETE;
+ indirdep_complete(indirdep);
+ }
+ KASSERT(newblk->nb_jnewblk == NULL,
+ ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
+ handle_jwork(&newblk->nb_jwork);
+ newblk->nb_list.wk_type = D_NEWBLK;
+ WORKITEM_FREE(newblk, D_NEWBLK);
}
/*
@@ -2554,6 +5454,7 @@ free_newdirblk(newdirblk)
{
struct pagedep *pagedep;
struct diradd *dap;
+ struct worklist *wk;
int i;
mtx_assert(&lk, MA_OWNED);
@@ -2571,17 +5472,25 @@ free_newdirblk(newdirblk)
pagedep->pd_state &= ~NEWBLOCK;
if ((pagedep->pd_state & ONWORKLIST) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
- free_diradd(dap);
+ free_diradd(dap, NULL);
/*
* If no dependencies remain, the pagedep will be freed.
*/
for (i = 0; i < DAHASHSZ; i++)
if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
break;
- if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
+ if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
+ LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
+ KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
+ ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
+ /* Should only ever be one item in the list. */
+ while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
+ }
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}
@@ -2608,6 +5517,7 @@ softdep_freefile(pvp, ino, mode)
freefile->fx_mode = mode;
freefile->fx_oldinum = ino;
freefile->fx_devvp = ip->i_devvp;
+ LIST_INIT(&freefile->fx_jwork);
if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
UFS_LOCK(ip->i_ump);
ip->i_fs->fs_pendinginodes += 1;
@@ -2618,11 +5528,34 @@ softdep_freefile(pvp, ino, mode)
* If the inodedep does not exist, then the zero'ed inode has
* been written to disk. If the allocated inode has never been
* written to disk, then the on-disk inode is zero'ed. In either
- * case we can free the file immediately.
+ * case we can free the file immediately. If the journal was
+	 * canceled before being written, the inode will never make it to
+	 * disk and we must send the canceled journal entries to
+	 * ffs_freefile() to be cleared in conjunction with the bitmap.
+	 * Any blocks waiting on the inode to write can be safely freed
+	 * here as it will never be written.
*/
ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
- check_inode_unwritten(inodedep)) {
+ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+ /*
+ * Remove this inode from the unlinked list and set
+ * GOINGAWAY as appropriate to indicate that this inode
+ * will never be written.
+ */
+ if (inodedep && inodedep->id_state & UNLINKED) {
+ /*
+ * Save the journal work to be freed with the bitmap
+ * before we clear UNLINKED. Otherwise it can be lost
+ * if the inode block is written.
+ */
+ handle_bufwait(inodedep, &freefile->fx_jwork);
+ clear_unlinked_inodedep(inodedep);
+ /* Re-acquire inodedep as we've dropped lk. */
+ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+ if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
+ inodedep->id_state |= GOINGAWAY;
+ }
+ if (inodedep == NULL || check_inode_unwritten(inodedep)) {
FREE_LOCK(&lk);
handle_workitem_freefile(freefile);
return;
@@ -2654,7 +5587,8 @@ check_inode_unwritten(inodedep)
{
mtx_assert(&lk, MA_OWNED);
- if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
+
+ if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
!LIST_EMPTY(&inodedep->id_pendinghd) ||
!LIST_EMPTY(&inodedep->id_bufwait) ||
!LIST_EMPTY(&inodedep->id_inowait) ||
@@ -2662,9 +5596,9 @@ check_inode_unwritten(inodedep)
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
+ inodedep->id_mkdiradd != NULL ||
inodedep->id_nlinkdelta != 0)
return (0);
-
/*
* Another process might be in initiate_write_inodeblock_ufs[12]
* trying to allocate memory without holding "Softdep Lock".
@@ -2673,9 +5607,11 @@ check_inode_unwritten(inodedep)
inodedep->id_savedino1 == NULL)
return (0);
+ if (inodedep->id_state & ONDEPLIST)
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_state &= ~ONDEPLIST;
inodedep->id_state |= ALLCOMPLETE;
- LIST_REMOVE(inodedep, id_deps);
- inodedep->id_buf = NULL;
+ inodedep->id_bmsafemap = NULL;
if (inodedep->id_state & ONWORKLIST)
WORKLIST_REMOVE(&inodedep->id_list);
if (inodedep->id_savedino1 != NULL) {
@@ -2696,17 +5632,23 @@ free_inodedep(inodedep)
{
mtx_assert(&lk, MA_OWNED);
- if ((inodedep->id_state & ONWORKLIST) != 0 ||
+ if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
+ !LIST_EMPTY(&inodedep->id_dirremhd) ||
!LIST_EMPTY(&inodedep->id_pendinghd) ||
!LIST_EMPTY(&inodedep->id_bufwait) ||
!LIST_EMPTY(&inodedep->id_inowait) ||
+ !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
- inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
+ inodedep->id_mkdiradd != NULL ||
+ inodedep->id_nlinkdelta != 0 ||
+ inodedep->id_savedino1 != NULL)
return (0);
+ if (inodedep->id_state & ONDEPLIST)
+ LIST_REMOVE(inodedep, id_deps);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
@@ -2714,6 +5656,126 @@ free_inodedep(inodedep)
}
/*
+ * Free the block referenced by a freework structure. The parent freeblks
+ * structure is released and completed when the final cg bitmap reaches
+ * the disk. This routine may be freeing a jnewblk which never made it to
+ * disk in which case we do not have to wait as the operation is undone
+ * in memory immediately.
+ */
+static void
+freework_freeblock(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct ufsmount *ump;
+ struct workhead wkhd;
+ struct fs *fs;
+ int complete;
+ int pending;
+ int bsize;
+ int needj;
+
+ freeblks = freework->fw_freeblks;
+ ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+ fs = ump->um_fs;
+ needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
+ complete = 0;
+ LIST_INIT(&wkhd);
+ /*
+ * If we are canceling an existing jnewblk pass it to the free
+ * routine, otherwise pass the freeblk which will ultimately
+ * release the freeblks. If we're not journaling, we can just
+ * free the freeblks immediately.
+ */
+ if (!LIST_EMPTY(&freework->fw_jwork)) {
+ LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
+ complete = 1;
+ } else if (needj)
+ WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
+ bsize = lfragtosize(fs, freework->fw_frags);
+ pending = btodb(bsize);
+ ACQUIRE_LOCK(&lk);
+ freeblks->fb_chkcnt -= pending;
+ FREE_LOCK(&lk);
+ /*
+ * extattr blocks don't show up in pending blocks. XXX why?
+ */
+ if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks -= pending;
+ UFS_UNLOCK(ump);
+ }
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
+ bsize, freeblks->fb_previousinum, &wkhd);
+ if (complete == 0 && needj)
+ return;
+ /*
+ * The jnewblk will be discarded and the bits in the map never
+ * made it to disk. We can immediately free the freeblk.
+ */
+ ACQUIRE_LOCK(&lk);
+ handle_written_freework(freework);
+ FREE_LOCK(&lk);
+}
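The LIST_SWAP() at the top of freework_freeblock() hands the entire fw_jwork
list to the local head in O(1) so the canceled journal records travel with the
ffs_blkfree() call. A minimal userland sketch of that handoff using the same
queue(3) macros (the item type here is invented for illustration):

    #include <stdio.h>
    #include <sys/queue.h>  /* queue(3); LIST_SWAP is a FreeBSD extension */

    struct item {
        LIST_ENTRY(item) links;
        int id;
    };
    LIST_HEAD(itemhead, item);

    int
    main(void)
    {
        struct itemhead jwork, wkhd;
        struct item a = { .id = 1 };
        struct item *ip;

        LIST_INIT(&jwork);
        LIST_INIT(&wkhd);
        LIST_INSERT_HEAD(&jwork, &a, links);
        /* Take over the whole pending list in constant time. */
        LIST_SWAP(&wkhd, &jwork, item, links);
        LIST_FOREACH(ip, &wkhd, links)
            printf("moved item %d\n", ip->id);
        printf("source now empty: %d\n", LIST_EMPTY(&jwork));
        return (0);
    }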
+
+/*
+ * Start, continue, or finish the process of freeing an indirect block tree.
+ * The free operation may be paused at any point with fw_off containing the
+ * offset to restart from. This enables us to implement some flow control
+ * for large truncates which may fan out and generate a huge number of
+ * dependencies.
+ */
+static void
+handle_workitem_indirblk(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct ufsmount *ump;
+ struct fs *fs;
+
+ freeblks = freework->fw_freeblks;
+ ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+ fs = ump->um_fs;
+ if (freework->fw_off == NINDIR(fs))
+ freework_freeblock(freework);
+ else
+ indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
+ freework->fw_lbn);
+}
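fw_off is what lets handle_workitem_indirblk() tell a finished indirect
(offset at NINDIR(fs)) from one whose scan should resume. A toy resumable
scan under the same convention (NINDIR fixed at a small constant, names
invented for the sketch):

    #include <stdio.h>

    #define NINDIR  8   /* pointers per indirect block; toy value */

    struct freework { int fw_off; };

    /* Free up to 'budget' entries, remembering where to resume. */
    static int
    indir_scan(struct freework *fw, int budget)
    {
        while (fw->fw_off < NINDIR && budget-- > 0)
            printf("freeing entry %d\n", fw->fw_off++);
        return (fw->fw_off == NINDIR); /* done: free the indirect itself */
    }

    int
    main(void)
    {
        struct freework fw = { 0 };

        while (!indir_scan(&fw, 3))
            printf("paused at offset %d\n", fw.fw_off);
        return (0);
    }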
+
+/*
+ * Called when a freework structure attached to a cg buf is written. The
+ * ref on either the parent or the freeblks structure is released and
+ * either may be added to the worklist if it is the final ref.
+ */
+static void
+handle_written_freework(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct freework *parent;
+
+ freeblks = freework->fw_freeblks;
+ parent = freework->fw_parent;
+ if (parent) {
+ if (--parent->fw_ref != 0)
+ parent = NULL;
+ freeblks = NULL;
+ } else if (--freeblks->fb_ref != 0)
+ freeblks = NULL;
+ WORKITEM_FREE(freework, D_FREEWORK);
+ /*
+ * Don't delay these block frees or it takes an intolerable amount
+ * of time to process truncates and free their journal entries.
+ */
+ if (freeblks)
+ add_to_worklist(&freeblks->fb_list, 1);
+ if (parent)
+ add_to_worklist(&parent->fw_list, 1);
+}
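The reference handoff above is small enough to model in isolation: every
completed child drops one reference on its parent (or on the freeblks when
there is no parent), and only the dropper of the last reference re-queues
the owner. A sketch with hypothetical types:

    #include <stdio.h>

    struct owner {
        int ref;
        const char *name;
    };

    /* Drop one reference; the last dropper queues the owner. */
    static void
    drop_ref(struct owner *o)
    {
        if (--o->ref == 0)
            printf("%s: final ref, adding to worklist\n", o->name);
    }

    int
    main(void)
    {
        struct owner parent = { 3, "parent freework" };
        int i;

        for (i = 0; i < 3; i++) /* three children written */
            drop_ref(&parent);
        return (0);
    }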
+
+/*
* This workitem routine performs the block de-allocation.
* The workitem is added to the pending list after the updated
* inode block has been written to disk. As mentioned above,
@@ -2726,99 +5788,79 @@ handle_workitem_freeblocks(freeblks, flags)
struct freeblks *freeblks;
int flags;
{
+ struct freework *freework;
+ struct worklist *wk;
+
+ KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
+ ("handle_workitem_freeblocks: Journal entries not written."));
+ if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
+ handle_complete_freeblocks(freeblks);
+ return;
+ }
+ freeblks->fb_ref++;
+ while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
+ KASSERT(wk->wk_type == D_FREEWORK,
+ ("handle_workitem_freeblocks: Unknown type %s",
+ TYPENAME(wk->wk_type)));
+ WORKLIST_REMOVE_UNLOCKED(wk);
+ freework = WK_FREEWORK(wk);
+ if (freework->fw_lbn <= -NDADDR)
+ handle_workitem_indirblk(freework);
+ else
+ freework_freeblock(freework);
+ }
+ ACQUIRE_LOCK(&lk);
+ if (--freeblks->fb_ref != 0)
+ freeblks = NULL;
+ FREE_LOCK(&lk);
+ if (freeblks)
+ handle_complete_freeblocks(freeblks);
+}
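The dispatch above keys off the sign conventions UFS uses for logical block
numbers: non-negative lbns are data, lbns at or below -NDADDR name indirect
blocks, and the small negative range in between carries the extended-attribute
blocks that the accounting test in freework_freeblock() excludes from the
pending count. A toy classifier under those assumptions:

    #include <stdio.h>

    #define NDADDR  12  /* direct pointers per inode (UFS value) */

    static const char *
    classify(long lbn)
    {
        if (lbn >= 0)
            return ("data block");
        if (lbn <= -NDADDR)
            return ("indirect block");
        return ("extended-attribute range");
    }

    int
    main(void)
    {
        long lbns[] = { 0, 11, -1, -NDADDR, -150 };
        size_t i;

        for (i = 0; i < sizeof(lbns) / sizeof(lbns[0]); i++)
            printf("%ld: %s\n", lbns[i], classify(lbns[i]));
        return (0);
    }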
+
+/*
+ * Once all of the freework workitems are complete we can retire the
+ * freeblocks dependency and any journal work awaiting completion. This
+ * cannot be called until all other dependencies are stable on disk.
+ */
+static void
+handle_complete_freeblocks(freeblks)
+ struct freeblks *freeblks;
+{
struct inode *ip;
struct vnode *vp;
struct fs *fs;
struct ufsmount *ump;
- int i, nblocks, level, bsize;
- ufs2_daddr_t bn, blocksreleased = 0;
- int error, allerror = 0;
- ufs_lbn_t baselbns[NIADDR], tmpval;
- int fs_pendingblocks;
+ int flags;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
fs = ump->um_fs;
- fs_pendingblocks = 0;
- tmpval = 1;
- baselbns[0] = NDADDR;
- for (i = 1; i < NIADDR; i++) {
- tmpval *= NINDIR(fs);
- baselbns[i] = baselbns[i - 1] + tmpval;
- }
- nblocks = btodb(fs->fs_bsize);
- blocksreleased = 0;
- /*
- * Release all extended attribute blocks or frags.
- */
- if (freeblks->fb_oldextsize > 0) {
- for (i = (NXADDR - 1); i >= 0; i--) {
- if ((bn = freeblks->fb_eblks[i]) == 0)
- continue;
- bsize = sblksize(fs, freeblks->fb_oldextsize, i);
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
- freeblks->fb_previousinum);
- blocksreleased += btodb(bsize);
- }
- }
- /*
- * Release all data blocks or frags.
- */
- if (freeblks->fb_oldsize > 0) {
- /*
- * Indirect blocks first.
- */
- for (level = (NIADDR - 1); level >= 0; level--) {
- if ((bn = freeblks->fb_iblks[level]) == 0)
- continue;
- if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
- level, baselbns[level], &blocksreleased)) != 0)
- allerror = error;
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
- fs->fs_bsize, freeblks->fb_previousinum);
- fs_pendingblocks += nblocks;
- blocksreleased += nblocks;
- }
- /*
- * All direct blocks or frags.
- */
- for (i = (NDADDR - 1); i >= 0; i--) {
- if ((bn = freeblks->fb_dblks[i]) == 0)
- continue;
- bsize = sblksize(fs, freeblks->fb_oldsize, i);
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
- freeblks->fb_previousinum);
- fs_pendingblocks += btodb(bsize);
- blocksreleased += btodb(bsize);
- }
- }
- UFS_LOCK(ump);
- fs->fs_pendingblocks -= fs_pendingblocks;
- UFS_UNLOCK(ump);
+ flags = LK_NOWAIT;
+
/*
* If we still have not finished background cleanup, then check
* to see if the block count needs to be adjusted.
*/
- if (freeblks->fb_chkcnt != blocksreleased &&
- (fs->fs_flags & FS_UNCLEAN) != 0 &&
+ if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
- (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)
- == 0) {
+ (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
ip = VTOI(vp);
- DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
- freeblks->fb_chkcnt - blocksreleased);
+ DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
ip->i_flag |= IN_CHANGE;
vput(vp);
}
#ifdef INVARIANTS
- if (freeblks->fb_chkcnt != blocksreleased &&
+ if (freeblks->fb_chkcnt != 0 &&
((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
printf("handle_workitem_freeblocks: block count\n");
- if (allerror)
- softdep_error("handle_workitem_freeblks", allerror);
#endif /* INVARIANTS */
ACQUIRE_LOCK(&lk);
+ /*
+ * All of the freeblock deps must be complete prior to this call
+ * so it's now safe to complete earlier outstanding journal entries.
+ */
+ handle_jwork(&freeblks->fb_jwork);
WORKITEM_FREE(freeblks, D_FREEBLKS);
num_freeblkdep--;
FREE_LOCK(&lk);
@@ -2830,29 +5872,42 @@ handle_workitem_freeblocks(freeblks, flags)
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*/
-static int
-indir_trunc(freeblks, dbn, level, lbn, countp)
- struct freeblks *freeblks;
+static void
+indir_trunc(freework, dbn, lbn)
+ struct freework *freework;
ufs2_daddr_t dbn;
- int level;
ufs_lbn_t lbn;
- ufs2_daddr_t *countp;
{
+ struct freework *nfreework;
+ struct workhead wkhd;
+ struct jnewblk *jnewblk;
+ struct freeblks *freeblks;
struct buf *bp;
struct fs *fs;
+ struct worklist *wkn;
struct worklist *wk;
struct indirdep *indirdep;
struct ufsmount *ump;
ufs1_daddr_t *bap1 = 0;
- ufs2_daddr_t nb, *bap2 = 0;
+ ufs2_daddr_t nb, nnb, *bap2 = 0;
ufs_lbn_t lbnadd;
int i, nblocks, ufs1fmt;
- int error, allerror = 0;
int fs_pendingblocks;
+ int freedeps;
+ int needj;
+ int level;
+ int cnt;
+ LIST_INIT(&wkhd);
+ level = lbn_level(lbn);
+ if (level == -1)
+ panic("indir_trunc: Invalid lbn %jd\n", lbn);
+ freeblks = freework->fw_freeblks;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
fs = ump->um_fs;
fs_pendingblocks = 0;
+ freedeps = 0;
+ needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
lbnadd = 1;
for (i = level; i > 0; i--)
lbnadd *= NINDIR(fs);
@@ -2877,13 +5932,14 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
ACQUIRE_LOCK(&lk);
if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
if (wk->wk_type != D_INDIRDEP ||
- (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
- (indirdep->ir_state & GOINGAWAY) == 0)
- panic("indir_trunc: lost indirdep");
- WORKLIST_REMOVE(wk);
- WORKITEM_FREE(indirdep, D_INDIRDEP);
+ (wk->wk_state & GOINGAWAY) == 0)
+ panic("indir_trunc: lost indirdep %p", wk);
+ indirdep = WK_INDIRDEP(wk);
+ LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
+ free_indirdep(indirdep);
if (!LIST_EMPTY(&bp->b_dep))
- panic("indir_trunc: dangling dep");
+ panic("indir_trunc: dangling dep %p",
+ LIST_FIRST(&bp->b_dep));
ump->um_numindirdeps -= 1;
FREE_LOCK(&lk);
} else {
@@ -2892,11 +5948,10 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
brelse(bp);
#endif
FREE_LOCK(&lk);
- error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
- NOCRED, &bp);
- if (error) {
+ if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
+ NOCRED, &bp) != 0) {
brelse(bp);
- return (error);
+ return;
}
}
/*
@@ -2909,57 +5964,264 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
ufs1fmt = 0;
bap2 = (ufs2_daddr_t *)bp->b_data;
}
- nblocks = btodb(fs->fs_bsize);
- for (i = NINDIR(fs) - 1; i >= 0; i--) {
- if (ufs1fmt)
- nb = bap1[i];
+ /*
+ * Reclaim indirect blocks which never made it to disk.
+ */
+ cnt = 0;
+ LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
+ struct workhead freewk;
+ if (wk->wk_type != D_JNEWBLK)
+ continue;
+ WORKLIST_REMOVE_UNLOCKED(wk);
+ LIST_INIT(&freewk);
+ WORKLIST_INSERT_UNLOCKED(&freewk, wk);
+ jnewblk = WK_JNEWBLK(wk);
+ if (jnewblk->jn_lbn > 0)
+ i = (jnewblk->jn_lbn - -lbn) / lbnadd;
else
+ i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd;
+ KASSERT(i >= 0 && i < NINDIR(fs),
+ ("indir_trunc: Index out of range %d parent %jd lbn %jd",
+ i, lbn, jnewblk->jn_lbn));
+ /* Clear the pointer so it isn't found below. */
+ if (ufs1fmt) {
+ nb = bap1[i];
+ bap1[i] = 0;
+ } else {
nb = bap2[i];
+ bap2[i] = 0;
+ }
+ KASSERT(nb == jnewblk->jn_blkno,
+ ("indir_trunc: Block mismatch %jd != %jd",
+ nb, jnewblk->jn_blkno));
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno,
+ fs->fs_bsize, freeblks->fb_previousinum, &freewk);
+ cnt++;
+ }
+ ACQUIRE_LOCK(&lk);
+ if (needj)
+ freework->fw_ref += NINDIR(fs) + 1;
+ /* Any remaining journal work can be completed with freeblks. */
+ jwork_move(&freeblks->fb_jwork, &wkhd);
+ FREE_LOCK(&lk);
+ nblocks = btodb(fs->fs_bsize);
+ if (ufs1fmt)
+ nb = bap1[0];
+ else
+ nb = bap2[0];
+ nfreework = freework;
+ /*
+ * Reclaim on disk blocks.
+ */
+ for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
+ if (i != NINDIR(fs) - 1) {
+ if (ufs1fmt)
+ nnb = bap1[i+1];
+ else
+ nnb = bap2[i+1];
+ } else
+ nnb = 0;
if (nb == 0)
continue;
+ cnt++;
if (level != 0) {
- if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
- level - 1, lbn + (i * lbnadd), countp)) != 0)
- allerror = error;
+ ufs_lbn_t nlbn;
+
+ nlbn = (lbn + 1) - (i * lbnadd);
+ if (needj != 0) {
+ nfreework = newfreework(freeblks, freework,
+ nlbn, nb, fs->fs_frag, 0);
+ freedeps++;
+ }
+ indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
+ } else {
+ struct freedep *freedep;
+
+ /*
+ * Attempt to aggregate freedep dependencies for
+ * all blocks being released to the same CG.
+ */
+ LIST_INIT(&wkhd);
+ if (needj != 0 &&
+ (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
+ freedep = newfreedep(freework);
+ WORKLIST_INSERT_UNLOCKED(&wkhd,
+ &freedep->fd_list);
+ freedeps++;
+ }
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
+ fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
}
- ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
- freeblks->fb_previousinum);
+ }
+ if (level == 0)
+ fs_pendingblocks = (nblocks * cnt);
+ /*
+ * If we're not journaling we can free the indirect now. Otherwise
+ * setup the ref counts and offset so this indirect can be completed
+ * when its children are free.
+ */
+ if (needj == 0) {
fs_pendingblocks += nblocks;
- *countp += nblocks;
+ dbn = dbtofsb(fs, dbn);
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
+ freeblks->fb_previousinum, NULL);
+ ACQUIRE_LOCK(&lk);
+ freeblks->fb_chkcnt -= fs_pendingblocks;
+ if (freework->fw_blkno == dbn)
+ handle_written_freework(freework);
+ FREE_LOCK(&lk);
+ freework = NULL;
+ } else {
+ ACQUIRE_LOCK(&lk);
+ freework->fw_off = i;
+ freework->fw_ref += freedeps;
+ freework->fw_ref -= NINDIR(fs) + 1;
+ if (freework->fw_ref != 0)
+ freework = NULL;
+ freeblks->fb_chkcnt -= fs_pendingblocks;
+ FREE_LOCK(&lk);
+ }
+ if (fs_pendingblocks) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks -= fs_pendingblocks;
+ UFS_UNLOCK(ump);
}
- UFS_LOCK(ump);
- fs->fs_pendingblocks -= fs_pendingblocks;
- UFS_UNLOCK(ump);
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
- return (allerror);
+ if (freework)
+ handle_workitem_indirblk(freework);
+ return;
}
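The level/lbnadd arithmetic that indir_trunc() relies on is the fan-out of
the indirect tree: each pointer in a level-L indirect block spans
NINDIR(fs)^L data blocks, which is exactly what the small loop computing
lbnadd produces. A standalone check of that relation (NINDIR fixed here;
the kernel derives it from the superblock):

    #include <stdio.h>

    #define NINDIR  2048    /* 16K blocks / 8-byte UFS2 pointers */

    int
    main(void)
    {
        long lbnadd;
        int level, i;

        for (level = 0; level < 3; level++) {
            /* Same loop shape as indir_trunc(). */
            lbnadd = 1;
            for (i = level; i > 0; i--)
                lbnadd *= NINDIR;
            printf("level %d: each pointer spans %ld data blocks\n",
                level, lbnadd);
        }
        return (0);
    }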
/*
- * Free an allocindir.
- * This routine must be called with splbio interrupts blocked.
+ * Cancel an allocindir when it is removed via truncation.
*/
static void
-free_allocindir(aip, inodedep)
+cancel_allocindir(aip, inodedep, freeblks)
struct allocindir *aip;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
{
- struct freefrag *freefrag;
+ struct newblk *newblk;
- mtx_assert(&lk, MA_OWNED);
- if ((aip->ai_state & DEPCOMPLETE) == 0)
- LIST_REMOVE(aip, ai_deps);
- if (aip->ai_state & ONWORKLIST)
- WORKLIST_REMOVE(&aip->ai_list);
+ /*
+	 * If the journal hasn't been written, the jnewblk must be passed
+	 * to the call to ffs_blkfree() that reclaims the space.  We accomplish
+ * this by linking the journal dependency into the indirdep to be
+ * freed when indir_trunc() is called. If the journal has already
+ * been written we can simply reclaim the journal space when the
+ * freeblks work is complete.
+ */
LIST_REMOVE(aip, ai_next);
- if ((freefrag = aip->ai_freefrag) != NULL) {
+ newblk = (struct newblk *)aip;
+ if (newblk->nb_jnewblk == NULL)
+ cancel_newblk(newblk, &freeblks->fb_jwork);
+ else
+ cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
+ if (inodedep && inodedep->id_state & DEPCOMPLETE)
+ WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
+ else
+ free_newblk(newblk);
+}
+
+/*
+ * Create the mkdir dependencies for . and .. in a new directory. Link them
+ * in to a newdirblk so any subsequent additions are tracked properly. The
+ * caller is responsible for adding the mkdir1 dependency to the journal
+ * and updating id_mkdiradd. This function returns with lk held.
+ */
+static struct mkdir *
+setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
+ struct diradd *dap;
+ ino_t newinum;
+ ino_t dinum;
+ struct buf *newdirbp;
+ struct mkdir **mkdirp;
+{
+ struct newblk *newblk;
+ struct pagedep *pagedep;
+ struct inodedep *inodedep;
+ struct newdirblk *newdirblk = 0;
+ struct mkdir *mkdir1, *mkdir2;
+ struct worklist *wk;
+ struct jaddref *jaddref;
+ struct mount *mp;
+
+ mp = dap->da_list.wk_mp;
+ newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
+ M_SOFTDEP_FLAGS);
+ workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
+ LIST_INIT(&newdirblk->db_mkdir);
+ mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
+ workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
+ mkdir1->md_state = ATTACHED | MKDIR_BODY;
+ mkdir1->md_diradd = dap;
+ mkdir1->md_jaddref = NULL;
+ mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
+ workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
+ mkdir2->md_state = ATTACHED | MKDIR_PARENT;
+ mkdir2->md_diradd = dap;
+ mkdir2->md_jaddref = NULL;
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
+ mkdir1->md_state |= DEPCOMPLETE;
+ mkdir2->md_state |= DEPCOMPLETE;
+ }
+ /*
+ * Dependency on "." and ".." being written to disk.
+ */
+ mkdir1->md_buf = newdirbp;
+ ACQUIRE_LOCK(&lk);
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
+ /*
+ * We must link the pagedep, allocdirect, and newdirblk for
+ * the initial file page so the pointer to the new directory
+ * is not written until the directory contents are live and
+ * any subsequent additions are not marked live until the
+ * block is reachable via the inode.
+ */
+ if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
+ panic("setup_newdir: lost pagedep");
+ LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
+ if (wk->wk_type == D_ALLOCDIRECT)
+ break;
+ if (wk == NULL)
+ panic("setup_newdir: lost allocdirect");
+ newblk = WK_NEWBLK(wk);
+ pagedep->pd_state |= NEWBLOCK;
+ pagedep->pd_newdirblk = newdirblk;
+ newdirblk->db_pagedep = pagedep;
+ WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
+ WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
+ /*
+ * Look up the inodedep for the parent directory so that we
+ * can link mkdir2 into the pending dotdot jaddref or
+ * the inode write if there is none. If the inode is
+	 * ALLCOMPLETE and no jaddref is present, all dependencies have
+ * been satisfied and mkdir2 can be freed.
+ */
+ inodedep_lookup(mp, dinum, 0, &inodedep);
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
if (inodedep == NULL)
- add_to_worklist(&freefrag->ff_list);
- else
- WORKLIST_INSERT(&inodedep->id_bufwait,
- &freefrag->ff_list);
+ panic("setup_newdir: Lost parent.");
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
+ (jaddref->ja_state & MKDIR_PARENT),
+ ("setup_newdir: bad dotdot jaddref %p", jaddref));
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
+ mkdir2->md_jaddref = jaddref;
+ jaddref->ja_mkdir = mkdir2;
+ } else if (inodedep == NULL ||
+ (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ dap->da_state &= ~MKDIR_PARENT;
+ WORKITEM_FREE(mkdir2, D_MKDIR);
+ } else {
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
+ WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
- WORKITEM_FREE(aip, D_ALLOCINDIR);
+ *mkdirp = mkdir2;
+
+ return (mkdir1);
}
/*
@@ -2998,12 +6260,14 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
ufs_lbn_t lbn; /* block in directory containing new entry */
struct fs *fs;
struct diradd *dap;
- struct allocdirect *adp;
+ struct newblk *newblk;
struct pagedep *pagedep;
struct inodedep *inodedep;
struct newdirblk *newdirblk = 0;
struct mkdir *mkdir1, *mkdir2;
+ struct jaddref *jaddref;
struct mount *mp;
+ int isindir;
/*
* Whiteouts have no dependencies.
@@ -3013,6 +6277,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
bdwrite(newdirbp);
return (0);
}
+ jaddref = NULL;
+ mkdir1 = mkdir2 = NULL;
mp = UFSTOVFS(dp->i_ump);
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
@@ -3023,111 +6289,123 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
dap->da_offset = offset;
dap->da_newinum = newinum;
dap->da_state = ATTACHED;
- if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
+ LIST_INIT(&dap->da_jwork);
+ isindir = bp->b_lblkno >= NDADDR;
+ if (isnewblk &&
+ (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
newdirblk = malloc(sizeof(struct newdirblk),
M_NEWDIRBLK, M_SOFTDEP_FLAGS);
workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
+ LIST_INIT(&newdirblk->db_mkdir);
}
+ /*
+	 * If we're creating a new directory, set up the dependencies and set
+ * the dap state to wait for them. Otherwise it's COMPLETE and
+ * we can move on.
+ */
if (newdirbp == NULL) {
dap->da_state |= DEPCOMPLETE;
ACQUIRE_LOCK(&lk);
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
- mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR,
- M_SOFTDEP_FLAGS);
- workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
- mkdir1->md_state = MKDIR_BODY;
- mkdir1->md_diradd = dap;
- mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR,
- M_SOFTDEP_FLAGS);
- workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
- mkdir2->md_state = MKDIR_PARENT;
- mkdir2->md_diradd = dap;
- /*
- * Dependency on "." and ".." being written to disk.
- */
- mkdir1->md_buf = newdirbp;
- ACQUIRE_LOCK(&lk);
- LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
- WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
- FREE_LOCK(&lk);
- bdwrite(newdirbp);
- /*
- * Dependency on link count increase for parent directory
- */
- ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
- || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
- dap->da_state &= ~MKDIR_PARENT;
- WORKITEM_FREE(mkdir2, D_MKDIR);
- } else {
- LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
- WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
- }
+ mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
+ &mkdir2);
}
/*
* Link into parent directory pagedep to await its being written.
*/
- if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
+ if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
+#ifdef DEBUG
+ if (diradd_lookup(pagedep, offset) != NULL)
+ panic("softdep_setup_directory_add: %p already at off %d\n",
+ diradd_lookup(pagedep, offset), offset);
+#endif
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
+ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
/*
- * Link into its inodedep. Put it on the id_bufwait list if the inode
- * is not yet written. If it is written, do the post-inode write
- * processing to put it on the id_pendinghd list.
+ * If we're journaling, link the diradd into the jaddref so it
+ * may be completed after the journal entry is written. Otherwise,
+ * link the diradd into its inodedep. If the inode is not yet
+	 * written, place it on the bufwait list, otherwise do the post-inode
+ * write processing to put it on the id_pendinghd list.
*/
- (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
- if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_directory_add: bad jaddref %p", jaddref));
+ jaddref->ja_diroff = diroffset;
+ jaddref->ja_diradd = dap;
+ add_to_journal(&jaddref->ja_list);
+ } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
- if (isnewblk) {
+ /*
+ * Add the journal entries for . and .. links now that the primary
+ * link is written.
+ */
+ if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
+ inoreflst, if_deps);
+ KASSERT(jaddref != NULL &&
+ jaddref->ja_ino == jaddref->ja_parent &&
+ (jaddref->ja_state & MKDIR_BODY),
+ ("softdep_setup_directory_add: bad dot jaddref %p",
+ jaddref));
+ mkdir1->md_jaddref = jaddref;
+ jaddref->ja_mkdir = mkdir1;
/*
- * Directories growing into indirect blocks are rare
- * enough and the frequency of new block allocation
- * in those cases even more rare, that we choose not
- * to bother tracking them. Rather we simply force the
- * new directory entry to disk.
+ * It is important that the dotdot journal entry
+ * is added prior to the dot entry since dot writes
+ * both the dot and dotdot links. These both must
+ * be added after the primary link for the journal
+ * to remain consistent.
*/
- if (lbn >= NDADDR) {
- FREE_LOCK(&lk);
- /*
- * We only have a new allocation when at the
- * beginning of a new block, not when we are
- * expanding into an existing block.
- */
- if (blkoff(fs, diroffset) == 0)
- return (1);
- return (0);
- }
+ add_to_journal(&mkdir2->md_jaddref->ja_list);
+ add_to_journal(&jaddref->ja_list);
+ }
+ /*
+	 * If we are adding a new directory, remember this diradd so that if
+ * we rename it we can keep the dot and dotdot dependencies. If
+ * we are adding a new name for an inode that has a mkdiradd we
+ * must be in rename and we have to move the dot and dotdot
+ * dependencies to this new name. The old name is being orphaned
+ * soon.
+ */
+ if (mkdir1 != NULL) {
+ if (inodedep->id_mkdiradd != NULL)
+ panic("softdep_setup_directory_add: Existing mkdir");
+ inodedep->id_mkdiradd = dap;
+ } else if (inodedep->id_mkdiradd)
+ merge_diradd(inodedep, dap);
+ if (newdirblk) {
/*
- * We only have a new allocation when at the beginning
- * of a new fragment, not when we are expanding into an
- * existing fragment. Also, there is nothing to do if we
- * are already tracking this block.
+ * There is nothing to do if we are already tracking
+ * this block.
*/
- if (fragoff(fs, diroffset) != 0) {
- FREE_LOCK(&lk);
- return (0);
- }
if ((pagedep->pd_state & NEWBLOCK) != 0) {
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
FREE_LOCK(&lk);
return (0);
}
- /*
- * Find our associated allocdirect and have it track us.
- */
- if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
- panic("softdep_setup_directory_add: lost inodedep");
- adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
- if (adp == NULL || adp->ad_lbn != lbn)
+ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
+ == 0)
panic("softdep_setup_directory_add: lost entry");
+ WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
pagedep->pd_state |= NEWBLOCK;
+ pagedep->pd_newdirblk = newdirblk;
newdirblk->db_pagedep = pagedep;
- WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
+ FREE_LOCK(&lk);
+ /*
+		 * If we extended into an indirect, signal direnter to sync.
+ */
+ if (isindir)
+ return (1);
+ return (0);
}
FREE_LOCK(&lk);
return (0);
@@ -3141,7 +6419,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
* occur while the move is in progress.
*/
void
-softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
+softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
+ struct buf *bp; /* Buffer holding directory block. */
struct inode *dp; /* inode for directory */
caddr_t base; /* address of dp->i_offset */
caddr_t oldloc; /* address of old directory location */
@@ -3150,40 +6429,204 @@ softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
{
int offset, oldoffset, newoffset;
struct pagedep *pagedep;
+ struct jmvref *jmvref;
struct diradd *dap;
+ struct direct *de;
+ struct mount *mp;
ufs_lbn_t lbn;
+ int flags;
- ACQUIRE_LOCK(&lk);
+ mp = UFSTOVFS(dp->i_ump);
+ de = (struct direct *)oldloc;
+ jmvref = NULL;
+ flags = 0;
+ /*
+ * Moves are always journaled as it would be too complex to
+ * determine if any affected adds or removes are present in the
+ * journal.
+ */
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ flags = DEPALLOC;
+ jmvref = newjmvref(dp, de->d_ino,
+ dp->i_offset + (oldloc - base),
+ dp->i_offset + (newloc - base));
+ }
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
- if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
- goto done;
oldoffset = offset + (oldloc - base);
newoffset = offset + (newloc - base);
-
- LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
- if (dap->da_offset != oldoffset)
- continue;
+ ACQUIRE_LOCK(&lk);
+ if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
+ if (pagedep)
+ WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
+ goto done;
+ }
+ dap = diradd_lookup(pagedep, oldoffset);
+ if (dap) {
dap->da_offset = newoffset;
- if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
- break;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
- dap, da_pdlist);
- break;
+ newoffset = DIRADDHASH(newoffset);
+ oldoffset = DIRADDHASH(oldoffset);
+ if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
+ newoffset != oldoffset) {
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
+ dap, da_pdlist);
+ }
}
- if (dap == NULL) {
+done:
+ if (jmvref) {
+ jmvref->jm_pagedep = pagedep;
+ LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
+ add_to_journal(&jmvref->jm_list);
+ }
+ bcopy(oldloc, newloc, entrysize);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Move the mkdir dependencies and journal work from one diradd to another
+ * when renaming a directory. The new name must depend on the mkdir deps
+ * completing as the old name did. Directories can only have one valid link
+ * at a time so one must be canonical.
+ */
+static void
+merge_diradd(inodedep, newdap)
+ struct inodedep *inodedep;
+ struct diradd *newdap;
+{
+ struct diradd *olddap;
+ struct mkdir *mkdir, *nextmd;
+ short state;
- LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
- if (dap->da_offset == oldoffset) {
- dap->da_offset = newoffset;
+ olddap = inodedep->id_mkdiradd;
+ inodedep->id_mkdiradd = newdap;
+ if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
+ newdap->da_state &= ~DEPCOMPLETE;
+ for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
+ nextmd = LIST_NEXT(mkdir, md_mkdirs);
+ if (mkdir->md_diradd != olddap)
+ continue;
+ mkdir->md_diradd = newdap;
+ state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
+ newdap->da_state |= state;
+ olddap->da_state &= ~state;
+ if ((olddap->da_state &
+ (MKDIR_PARENT | MKDIR_BODY)) == 0)
break;
+ }
+ if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
+ panic("merge_diradd: unfound ref");
+ }
+ /*
+ * Any mkdir related journal items are not safe to be freed until
+ * the new name is stable.
+ */
+ jwork_move(&newdap->da_jwork, &olddap->da_jwork);
+ olddap->da_state |= DEPCOMPLETE;
+ complete_diradd(olddap);
+}
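The loop above moves each mkdir's MKDIR_PARENT or MKDIR_BODY bit from the
old diradd to the new one until the old name holds neither. Reduced to the
mask operations alone (the flag values below are invented, not the kernel's):

    #include <assert.h>
    #include <stdio.h>

    #define MKDIR_PARENT    0x01    /* toy values, not the kernel's */
    #define MKDIR_BODY      0x02

    int
    main(void)
    {
        short oldstate = MKDIR_PARENT | MKDIR_BODY;
        short newstate = 0;
        short mdstate[] = { MKDIR_BODY, MKDIR_PARENT }; /* the two mkdirs */
        short state;
        size_t i;

        for (i = 0; i < 2; i++) {
            /* Transfer this mkdir's bit from the old name to the new. */
            state = mdstate[i] & (MKDIR_PARENT | MKDIR_BODY);
            newstate |= state;
            oldstate &= ~state;
        }
        assert((oldstate & (MKDIR_PARENT | MKDIR_BODY)) == 0);
        printf("new diradd now waits on 0x%x\n", newstate);
        return (0);
    }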
+
+/*
+ * Move the diradd to the pending list when all diradd dependencies are
+ * complete.
+ */
+static void
+complete_diradd(dap)
+ struct diradd *dap;
+{
+ struct pagedep *pagedep;
+
+ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ if (dap->da_state & DIRCHG)
+ pagedep = dap->da_previous->dm_pagedep;
+ else
+ pagedep = dap->da_pagedep;
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
+ }
+}
+
+/*
+ * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal
+ * add entries and conditionally journal the remove.
+ */
+static void
+cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
+ struct diradd *dap;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct inoref *inoref;
+ struct mkdir *mkdir;
+
+ /*
+ * If no remove references were allocated we're on a non-journaled
+ * filesystem and can skip the cancel step.
+ */
+ if (jremref == NULL) {
+ free_diradd(dap, NULL);
+ return;
+ }
+ /*
+	 * Cancel the primary name and free it if it does not require
+ * journaling.
+ */
+ if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
+ 0, &inodedep) != 0) {
+		/* Abort the addref that references this diradd.  */
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if (inoref->if_list.wk_type != D_JADDREF)
+ continue;
+ jaddref = (struct jaddref *)inoref;
+ if (jaddref->ja_diradd != dap)
+ continue;
+ if (cancel_jaddref(jaddref, inodedep,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(jremref);
+ jremref = NULL;
}
+ break;
}
}
-done:
- bcopy(oldloc, newloc, entrysize);
- FREE_LOCK(&lk);
+ /*
+ * Cancel subordinate names and free them if they do not require
+ * journaling.
+ */
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
+ LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
+ if (mkdir->md_diradd != dap)
+ continue;
+ if ((jaddref = mkdir->md_jaddref) == NULL)
+ continue;
+ mkdir->md_jaddref = NULL;
+ if (mkdir->md_state & MKDIR_PARENT) {
+ if (cancel_jaddref(jaddref, NULL,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(dotdotremref);
+ dotdotremref = NULL;
+ }
+ } else {
+ if (cancel_jaddref(jaddref, inodedep,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(dotremref);
+ dotremref = NULL;
+ }
+ }
+ }
+ }
+
+ if (jremref)
+ journal_jremref(dirrem, jremref, inodedep);
+ if (dotremref)
+ journal_jremref(dirrem, dotremref, inodedep);
+ if (dotdotremref)
+ journal_jremref(dirrem, dotdotremref, NULL);
+ jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
+ free_diradd(dap, &dirrem->dm_jwork);
}
/*
@@ -3191,8 +6634,9 @@ done:
* with splbio interrupts blocked.
*/
static void
-free_diradd(dap)
+free_diradd(dap, wkhd)
struct diradd *dap;
+ struct workhead *wkhd;
{
struct dirrem *dirrem;
struct pagedep *pagedep;
@@ -3200,32 +6644,48 @@ free_diradd(dap)
struct mkdir *mkdir, *nextmd;
mtx_assert(&lk, MA_OWNED);
- WORKLIST_REMOVE(&dap->da_list);
LIST_REMOVE(dap, da_pdlist);
+ if (dap->da_state & ONWORKLIST)
+ WORKLIST_REMOVE(&dap->da_list);
if ((dap->da_state & DIRCHG) == 0) {
pagedep = dap->da_pagedep;
} else {
dirrem = dap->da_previous;
pagedep = dirrem->dm_pagedep;
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ dirrem->dm_state |= COMPLETE;
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
0, &inodedep) != 0)
- (void) free_inodedep(inodedep);
+ if (inodedep->id_mkdiradd == dap)
+ inodedep->id_mkdiradd = NULL;
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
nextmd = LIST_NEXT(mkdir, md_mkdirs);
if (mkdir->md_diradd != dap)
continue;
- dap->da_state &= ~mkdir->md_state;
- WORKLIST_REMOVE(&mkdir->md_list);
+ dap->da_state &=
+ ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
LIST_REMOVE(mkdir, md_mkdirs);
+ if (mkdir->md_state & ONWORKLIST)
+ WORKLIST_REMOVE(&mkdir->md_list);
+ if (mkdir->md_jaddref != NULL)
+ panic("free_diradd: Unexpected jaddref");
WORKITEM_FREE(mkdir, D_MKDIR);
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
+ break;
}
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
panic("free_diradd: unfound ref");
}
+ if (inodedep)
+ free_inodedep(inodedep);
+ /*
+ * Free any journal segments waiting for the directory write.
+ */
+ handle_jwork(&dap->da_jwork);
WORKITEM_FREE(dap, D_DIRADD);
}
@@ -3254,11 +6714,24 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
int isrmdir; /* indicates if doing RMDIR */
{
struct dirrem *dirrem, *prevdirrem;
+ struct inodedep *inodedep;
+ int direct;
/*
- * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
+ * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want
+	 * newdirrem() to set up the full directory remove, which requires
+ * isrmdir > 1.
*/
- dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
+ dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem);
+ /*
+ * Add the dirrem to the inodedep's pending remove list for quick
+ * discovery later.
+ */
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0)
+ panic("softdep_setup_remove: Lost inodedep.");
+ dirrem->dm_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
/*
* If the COMPLETE flag is clear, then there were no active
@@ -3280,9 +6753,146 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
prevdirrem, dm_next);
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
+ direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
FREE_LOCK(&lk);
- handle_workitem_remove(dirrem, NULL);
+ if (direct)
+ handle_workitem_remove(dirrem, NULL);
+ }
+}
+
+/*
+ * Check for an entry matching 'offset' on both the pd_diraddhd list and the
+ * pd_pendinghd list of a pagedep.
+ */
+static struct diradd *
+diradd_lookup(pagedep, offset)
+ struct pagedep *pagedep;
+ int offset;
+{
+ struct diradd *dap;
+
+ LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
+ if (dap->da_offset == offset)
+ return (dap);
+ LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
+ if (dap->da_offset == offset)
+ return (dap);
+ return (NULL);
+}
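The two-list probe (hash chains first, then the pending list) can be
exercised outside the kernel with the same queue(3) machinery; DAHASHSZ and
the hash function below are stand-ins for the kernel definitions:

    #include <stdio.h>
    #include <sys/queue.h>

    #define DAHASHSZ        6               /* stand-in for the kernel constant */
    #define DIRADDHASH(off) (((off) >> 2) % DAHASHSZ)

    struct diradd {
        LIST_ENTRY(diradd) da_pdlist;
        int da_offset;
    };
    LIST_HEAD(dalist, diradd);

    /* Probe the per-offset hash chain first, then the pending list. */
    static struct diradd *
    lookup(struct dalist *hash, struct dalist *pending, int offset)
    {
        struct diradd *dap;

        LIST_FOREACH(dap, &hash[DIRADDHASH(offset)], da_pdlist)
            if (dap->da_offset == offset)
                return (dap);
        LIST_FOREACH(dap, pending, da_pdlist)
            if (dap->da_offset == offset)
                return (dap);
        return (NULL);
    }

    int
    main(void)
    {
        struct dalist hash[DAHASHSZ], pending;
        struct diradd d = { .da_offset = 512 };
        int i;

        for (i = 0; i < DAHASHSZ; i++)
            LIST_INIT(&hash[i]);
        LIST_INIT(&pending);
        LIST_INSERT_HEAD(&hash[DIRADDHASH(512)], &d, da_pdlist);
        printf("found at 512: %d\n", lookup(hash, &pending, 512) != NULL);
        return (0);
    }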
+
+/*
+ * Search for a .. diradd dependency in a directory that is being removed.
+ * If the directory was renamed to a new parent we have a diradd rather
+ * than a mkdir for the .. entry. We need to cancel it now before
+ * it is found in truncate().
+ */
+static struct jremref *
+cancel_diradd_dotdot(ip, dirrem, jremref)
+ struct inode *ip;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+{
+ struct pagedep *pagedep;
+ struct diradd *dap;
+ struct worklist *wk;
+
+ if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
+ &pagedep) == 0)
+ return (jremref);
+ dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
+ if (dap == NULL)
+ return (jremref);
+ cancel_diradd(dap, dirrem, jremref, NULL, NULL);
+ /*
+ * Mark any journal work as belonging to the parent so it is freed
+ * with the .. reference.
+ */
+ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
+ wk->wk_state |= MKDIR_PARENT;
+ return (NULL);
+}
+
+/*
+ * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
+ * replace it with a dirrem/diradd pair as a result of re-parenting a
+ * directory. This ensures that we don't simultaneously have a mkdir and
+ * a diradd for the same .. entry.
+ */
+static struct jremref *
+cancel_mkdir_dotdot(ip, dirrem, jremref)
+ struct inode *ip;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct mkdir *mkdir;
+ struct diradd *dap;
+
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0)
+ panic("cancel_mkdir_dotdot: Lost inodedep");
+ dap = inodedep->id_mkdiradd;
+ if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
+ return (jremref);
+ for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
+ mkdir = LIST_NEXT(mkdir, md_mkdirs))
+ if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
+ break;
+ if (mkdir == NULL)
+ panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
+ if ((jaddref = mkdir->md_jaddref) != NULL) {
+ mkdir->md_jaddref = NULL;
+ jaddref->ja_state &= ~MKDIR_PARENT;
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
+ &inodedep) == 0)
+ panic("cancel_mkdir_dotdot: Lost parent inodedep");
+ if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
+ journal_jremref(dirrem, jremref, inodedep);
+ jremref = NULL;
+ }
}
+ if (mkdir->md_state & ONWORKLIST)
+ WORKLIST_REMOVE(&mkdir->md_list);
+ mkdir->md_state |= ALLCOMPLETE;
+ complete_mkdir(mkdir);
+ return (jremref);
+}
+
+static void
+journal_jremref(dirrem, jremref, inodedep)
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct inodedep *inodedep;
+{
+
+ if (inodedep == NULL)
+ if (inodedep_lookup(jremref->jr_list.wk_mp,
+ jremref->jr_ref.if_ino, 0, &inodedep) == 0)
+ panic("journal_jremref: Lost inodedep");
+ LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+ add_to_journal(&jremref->jr_list);
+}
+
+static void
+dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+{
+ struct inodedep *inodedep;
+
+ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
+ &inodedep) == 0)
+ panic("dirrem_journal: Lost inodedep");
+ journal_jremref(dirrem, jremref, inodedep);
+ if (dotremref)
+ journal_jremref(dirrem, dotremref, inodedep);
+ if (dotdotremref)
+ journal_jremref(dirrem, dotdotremref, NULL);
}
/*
@@ -3303,12 +6913,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
struct diradd *dap;
struct dirrem *dirrem;
struct pagedep *pagedep;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+ struct vnode *dvp;
/*
* Whiteouts have no deletion dependencies.
*/
if (ip == NULL)
panic("newdirrem: whiteout");
+ dvp = ITOV(dp);
/*
* If we are over our limit, try to improve the situation.
* Limiting the number of dirrem structures will also limit
@@ -3321,34 +6936,75 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
FREE_LOCK(&lk);
dirrem = malloc(sizeof(struct dirrem),
M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
+ workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
+ LIST_INIT(&dirrem->dm_jremrefhd);
+ LIST_INIT(&dirrem->dm_jwork);
dirrem->dm_state = isrmdir ? RMDIR : 0;
dirrem->dm_oldinum = ip->i_number;
*prevdirremp = NULL;
-
+ /*
+ * Allocate remove reference structures to track journal write
+ * dependencies. We will always have one for the link and
+ * when doing directories we will always have one more for dot.
+ * When renaming a directory we skip the dotdot link change so
+ * this is not needed.
+ */
+ jremref = dotremref = dotdotremref = NULL;
+ if (DOINGSUJ(dvp)) {
+ if (isrmdir) {
+ jremref = newjremref(dirrem, dp, ip, dp->i_offset,
+ ip->i_effnlink + 2);
+ dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
+ ip->i_effnlink + 1);
+ } else
+ jremref = newjremref(dirrem, dp, ip, dp->i_offset,
+ ip->i_effnlink + 1);
+ if (isrmdir > 1) {
+ dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
+ dp->i_effnlink + 1);
+ dotdotremref->jr_state |= MKDIR_PARENT;
+ }
+ }
ACQUIRE_LOCK(&lk);
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
- if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
+ if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
+ &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dirrem->dm_pagedep = pagedep;
/*
+ * If we're renaming a .. link to a new directory, cancel any
+	 * existing MKDIR_PARENT mkdir.  If it has already been canceled,
+	 * the jremref is preserved for any potential diradd in this
+	 * location.  This cannot coincide with a rmdir.
+ */
+ if (dp->i_offset == DOTDOT_OFFSET) {
+ if (isrmdir)
+ panic("newdirrem: .. directory change during remove?");
+ jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
+ }
+ /*
+	 * If we're removing a directory, search for the .. dependency now and
+ * cancel it. Any pending journal work will be added to the dirrem
+ * to be completed when the workitem remove completes.
+ */
+ if (isrmdir > 1)
+ dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
+ /*
* Check for a diradd dependency for the same directory entry.
* If present, then both dependencies become obsolete and can
- * be de-allocated. Check for an entry on both the pd_dirraddhd
- * list and the pd_pendinghd list.
+ * be de-allocated.
*/
-
- LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
- if (dap->da_offset == offset)
- break;
+ dap = diradd_lookup(pagedep, offset);
if (dap == NULL) {
-
- LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
- if (dap->da_offset == offset)
- break;
- if (dap == NULL)
- return (dirrem);
+ /*
+ * Link the jremref structures into the dirrem so they are
+ * written prior to the pagedep.
+ */
+ if (jremref)
+ dirrem_journal(dirrem, jremref, dotremref,
+ dotdotremref);
+ return (dirrem);
}
/*
* Must be ATTACHED at this point.
@@ -3373,7 +7029,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
* Mark it COMPLETE so we can delete its inode immediately.
*/
dirrem->dm_state |= COMPLETE;
- free_diradd(dap);
+ cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
+#ifdef SUJ_DEBUG
+ if (isrmdir == 0) {
+ struct worklist *wk;
+
+ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
+ if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
+ panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
+ }
+#endif
+
return (dirrem);
}
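
A minimal sketch of the reference accounting above (userspace C, hypothetical helper name, not part of the patch): how many jremref structures newdirrem() allocates for each kind of removal.

    #include <stdio.h>

    /*
     * isrmdir == 0 is a plain unlink, 1 is rmdir, and > 1 marks a
     * directory rename, which must also cover the ".." change.
     */
    static int
    jremref_count(int isrmdir)
    {
        int n = 1;              /* the removed name itself */

        if (isrmdir)
            n++;                /* the "." self-reference */
        if (isrmdir > 1)
            n++;                /* the ".." parent reference */
        return (n);
    }

    int
    main(void)
    {
        printf("unlink: %d, rmdir: %d, dir rename: %d\n",
            jremref_count(0), jremref_count(1), jremref_count(2));
        return (0);
    }
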
@@ -3407,6 +7073,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
struct dirrem *dirrem, *prevdirrem;
struct pagedep *pagedep;
struct inodedep *inodedep;
+ struct jaddref *jaddref;
struct mount *mp;
offset = blkoff(dp->i_fs, dp->i_offset);
@@ -3422,6 +7089,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
dap->da_offset = offset;
dap->da_newinum = newinum;
+ LIST_INIT(&dap->da_jwork);
}
/*
@@ -3454,11 +7122,21 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dm_next);
} else {
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
FREE_LOCK(&lk);
return;
}
+ /*
+ * Add the dirrem to the inodedep's pending remove list for quick
+ * discovery later. A valid nlinkdelta ensures that this lookup
+ * will not fail.
+ */
+ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
+ panic("softdep_setup_directory_change: Lost inodedep.");
+ dirrem->dm_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
/*
* If the COMPLETE flag is clear, then there were no active
@@ -3483,15 +7161,29 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap->da_pagedep = pagedep;
}
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
/*
- * Link into its inodedep. Put it on the id_bufwait list if the inode
+	 * Look up the jaddref for this journal entry. We must finish
+	 * initializing it and make the diradd write dependent on it.
+	 * If we're not journaling, put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
- if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
- (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_directory_change: bad jaddref %p",
+ jaddref));
+ jaddref->ja_diroff = dp->i_offset;
+ jaddref->ja_diradd = dap;
+ LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
+ dap, da_pdlist);
+ add_to_journal(&jaddref->ja_list);
+ } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
dap->da_state |= COMPLETE;
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
@@ -3500,6 +7192,13 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
}
+ /*
+ * If we're making a new name for a directory that has not been
+	 * committed, we need to move the dot and dotdot references to
+ * this new name.
+ */
+ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
+ merge_diradd(inodedep, dap);
FREE_LOCK(&lk);
}
@@ -3516,8 +7215,7 @@ softdep_change_linkcnt(ip)
struct inodedep *inodedep;
ACQUIRE_LOCK(&lk);
- (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
- DEPALLOC, &inodedep);
+ inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
if (ip->i_nlink < ip->i_effnlink)
panic("softdep_change_linkcnt: bad delta");
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
@@ -3574,6 +7272,305 @@ softdep_releasefile(ip)
}
/*
+ * Attach a sbdep dependency to the superblock buf so that we can keep
+ * track of the head of the linked list of referenced but unlinked inodes.
+ */
+void
+softdep_setup_sbupdate(ump, fs, bp)
+ struct ufsmount *ump;
+ struct fs *fs;
+ struct buf *bp;
+{
+ struct sbdep *sbdep;
+ struct worklist *wk;
+
+ if ((fs->fs_flags & FS_SUJ) == 0)
+ return;
+ LIST_FOREACH(wk, &bp->b_dep, wk_list)
+ if (wk->wk_type == D_SBDEP)
+ break;
+ if (wk != NULL)
+ return;
+ sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
+ workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
+ sbdep->sb_fs = fs;
+ sbdep->sb_ump = ump;
+ ACQUIRE_LOCK(&lk);
+ WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Return the first unlinked inodedep which is ready to be the head of the
+ * list. The inodedep and all those after it must have valid next pointers.
+ */
+static struct inodedep *
+first_unlinked_inodedep(ump)
+ struct ufsmount *ump;
+{
+ struct inodedep *inodedep;
+ struct inodedep *idp;
+
+ for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
+ inodedep; inodedep = idp) {
+ if ((inodedep->id_state & UNLINKNEXT) == 0)
+ return (NULL);
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
+ break;
+ if ((inodedep->id_state & UNLINKPREV) == 0)
+ panic("first_unlinked_inodedep: prev != next");
+ }
+ if (inodedep == NULL)
+ return (NULL);
+
+ return (inodedep);
+}
+
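
The backward walk above uses only standard tail-queue primitives; here is a minimal userspace sketch of the same scan, assuming a BSD-style <sys/queue.h>, with a NEXT flag standing in for UNLINKNEXT ("my inode number is already committed in my predecessor").

    #include <sys/queue.h>
    #include <stdio.h>

    #define NEXT 0x01               /* stands in for UNLINKNEXT */

    struct node {
        int state;
        TAILQ_ENTRY(node) link;
    };
    TAILQ_HEAD(nodelst, node);

    /*
     * Walk backward from the tail; every element behind the result
     * must already have its on-disk next pointer committed.
     */
    static struct node *
    first_ready(struct nodelst *head)
    {
        struct node *np, *pp;

        for (np = TAILQ_LAST(head, nodelst); np != NULL; np = pp) {
            if ((np->state & NEXT) == 0)
                return (NULL);
            pp = TAILQ_PREV(np, nodelst, link);
            if (pp == NULL || (pp->state & NEXT) == 0)
                break;              /* earliest fully linked node */
        }
        return (np);
    }

    int
    main(void)
    {
        struct nodelst head = TAILQ_HEAD_INITIALIZER(head);
        struct node n[3];
        int i;

        for (i = 0; i < 3; i++) {
            n[i].state = NEXT;
            TAILQ_INSERT_TAIL(&head, &n[i], link);
        }
        printf("first ready: node %d\n", (int)(first_ready(&head) - n));
        return (0);
    }
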
+/*
+ * Set the sujfree unlinked head pointer prior to writing a superblock.
+ */
+static void
+initiate_write_sbdep(sbdep)
+ struct sbdep *sbdep;
+{
+ struct inodedep *inodedep;
+ struct fs *bpfs;
+ struct fs *fs;
+
+ bpfs = sbdep->sb_fs;
+ fs = sbdep->sb_ump->um_fs;
+ inodedep = first_unlinked_inodedep(sbdep->sb_ump);
+ if (inodedep) {
+ fs->fs_sujfree = inodedep->id_ino;
+ inodedep->id_state |= UNLINKPREV;
+ } else
+ fs->fs_sujfree = 0;
+ bpfs->fs_sujfree = fs->fs_sujfree;
+}
+
+/*
+ * After a superblock is written determine whether it must be written again
+ * due to a changing unlinked list head.
+ */
+static int
+handle_written_sbdep(sbdep, bp)
+ struct sbdep *sbdep;
+ struct buf *bp;
+{
+ struct inodedep *inodedep;
+ struct mount *mp;
+ struct fs *fs;
+
+ fs = sbdep->sb_fs;
+ mp = UFSTOVFS(sbdep->sb_ump);
+ inodedep = first_unlinked_inodedep(sbdep->sb_ump);
+ if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
+ (inodedep == NULL && fs->fs_sujfree != 0)) {
+ bdirty(bp);
+ return (1);
+ }
+ WORKITEM_FREE(sbdep, D_SBDEP);
+ if (fs->fs_sujfree == 0)
+ return (0);
+ if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
+ panic("handle_written_sbdep: lost inodedep");
+ /*
+	 * Now that we have a record of this inode in stable store, allow it
+ * to be written to free up pending work. Inodes may see a lot of
+ * write activity after they are unlinked which we must not hold up.
+ */
+ for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
+ if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
+ panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
+ inodedep, inodedep->id_state);
+ if (inodedep->id_state & UNLINKONLIST)
+ break;
+ inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
+ }
+
+ return (0);
+}
+
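
The return convention above is shared by the handle_written_*() routines: returning 1 asks the completion path to redirty the buffer and reattach the work item, so the check runs again after the follow-up write. A compact userspace sketch with hypothetical stand-in types:

    #include <stdio.h>

    struct xbuf { int dirty; };
    struct work { int stale; };     /* head moved during the write */

    /* Returns 1 if the buffer must be rewritten with wk reattached. */
    static int
    handle_written(struct work *wk, struct xbuf *bp)
    {
        if (wk->stale) {
            bp->dirty = 1;          /* bdirty(bp) analogue */
            return (1);
        }
        return (0);                 /* WORKITEM_FREE() analogue */
    }

    int
    main(void)
    {
        struct xbuf bp = { 0 };
        struct work wk = { 1 };

        while (handle_written(&wk, &bp)) {
            printf("rewriting superblock\n");
            bp.dirty = 0;           /* the follow-up write happened */
            wk.stale = 0;           /* and the head is now stable */
        }
        return (0);
    }
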
+/*
+ * Mark an inodedep as unlinked and insert it into the in-memory unlinked
+ * list.
+ */
+static void
+unlinked_inodedep(mp, inodedep)
+ struct mount *mp;
+ struct inodedep *inodedep;
+{
+ struct ufsmount *ump;
+
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
+ return;
+ ump = VFSTOUFS(mp);
+ ump->um_fs->fs_fmod = 1;
+ inodedep->id_state |= UNLINKED;
+ TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
+}
+
+/*
+ * Remove an inodedep from the unlinked inodedep list. This may require
+ * disk writes if the inode has made it that far.
+ */
+static void
+clear_unlinked_inodedep(inodedep)
+ struct inodedep *inodedep;
+{
+ struct ufsmount *ump;
+ struct inodedep *idp;
+ struct inodedep *idn;
+ struct fs *fs;
+ struct buf *bp;
+ ino_t ino;
+ ino_t nino;
+ ino_t pino;
+ int error;
+
+ ump = VFSTOUFS(inodedep->id_list.wk_mp);
+ fs = ump->um_fs;
+ ino = inodedep->id_ino;
+ error = 0;
+ for (;;) {
+ /*
+		 * If nothing has yet been written, simply remove us from
+		 * the in-memory list and return. This is the most common
+ * case where handle_workitem_remove() loses the final
+ * reference.
+ */
+ if ((inodedep->id_state & UNLINKLINKS) == 0)
+ break;
+ /*
+ * If we have a NEXT pointer and no PREV pointer we can simply
+ * clear NEXT's PREV and remove ourselves from the list. Be
+ * careful not to clear PREV if the superblock points at
+ * next as well.
+ */
+ idn = TAILQ_NEXT(inodedep, id_unlinked);
+ if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
+ if (idn && fs->fs_sujfree != idn->id_ino)
+ idn->id_state &= ~UNLINKPREV;
+ break;
+ }
+ /*
+ * Here we have an inodedep which is actually linked into
+ * the list. We must remove it by forcing a write to the
+ * link before us, whether it be the superblock or an inode.
+ * Unfortunately the list may change while we're waiting
+ * on the buf lock for either resource so we must loop until
+		 * we lock the right one. If both the superblock and an
+ * inode point to this inode we must clear the inode first
+ * followed by the superblock.
+ */
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ pino = 0;
+ if (idp && (idp->id_state & UNLINKNEXT))
+ pino = idp->id_ino;
+ FREE_LOCK(&lk);
+ if (pino == 0)
+ bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
+ (int)fs->fs_sbsize, 0, 0, 0);
+ else
+ error = bread(ump->um_devvp,
+ fsbtodb(fs, ino_to_fsba(fs, pino)),
+ (int)fs->fs_bsize, NOCRED, &bp);
+ ACQUIRE_LOCK(&lk);
+ if (error)
+ break;
+ /* If the list has changed restart the loop. */
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ nino = 0;
+ if (idp && (idp->id_state & UNLINKNEXT))
+ nino = idp->id_ino;
+ if (nino != pino ||
+ (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
+ FREE_LOCK(&lk);
+ brelse(bp);
+ ACQUIRE_LOCK(&lk);
+ continue;
+ }
+ /*
+		 * Remove us from the in-memory list. After this we cannot
+ * access the inodedep.
+ */
+ idn = TAILQ_NEXT(inodedep, id_unlinked);
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
+ /*
+ * Determine the next inode number.
+ */
+ nino = 0;
+ if (idn) {
+ /*
+ * If next isn't on the list we can just clear prev's
+ * state and schedule it to be fixed later. No need
+ * to synchronously write if we're not in the real
+ * list.
+ */
+ if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
+ idp->id_state &= ~UNLINKNEXT;
+ if ((idp->id_state & ONWORKLIST) == 0)
+ WORKLIST_INSERT(&bp->b_dep,
+ &idp->id_list);
+ FREE_LOCK(&lk);
+ bawrite(bp);
+ ACQUIRE_LOCK(&lk);
+ return;
+ }
+ nino = idn->id_ino;
+ }
+ FREE_LOCK(&lk);
+ /*
+ * The predecessor's next pointer is manually updated here
+ * so that the NEXT flag is never cleared for an element
+ * that is in the list.
+ */
+ if (pino == 0) {
+ bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
+ ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
+ softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
+ bp);
+ } else if (fs->fs_magic == FS_UFS1_MAGIC)
+ ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, pino))->di_freelink = nino;
+ else
+ ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, pino))->di_freelink = nino;
+ /*
+ * If the bwrite fails we have no recourse to recover. The
+ * filesystem is corrupted already.
+ */
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ /*
+ * If the superblock pointer still needs to be cleared force
+ * a write here.
+ */
+ if (fs->fs_sujfree == ino) {
+ FREE_LOCK(&lk);
+ bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
+ (int)fs->fs_sbsize, 0, 0, 0);
+ bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
+ ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
+ softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
+ bp);
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ }
+ if (fs->fs_sujfree != ino)
+ return;
+ panic("clear_unlinked_inodedep: Failed to clear free head");
+ }
+ if (inodedep->id_ino == fs->fs_sujfree)
+ panic("clear_unlinked_inodedep: Freeing head of free list");
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
+ return;
+}
+
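The write ordering above, predecessor first and the superblock head last, can be modeled in a few lines. A userspace sketch with a hypothetical write_block() in place of the real buffer-cache writes:

    #include <stdio.h>

    struct dnode {
        int ino;
        int freelink;               /* on-disk next pointer */
    };

    static int head_free;           /* stands in for fs_sujfree */

    static void
    write_block(const char *what, int ino)
    {
        /* A real implementation would bwrite() a buffer here. */
        printf("write %s for ino %d\n", what, ino);
    }

    /*
     * Splice victim out of the on-disk singly linked list.  The
     * predecessor (if any) reaches stable storage first; only then
     * is the head moved, so a crash never leaves the head pointing
     * past an element whose link was not yet valid.
     */
    static void
    unlink_node(struct dnode *prev, struct dnode *victim)
    {
        if (prev != NULL) {
            prev->freelink = victim->freelink;
            write_block("inode block", prev->ino);
        }
        if (head_free == victim->ino) {
            head_free = victim->freelink;
            write_block("superblock", victim->ino);
        }
    }

    int
    main(void)
    {
        struct dnode a = { 7, 9 }, b = { 9, 0 };

        head_free = a.ino;
        unlink_node(&a, &b);        /* interior element */
        unlink_node(NULL, &a);      /* then the head */
        printf("head is now %d\n", head_free);
        return (0);
    }
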
+/*
* This workitem decrements the inode's link count.
* If the link count reaches zero, the file is removed.
*/
@@ -3584,22 +7581,54 @@ handle_workitem_remove(dirrem, xp)
{
struct thread *td = curthread;
struct inodedep *inodedep;
+ struct workhead dotdotwk;
+ struct worklist *wk;
+ struct ufsmount *ump;
+ struct mount *mp;
struct vnode *vp;
struct inode *ip;
ino_t oldinum;
int error;
+ if (dirrem->dm_state & ONWORKLIST)
+ panic("handle_workitem_remove: dirrem %p still on worklist",
+ dirrem);
+ oldinum = dirrem->dm_oldinum;
+ mp = dirrem->dm_list.wk_mp;
+ ump = VFSTOUFS(mp);
if ((vp = xp) == NULL &&
- (error = ffs_vgetf(dirrem->dm_list.wk_mp,
- dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) {
+ (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
+ FFSV_FORCEINSMQ)) != 0) {
softdep_error("handle_workitem_remove: vget", error);
return;
}
ip = VTOI(vp);
ACQUIRE_LOCK(&lk);
- if ((inodedep_lookup(dirrem->dm_list.wk_mp,
- dirrem->dm_oldinum, 0, &inodedep)) == 0)
+ if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
panic("handle_workitem_remove: lost inodedep");
+ if (dirrem->dm_state & ONDEPLIST)
+ LIST_REMOVE(dirrem, dm_inonext);
+ KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
+ ("handle_workitem_remove: Journal entries not written."));
+
+ /*
+ * Move all dependencies waiting on the remove to complete
+ * from the dirrem to the inode inowait list to be completed
+ * after the inode has been updated and written to disk. Any
+ * marked MKDIR_PARENT are saved to be completed when the .. ref
+ * is removed.
+ */
+ LIST_INIT(&dotdotwk);
+ while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ if (wk->wk_state & MKDIR_PARENT) {
+ wk->wk_state &= ~MKDIR_PARENT;
+ WORKLIST_INSERT(&dotdotwk, wk);
+ continue;
+ }
+ WORKLIST_INSERT(&inodedep->id_inowait, wk);
+ }
+ LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
/*
* Normal file deletion.
*/
@@ -3609,12 +7638,16 @@ handle_workitem_remove(dirrem, xp)
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
+ if (ip->i_nlink == 0)
+ unlinked_inodedep(mp, inodedep);
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
num_dirrem -= 1;
+ KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
+ ("handle_workitem_remove: worklist not empty. %s",
+ TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
WORKITEM_FREE(dirrem, D_DIRREM);
FREE_LOCK(&lk);
- vput(vp);
- return;
+ goto out;
}
/*
* Directory deletion. Decrement reference count for both the
@@ -3628,6 +7661,8 @@ handle_workitem_remove(dirrem, xp)
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad dir delta");
+ if (ip->i_nlink == 0)
+ unlinked_inodedep(mp, inodedep);
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
@@ -3639,36 +7674,47 @@ handle_workitem_remove(dirrem, xp)
* directory should not change. Thus we skip the followup dirrem.
*/
if (dirrem->dm_state & DIRCHG) {
+ KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
+ ("handle_workitem_remove: DIRCHG and worklist not empty."));
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
FREE_LOCK(&lk);
- vput(vp);
- return;
+ goto out;
}
+ dirrem->dm_state = ONDEPLIST;
+ dirrem->dm_oldinum = dirrem->dm_dirinum;
/*
- * If the inodedep does not exist, then the zero'ed inode has
- * been written to disk. If the allocated inode has never been
- * written to disk, then the on-disk inode is zero'ed. In either
- * case we can remove the file immediately.
+ * Place the dirrem on the parent's diremhd list.
*/
- dirrem->dm_state = 0;
- oldinum = dirrem->dm_oldinum;
- dirrem->dm_oldinum = dirrem->dm_dirinum;
- if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
- 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
+ if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
+ panic("handle_workitem_remove: lost dir inodedep");
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
+ /*
+ * If the allocated inode has never been written to disk, then
+ * the on-disk inode is zero'ed and we can remove the file
+	 * immediately. When journaling, if the inode has been marked
+	 * unlinked and not DEPCOMPLETE, we know it can never be written.
+ */
+ inodedep_lookup(mp, oldinum, 0, &inodedep);
+ if (inodedep == NULL ||
+ (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
+ check_inode_unwritten(inodedep)) {
if (xp != NULL)
- add_to_worklist(&dirrem->dm_list);
+ add_to_worklist(&dirrem->dm_list, 0);
FREE_LOCK(&lk);
- vput(vp);
- if (xp == NULL)
+ if (xp == NULL) {
+ vput(vp);
handle_workitem_remove(dirrem, NULL);
+ }
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
FREE_LOCK(&lk);
ip->i_flag |= IN_CHANGE;
+out:
ffs_update(vp, 0);
- vput(vp);
+ if (xp == NULL)
+ vput(vp);
}
/*
@@ -3689,6 +7735,7 @@ static void
handle_workitem_freefile(freefile)
struct freefile *freefile;
{
+ struct workhead wkhd;
struct fs *fs;
struct inodedep *idp;
struct ufsmount *ump;
@@ -3701,13 +7748,15 @@ handle_workitem_freefile(freefile)
error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
FREE_LOCK(&lk);
if (error)
- panic("handle_workitem_freefile: inodedep survived");
+ panic("handle_workitem_freefile: inodedep %p survived", idp);
#endif
UFS_LOCK(ump);
fs->fs_pendinginodes -= 1;
UFS_UNLOCK(ump);
+ LIST_INIT(&wkhd);
+ LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
- freefile->fx_oldinum, freefile->fx_mode)) != 0)
+ freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
softdep_error("handle_workitem_freefile", error);
ACQUIRE_LOCK(&lk);
WORKITEM_FREE(freefile, D_FREEFILE);
@@ -3757,8 +7806,10 @@ softdep_disk_io_initiation(bp)
{
struct worklist *wk;
struct worklist marker;
- struct indirdep *indirdep;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
+ struct jfreeblk *jfreeblk;
+ struct newblk *newblk;
/*
* We only care about write operations. There should never
@@ -3767,6 +7818,10 @@ softdep_disk_io_initiation(bp)
if (bp->b_iocmd != BIO_WRITE)
panic("softdep_disk_io_initiation: not write");
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("softdep_disk_io_initiation: Writing buffer with "
+ "background write in progress: %p", bp);
+
marker.wk_type = D_LAST + 1; /* Not a normal workitem */
PHOLD(curproc); /* Don't swap out kernel stack */
@@ -3792,46 +7847,58 @@ softdep_disk_io_initiation(bp)
continue;
case D_INDIRDEP:
- indirdep = WK_INDIRDEP(wk);
- if (indirdep->ir_state & GOINGAWAY)
- panic("disk_io_initiation: indirdep gone");
+ initiate_write_indirdep(WK_INDIRDEP(wk), bp);
+ continue;
+
+ case D_BMSAFEMAP:
+ initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
+ continue;
+
+ case D_JSEG:
+ WK_JSEG(wk)->js_buf = NULL;
+ continue;
+
+ case D_FREEBLKS:
+ freeblks = WK_FREEBLKS(wk);
+ jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
/*
- * If there are no remaining dependencies, this
- * will be writing the real pointers, so the
- * dependency can be freed.
+ * We have to wait for the jfreeblks to be journaled
+ * before we can write an inodeblock with updated
+ * pointers. Be careful to arrange the marker so
+ * we revisit the jfreeblk if it's not removed by
+ * the first jwait().
*/
- if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
- struct buf *bp;
-
- bp = indirdep->ir_savebp;
- bp->b_flags |= B_INVAL | B_NOCACHE;
- /* inline expand WORKLIST_REMOVE(wk); */
- wk->wk_state &= ~ONWORKLIST;
- LIST_REMOVE(wk, wk_list);
- WORKITEM_FREE(indirdep, D_INDIRDEP);
- FREE_LOCK(&lk);
- brelse(bp);
- ACQUIRE_LOCK(&lk);
- continue;
+ if (jfreeblk != NULL) {
+ LIST_REMOVE(&marker, wk_list);
+ LIST_INSERT_BEFORE(wk, &marker, wk_list);
+ jwait(&jfreeblk->jf_list);
}
+ continue;
+ case D_ALLOCDIRECT:
+ case D_ALLOCINDIR:
/*
- * Replace up-to-date version with safe version.
+ * We have to wait for the jnewblk to be journaled
+ * before we can write to a block otherwise the
+ * contents may be confused with an earlier file
+ * at recovery time. Handle the marker as described
+ * above.
*/
- FREE_LOCK(&lk);
- indirdep->ir_saveddata = malloc(bp->b_bcount,
- M_INDIRDEP, M_SOFTDEP_FLAGS);
- ACQUIRE_LOCK(&lk);
- indirdep->ir_state &= ~ATTACHED;
- indirdep->ir_state |= UNDONE;
- bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
- bcopy(indirdep->ir_savebp->b_data, bp->b_data,
- bp->b_bcount);
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk != NULL) {
+ LIST_REMOVE(&marker, wk_list);
+ LIST_INSERT_BEFORE(wk, &marker, wk_list);
+ jwait(&newblk->nb_jnewblk->jn_list);
+ }
+ continue;
+
+ case D_SBDEP:
+ initiate_write_sbdep(WK_SBDEP(wk));
continue;
case D_MKDIR:
- case D_BMSAFEMAP:
- case D_ALLOCDIRECT:
- case D_ALLOCINDIR:
+ case D_FREEWORK:
+ case D_FREEDEP:
+ case D_JSEGDEP:
continue;
default:
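
The marker dance above is a general pattern for scanning a list when a wait may drop the lock and mutate the list: re-anchoring the marker before the current element guarantees it is revisited if the wait did not remove it. A minimal userspace sketch (no real locking; jwait_stub() stands in for jwait()):

    #include <sys/queue.h>
    #include <stdio.h>

    struct item {
        int type;                   /* 1: has an unwritten journal dep */
        LIST_ENTRY(item) link;
    };
    LIST_HEAD(itemhd, item);

    static void
    jwait_stub(struct item *ip)
    {
        /* Stands in for jwait(): may sleep and may unlink ip. */
        ip->type = 0;
    }

    static void
    scan(struct itemhd *head)
    {
        struct item marker;
        struct item *ip;

        marker.type = -1;           /* not a normal item */
        LIST_INSERT_HEAD(head, &marker, link);
        while ((ip = LIST_NEXT(&marker, link)) != NULL) {
            /* Advance the marker past ip before processing it. */
            LIST_REMOVE(&marker, link);
            LIST_INSERT_AFTER(ip, &marker, link);
            if (ip->type == 1) {
                /* Re-anchor before ip so it is revisited after
                 * the (lock-dropping) wait if it survives. */
                LIST_REMOVE(&marker, link);
                LIST_INSERT_BEFORE(ip, &marker, link);
                jwait_stub(ip);
            }
        }
        LIST_REMOVE(&marker, link);
    }

    int
    main(void)
    {
        struct itemhd head = LIST_HEAD_INITIALIZER(head);
        struct item a = { .type = 1 }, b = { .type = 0 };

        LIST_INSERT_HEAD(&head, &b, link);
        LIST_INSERT_HEAD(&head, &a, link);
        scan(&head);
        printf("scan complete\n");
        return (0);
    }
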
@@ -3855,6 +7922,9 @@ initiate_write_filepage(pagedep, bp)
struct pagedep *pagedep;
struct buf *bp;
{
+ struct jremref *jremref;
+ struct jmvref *jmvref;
+ struct dirrem *dirrem;
struct diradd *dap;
struct direct *ep;
int i;
@@ -3869,6 +7939,22 @@ initiate_write_filepage(pagedep, bp)
return;
}
pagedep->pd_state |= IOSTARTED;
+ /*
+ * Wait for all journal remove dependencies to hit the disk.
+	 * We cannot allow any potentially conflicting directory adds
+	 * to be visible before removes, and rollback is too difficult.
+	 * lk may be dropped and re-acquired; however, we hold the buf
+	 * locked so the dependency cannot go away.
+ */
+ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
+ while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jremref->jr_list);
+ }
+ while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jmvref->jm_list);
+ }
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
ep = (struct direct *)
@@ -3905,6 +7991,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
struct allocdirect *adp, *lastadp;
struct ufs1_dinode *dp;
struct ufs1_dinode *sip;
+ struct inoref *inoref;
struct fs *fs;
ufs_lbn_t i;
#ifdef INVARIANTS
@@ -3918,6 +8005,17 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
fs = inodedep->id_fs;
dp = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
+
+ /*
+ * If we're on the unlinked list but have not yet written our
+	 * next pointer, initialize it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ dp->di_freelink = inon ? inon->id_ino : 0;
+ }
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
@@ -3933,6 +8031,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*inodedep->id_savedino1 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
dp->di_gen = inodedep->id_savedino1->di_gen;
+ dp->di_freelink = inodedep->id_savedino1->di_freelink;
return;
}
/*
@@ -3940,32 +8039,40 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = 0;
- if (TAILQ_EMPTY(&inodedep->id_inoupdt))
+ inodedep->id_savednlink = dp->di_nlink;
+ if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
+ TAILQ_EMPTY(&inodedep->id_inoreflst))
return;
/*
+ * Revert the link count to that of the first unwritten journal entry.
+ */
+ inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
+ if (inoref)
+ dp->di_nlink = inoref->if_nlink;
+ /*
* Set the dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (adp->ad_lbn < NDADDR &&
- dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (adp->ad_offset < NDADDR &&
+ dp->di_db[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %d != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- dp->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ dp->di_db[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- if (adp->ad_lbn >= NDADDR &&
- dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
+ if (adp->ad_offset >= NDADDR &&
+ dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
panic("%s: indirect pointer #%jd mismatch %d != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn - NDADDR,
- dp->di_ib[adp->ad_lbn - NDADDR],
+ (intmax_t)adp->ad_offset - NDADDR,
+ dp->di_ib[adp->ad_offset - NDADDR],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -3981,14 +8088,14 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- if (adp->ad_lbn >= NDADDR)
+ if (adp->ad_offset >= NDADDR)
break;
- dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
+ dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep1");
@@ -4012,8 +8119,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
@@ -4030,7 +8137,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
- dp->di_ib[adp->ad_lbn - NDADDR] = 0;
+ dp->di_ib[adp->ad_offset - NDADDR] = 0;
}
/*
@@ -4051,6 +8158,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
struct allocdirect *adp, *lastadp;
struct ufs2_dinode *dp;
struct ufs2_dinode *sip;
+ struct inoref *inoref;
struct fs *fs;
ufs_lbn_t i;
#ifdef INVARIANTS
@@ -4064,6 +8172,29 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
fs = inodedep->id_fs;
dp = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
+
+ /*
+ * If we're on the unlinked list but have not yet written our
+	 * next pointer, initialize it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ dp->di_freelink = inon ? inon->id_ino : 0;
+ }
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
+ (UNLINKED | UNLINKNEXT)) {
+ struct inodedep *inon;
+ ino_t freelink;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ freelink = inon ? inon->id_ino : 0;
+ if (freelink != dp->di_freelink)
+ panic("ino %p(0x%X) %d, %d != %d",
+ inodedep, inodedep->id_state, inodedep->id_ino,
+ freelink, dp->di_freelink);
+ }
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
@@ -4079,6 +8210,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*inodedep->id_savedino2 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
dp->di_gen = inodedep->id_savedino2->di_gen;
+ dp->di_freelink = inodedep->id_savedino2->di_freelink;
return;
}
/*
@@ -4086,25 +8218,34 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = dp->di_extsize;
+ inodedep->id_savednlink = dp->di_nlink;
if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
- TAILQ_EMPTY(&inodedep->id_extupdt))
+ TAILQ_EMPTY(&inodedep->id_extupdt) &&
+ TAILQ_EMPTY(&inodedep->id_inoreflst))
return;
/*
+ * Revert the link count to that of the first unwritten journal entry.
+ */
+ inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
+ if (inoref)
+ dp->di_nlink = inoref->if_nlink;
+
+ /*
* Set the ext data dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- (intmax_t)dp->di_extb[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ (intmax_t)dp->di_extb[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -4120,12 +8261,12 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
+ dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NXADDR; i++) {
#ifdef INVARIANTS
if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep1");
@@ -4142,8 +8283,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_extb[i] != 0)
break;
dp->di_extsize = (i + 1) * fs->fs_bsize;
@@ -4154,24 +8295,24 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (adp->ad_lbn < NDADDR &&
- dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (adp->ad_offset < NDADDR &&
+ dp->di_db[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- (intmax_t)dp->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ (intmax_t)dp->di_db[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- if (adp->ad_lbn >= NDADDR &&
- dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
+ if (adp->ad_offset >= NDADDR &&
+ dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
panic("%s indirect pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock:",
- (intmax_t)adp->ad_lbn - NDADDR,
- (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
+ (intmax_t)adp->ad_offset - NDADDR,
+ (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -4187,14 +8328,14 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- if (adp->ad_lbn >= NDADDR)
+ if (adp->ad_offset >= NDADDR)
break;
- dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
+ dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep2");
@@ -4218,8 +8359,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
@@ -4236,7 +8377,355 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
- dp->di_ib[adp->ad_lbn - NDADDR] = 0;
+ dp->di_ib[adp->ad_offset - NDADDR] = 0;
+}
+
+/*
+ * Cancel an indirdep as a result of truncation. Release all of the
+ * children allocindirs and place their journal work on the appropriate
+ * list.
+ */
+static void
+cancel_indirdep(indirdep, bp, inodedep, freeblks)
+ struct indirdep *indirdep;
+ struct buf *bp;
+ struct inodedep *inodedep;
+ struct freeblks *freeblks;
+{
+ struct allocindir *aip;
+
+ /*
+ * None of the indirect pointers will ever be visible,
+ * so they can simply be tossed. GOINGAWAY ensures
+ * that allocated pointers will be saved in the buffer
+ * cache until they are freed. Note that they will
+ * only be able to be found by their physical address
+ * since the inode mapping the logical address will
+ * be gone. The save buffer used for the safe copy
+ * was allocated in setup_allocindir_phase2 using
+ * the physical address so it could be used for this
+ * purpose. Hence we swap the safe copy with the real
+ * copy, allowing the safe copy to be freed and holding
+ * on to the real copy for later use in indir_trunc.
+ */
+ if (indirdep->ir_state & GOINGAWAY)
+ panic("cancel_indirdep: already gone");
+ if (indirdep->ir_state & ONDEPLIST) {
+ indirdep->ir_state &= ~ONDEPLIST;
+ LIST_REMOVE(indirdep, ir_next);
+ }
+ indirdep->ir_state |= GOINGAWAY;
+ VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
+ while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
+ WORKLIST_REMOVE(&indirdep->ir_list);
+ WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
+ indirdep->ir_savebp = NULL;
+}
+
+/*
+ * Free an indirdep once it no longer has new pointers to track.
+ */
+static void
+free_indirdep(indirdep)
+ struct indirdep *indirdep;
+{
+
+ KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
+ ("free_indirdep: Journal work not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
+ ("free_indirdep: Complete head not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
+ ("free_indirdep: write head not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
+ ("free_indirdep: done head not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
+ ("free_indirdep: deplist head not empty."));
+ KASSERT(indirdep->ir_savebp == NULL,
+ ("free_indirdep: %p ir_savebp != NULL", indirdep));
+ KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
+ ("free_indirdep: %p still on deplist.", indirdep));
+ if (indirdep->ir_state & ONWORKLIST)
+ WORKLIST_REMOVE(&indirdep->ir_list);
+ WORKITEM_FREE(indirdep, D_INDIRDEP);
+}
+
+/*
+ * Called before a write to an indirdep. This routine is responsible for
+ * rolling back pointers to a safe state which includes only those
+ * allocindirs which have been completed.
+ */
+static void
+initiate_write_indirdep(indirdep, bp)
+ struct indirdep *indirdep;
+ struct buf *bp;
+{
+
+ if (indirdep->ir_state & GOINGAWAY)
+ panic("disk_io_initiation: indirdep gone");
+
+ /*
+ * If there are no remaining dependencies, this will be writing
+ * the real pointers.
+ */
+ if (LIST_EMPTY(&indirdep->ir_deplisthd))
+ return;
+ /*
+ * Replace up-to-date version with safe version.
+ */
+ FREE_LOCK(&lk);
+ indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
+ M_SOFTDEP_FLAGS);
+ ACQUIRE_LOCK(&lk);
+ indirdep->ir_state &= ~ATTACHED;
+ indirdep->ir_state |= UNDONE;
+ bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
+ bcopy(indirdep->ir_savebp->b_data, bp->b_data,
+ bp->b_bcount);
+}
+
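
A minimal userspace sketch of the save/substitute/restore cycle above, with a hypothetical xbuf in place of struct buf: the up-to-date copy is parked aside before I/O, the safe copy goes to disk, and the completion half rolls the real contents forward and redirties the buffer.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct xbuf {
        char *data;
        size_t size;
        char *saved;                /* ir_saveddata analogue */
        int dirty;
    };

    /* Before I/O: save the up-to-date copy, substitute the safe one. */
    static void
    start_write(struct xbuf *bp, const char *safe)
    {
        bp->saved = malloc(bp->size);
        memcpy(bp->saved, bp->data, bp->size);
        memcpy(bp->data, safe, bp->size);
    }

    /* After I/O: roll forward and mark for another write. */
    static void
    write_done(struct xbuf *bp)
    {
        memcpy(bp->data, bp->saved, bp->size);
        free(bp->saved);
        bp->saved = NULL;
        bp->dirty = 1;              /* real pointers still unwritten */
    }

    int
    main(void)
    {
        char blk[8] = "newptrs";
        struct xbuf bp = { blk, sizeof(blk), NULL, 0 };

        start_write(&bp, "oldptrs"); /* disk sees only safe data */
        write_done(&bp);
        printf("%s dirty=%d\n", bp.data, bp.dirty);
        return (0);
    }
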
+/*
+ * Called when an inode has been cleared in a cg bitmap. This finally
+ * eliminates any canceled jaddrefs
+ */
+void
+softdep_setup_inofree(mp, bp, ino, wkhd)
+ struct mount *mp;
+ struct buf *bp;
+ ino_t ino;
+ struct workhead *wkhd;
+{
+ struct worklist *wk, *wkn;
+ struct inodedep *inodedep;
+ uint8_t *inosused;
+ struct cg *cgp;
+ struct fs *fs;
+
+ ACQUIRE_LOCK(&lk);
+ fs = VFSTOUFS(mp)->um_fs;
+ cgp = (struct cg *)bp->b_data;
+ inosused = cg_inosused(cgp);
+ if (isset(inosused, ino % fs->fs_ipg))
+ panic("softdep_setup_inofree: inode %d not freed.", ino);
+ if (inodedep_lookup(mp, ino, 0, &inodedep))
+ panic("softdep_setup_inofree: ino %d has existing inodedep %p",
+ ino, inodedep);
+ if (wkhd) {
+ LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
+ if (wk->wk_type != D_JADDREF)
+ continue;
+ WORKLIST_REMOVE(wk);
+ /*
+ * We can free immediately even if the jaddref
+			 * isn't attached in a background write, as the
+			 * bitmaps are now reconciled.
+ */
+ wk->wk_state |= COMPLETE | ATTACHED;
+ free_jaddref(WK_JADDREF(wk));
+ }
+ jwork_move(&bp->b_dep, wkhd);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called via ffs_blkfree() after a set of frags has been cleared from a cg
+ * map. Any dependencies waiting for the write to clear are added to the
+ * buf's list and any jnewblks that are being canceled are discarded
+ * immediately.
+ */
+void
+softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
+ struct mount *mp;
+ struct buf *bp;
+ ufs2_daddr_t blkno;
+ int frags;
+ struct workhead *wkhd;
+{
+ struct jnewblk *jnewblk;
+ struct worklist *wk, *wkn;
+#ifdef SUJ_DEBUG
+ struct bmsafemap *bmsafemap;
+ struct fs *fs;
+ uint8_t *blksfree;
+ struct cg *cgp;
+ ufs2_daddr_t jstart;
+ ufs2_daddr_t jend;
+ ufs2_daddr_t end;
+ long bno;
+ int i;
+#endif
+
+ ACQUIRE_LOCK(&lk);
+ /*
+ * Detach any jnewblks which have been canceled. They must linger
+ * until the bitmap is cleared again by ffs_blkfree() to prevent
+ * an unjournaled allocation from hitting the disk.
+ */
+ if (wkhd) {
+ LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
+ if (wk->wk_type != D_JNEWBLK)
+ continue;
+ jnewblk = WK_JNEWBLK(wk);
+ KASSERT(jnewblk->jn_state & GOINGAWAY,
+ ("softdep_setup_blkfree: jnewblk not canceled."));
+ WORKLIST_REMOVE(wk);
+#ifdef SUJ_DEBUG
+ /*
+ * Assert that this block is free in the bitmap
+ * before we discard the jnewblk.
+ */
+ fs = VFSTOUFS(mp)->um_fs;
+ cgp = (struct cg *)bp->b_data;
+ blksfree = cg_blksfree(cgp);
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags;
+ i < jnewblk->jn_frags; i++) {
+ if (isset(blksfree, bno + i))
+ continue;
+ panic("softdep_setup_blkfree: not free");
+ }
+#endif
+ /*
+			 * Even if it's not attached, we can free immediately
+ * as the new bitmap is correct.
+ */
+ wk->wk_state |= COMPLETE | ATTACHED;
+ free_jnewblk(jnewblk);
+ }
+ /*
+		 * The buf must be locked by the caller, otherwise these could
+ * be added while it's being written and the write would
+ * complete them before they made it to disk.
+ */
+ jwork_move(&bp->b_dep, wkhd);
+ }
+
+#ifdef SUJ_DEBUG
+ /*
+ * Assert that we are not freeing a block which has an outstanding
+ * allocation dependency.
+ */
+ fs = VFSTOUFS(mp)->um_fs;
+ bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
+ end = blkno + frags;
+ LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
+ /*
+ * Don't match against blocks that will be freed when the
+ * background write is done.
+ */
+ if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
+ (COMPLETE | DEPCOMPLETE))
+ continue;
+ jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
+ jend = jnewblk->jn_blkno + jnewblk->jn_frags;
+ if ((blkno >= jstart && blkno < jend) ||
+ (end > jstart && end <= jend)) {
+ printf("state 0x%X %jd - %d %d dep %p\n",
+ jnewblk->jn_state, jnewblk->jn_blkno,
+ jnewblk->jn_oldfrags, jnewblk->jn_frags,
+ jnewblk->jn_newblk);
+ panic("softdep_setup_blkfree: "
+ "%jd-%jd(%d) overlaps with %jd-%jd",
+ blkno, end, frags, jstart, jend);
+ }
+ }
+#endif
+ FREE_LOCK(&lk);
+}
+
+static void
+initiate_write_bmsafemap(bmsafemap, bp)
+ struct bmsafemap *bmsafemap;
+ struct buf *bp; /* The cg block. */
+{
+ struct jaddref *jaddref;
+ struct jnewblk *jnewblk;
+ uint8_t *inosused;
+ uint8_t *blksfree;
+ struct cg *cgp;
+ struct fs *fs;
+ int cleared;
+ ino_t ino;
+ long bno;
+ int i;
+
+ if (bmsafemap->sm_state & IOSTARTED)
+ panic("initiate_write_bmsafemap: Already started\n");
+ bmsafemap->sm_state |= IOSTARTED;
+ /*
+ * Clear any inode allocations which are pending journal writes.
+ */
+ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ inosused = cg_inosused(cgp);
+ LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
+ ino = jaddref->ja_ino % fs->fs_ipg;
+ /*
+			 * If this is a background copy, the inode may not
+ * be marked used yet.
+ */
+ if (isset(inosused, ino)) {
+ if ((jaddref->ja_mode & IFMT) == IFDIR)
+ cgp->cg_cs.cs_ndir--;
+ cgp->cg_cs.cs_nifree++;
+ clrbit(inosused, ino);
+ jaddref->ja_state &= ~ATTACHED;
+ jaddref->ja_state |= UNDONE;
+ stat_jaddref++;
+ } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
+ panic("initiate_write_bmsafemap: inode %d "
+ "marked free", jaddref->ja_ino);
+ }
+ }
+ /*
+ * Clear any block allocations which are pending journal writes.
+ */
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ blksfree = cg_blksfree(cgp);
+ LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ cleared = 0;
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (isclr(blksfree, bno + i)) {
+ cleared = 1;
+ setbit(blksfree, bno + i);
+ }
+ }
+ /*
+ * We may not clear the block if it's a background
+ * copy. In that case there is no reason to detach
+ * it.
+ */
+ if (cleared) {
+ stat_jnewblk++;
+ jnewblk->jn_state &= ~ATTACHED;
+ jnewblk->jn_state |= UNDONE;
+ } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
+ panic("initiate_write_bmsafemap: block %jd "
+ "marked free", jnewblk->jn_blkno);
+ }
+ }
+ /*
+ * Move allocation lists to the written lists so they can be
+ * cleared once the block write is complete.
+ */
+ LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
+ inodedep, id_deps);
+ LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
+ newblk, nb_deps);
}
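
The closing LIST_SWAP calls above carry the whole trick: the pending dependency lists are handed to "written" lists in O(1), so a dependency added while the block is in flight lands on the now-empty pending list and cannot be completed by the wrong I/O. A userspace sketch, assuming a FreeBSD-style <sys/queue.h> that provides LIST_SWAP:

    #include <sys/queue.h>
    #include <stdio.h>

    struct dep {
        int id;
        LIST_ENTRY(dep) link;
    };
    LIST_HEAD(dephd, dep);

    int
    main(void)
    {
        struct dephd pending = LIST_HEAD_INITIALIZER(pending);
        struct dephd written = LIST_HEAD_INITIALIZER(written);
        struct dep a = { .id = 1 }, b = { .id = 2 }, *dp;

        LIST_INSERT_HEAD(&pending, &a, link);
        /* Write initiation: O(1) handoff of the whole list. */
        LIST_SWAP(&pending, &written, dep, link);
        /* A dependency arriving mid-write stays pending. */
        LIST_INSERT_HEAD(&pending, &b, link);
        /* Write completion: only handed-off entries complete. */
        LIST_FOREACH(dp, &written, link)
            printf("dep %d is now DEPCOMPLETE\n", dp->id);
        return (0);
    }
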
/*
@@ -4246,6 +8735,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* a request completion). It should be called early in this
* procedure, before the block is made available to other
* processes or other routines are called.
+ *
*/
static void
softdep_disk_write_complete(bp)
@@ -4254,12 +8744,7 @@ softdep_disk_write_complete(bp)
struct worklist *wk;
struct worklist *owk;
struct workhead reattach;
- struct newblk *newblk;
- struct allocindir *aip;
- struct allocdirect *adp;
- struct indirdep *indirdep;
- struct inodedep *inodedep;
- struct bmsafemap *bmsafemap;
+ struct buf *sbp;
/*
* If an error occurred while doing the write, then the data
@@ -4271,8 +8756,9 @@ softdep_disk_write_complete(bp)
/*
* This lock must not be released anywhere in this code segment.
*/
- ACQUIRE_LOCK(&lk);
+ sbp = NULL;
owk = NULL;
+ ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
WORKLIST_REMOVE(wk);
if (wk == owk)
@@ -4291,33 +8777,8 @@ softdep_disk_write_complete(bp)
continue;
case D_BMSAFEMAP:
- bmsafemap = WK_BMSAFEMAP(wk);
- while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
- newblk->nb_state |= DEPCOMPLETE;
- newblk->nb_bmsafemap = NULL;
- LIST_REMOVE(newblk, nb_deps);
- }
- while ((adp =
- LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- LIST_REMOVE(adp, ad_deps);
- handle_allocdirect_partdone(adp);
- }
- while ((aip =
- LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
- aip->ai_state |= DEPCOMPLETE;
- aip->ai_buf = NULL;
- LIST_REMOVE(aip, ai_deps);
- handle_allocindir_partdone(aip);
- }
- while ((inodedep =
- LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
- inodedep->id_state |= DEPCOMPLETE;
- LIST_REMOVE(inodedep, id_deps);
- inodedep->id_buf = NULL;
- }
- WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
+ WORKLIST_INSERT(&reattach, wk);
continue;
case D_MKDIR:
@@ -4325,35 +8786,45 @@ softdep_disk_write_complete(bp)
continue;
case D_ALLOCDIRECT:
- adp = WK_ALLOCDIRECT(wk);
- adp->ad_state |= COMPLETE;
- handle_allocdirect_partdone(adp);
+ wk->wk_state |= COMPLETE;
+ handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
continue;
case D_ALLOCINDIR:
- aip = WK_ALLOCINDIR(wk);
- aip->ai_state |= COMPLETE;
- handle_allocindir_partdone(aip);
+ wk->wk_state |= COMPLETE;
+ handle_allocindir_partdone(WK_ALLOCINDIR(wk));
continue;
case D_INDIRDEP:
- indirdep = WK_INDIRDEP(wk);
- if (indirdep->ir_state & GOINGAWAY)
- panic("disk_write_complete: indirdep gone");
- bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
- free(indirdep->ir_saveddata, M_INDIRDEP);
- indirdep->ir_saveddata = 0;
- indirdep->ir_state &= ~UNDONE;
- indirdep->ir_state |= ATTACHED;
- while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
- handle_allocindir_partdone(aip);
- if (aip == LIST_FIRST(&indirdep->ir_donehd))
- panic("disk_write_complete: not gone");
- }
- WORKLIST_INSERT(&reattach, wk);
- if ((bp->b_flags & B_DELWRI) == 0)
- stat_indir_blk_ptrs++;
- bdirty(bp);
+ if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
+ WORKLIST_INSERT(&reattach, wk);
+ continue;
+
+ case D_FREEBLKS:
+ wk->wk_state |= COMPLETE;
+ if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(wk, 1);
+ continue;
+
+ case D_FREEWORK:
+ handle_written_freework(WK_FREEWORK(wk));
+ break;
+
+ case D_FREEDEP:
+ free_freedep(WK_FREEDEP(wk));
+ continue;
+
+ case D_JSEGDEP:
+ free_jsegdep(WK_JSEGDEP(wk));
+ continue;
+
+ case D_JSEG:
+ handle_written_jseg(WK_JSEG(wk), bp);
+ continue;
+
+ case D_SBDEP:
+ if (handle_written_sbdep(WK_SBDEP(wk), bp))
+ WORKLIST_INSERT(&reattach, wk);
continue;
default:
@@ -4370,6 +8841,8 @@ softdep_disk_write_complete(bp)
WORKLIST_INSERT(&bp->b_dep, wk);
}
FREE_LOCK(&lk);
+ if (sbp)
+ brelse(sbp);
}
/*
@@ -4378,18 +8851,17 @@ softdep_disk_write_complete(bp)
* splbio interrupts blocked.
*/
static void
-handle_allocdirect_partdone(adp)
+handle_allocdirect_partdone(adp, wkhd)
struct allocdirect *adp; /* the completed allocdirect */
+	struct workhead *wkhd;		/* Work to do when the inode is written. */
{
struct allocdirectlst *listhead;
struct allocdirect *listadp;
struct inodedep *inodedep;
- long bsize, delay;
+ long bsize;
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- if (adp->ad_buf != NULL)
- panic("handle_allocdirect_partdone: dangling dep");
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
@@ -4439,25 +8911,27 @@ handle_allocdirect_partdone(adp)
return;
}
/*
- * If we have found the just finished dependency, then free
+ * If we have found the just finished dependency, then queue
* it along with anything that follows it that is complete.
- * If the inode still has a bitmap dependency, then it has
- * never been written to disk, hence the on-disk inode cannot
- * reference the old fragment so we can free it without delay.
+ * Since the pointer has not yet been written in the inode
+ * as the dependency prevents it, place the allocdirect on the
+ * bufwait list where it will be freed once the pointer is
+ * valid.
*/
- delay = (inodedep->id_state & DEPCOMPLETE);
+ if (wkhd == NULL)
+ wkhd = &inodedep->id_bufwait;
for (; adp; adp = listadp) {
listadp = TAILQ_NEXT(adp, ad_next);
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- free_allocdirect(listhead, adp, delay);
+ TAILQ_REMOVE(listhead, adp, ad_next);
+ WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
}
}
/*
- * Called from within softdep_disk_write_complete above. Note that
- * this routine is always called from interrupt level with further
- * splbio interrupts blocked.
+ * Called from within softdep_disk_write_complete above. This routine
+ * completes successfully written allocindirs.
*/
static void
handle_allocindir_partdone(aip)
@@ -4467,11 +8941,9 @@ handle_allocindir_partdone(aip)
if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- if (aip->ai_buf != NULL)
- panic("handle_allocindir_partdone: dangling dependency");
indirdep = aip->ai_indirdep;
+ LIST_REMOVE(aip, ai_next);
if (indirdep->ir_state & UNDONE) {
- LIST_REMOVE(aip, ai_next);
LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
return;
}
@@ -4481,13 +8953,130 @@ handle_allocindir_partdone(aip)
else
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
aip->ai_newblkno;
- LIST_REMOVE(aip, ai_next);
- if (aip->ai_freefrag != NULL)
- add_to_worklist(&aip->ai_freefrag->ff_list);
- WORKITEM_FREE(aip, D_ALLOCINDIR);
+ /*
+ * Await the pointer write before freeing the allocindir.
+ */
+ LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
}
/*
+ * Release segments held on a jwork list.
+ */
+static void
+handle_jwork(wkhd)
+ struct workhead *wkhd;
+{
+ struct worklist *wk;
+
+ while ((wk = LIST_FIRST(wkhd)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ switch (wk->wk_type) {
+ case D_JSEGDEP:
+ free_jsegdep(WK_JSEGDEP(wk));
+ continue;
+ default:
+ panic("handle_jwork: Unknown type %s\n",
+ TYPENAME(wk->wk_type));
+ }
+ }
+}
+
+/*
+ * Handle the bufwait list on an inode when it is safe to release items
+ * held there. This normally happens after an inode block is written but
+ * may be delayed and handled later if there are pending journal items that
+ * are not yet safe to be released.
+ */
+static struct freefile *
+handle_bufwait(inodedep, refhd)
+ struct inodedep *inodedep;
+ struct workhead *refhd;
+{
+ struct jaddref *jaddref;
+ struct freefile *freefile;
+ struct worklist *wk;
+
+ freefile = NULL;
+ while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ switch (wk->wk_type) {
+ case D_FREEFILE:
+ /*
+ * We defer adding freefile to the worklist
+ * until all other additions have been made to
+ * ensure that it will be done after all the
+ * old blocks have been freed.
+ */
+ if (freefile != NULL)
+ panic("handle_bufwait: freefile");
+ freefile = WK_FREEFILE(wk);
+ continue;
+
+ case D_MKDIR:
+ handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
+ continue;
+
+ case D_DIRADD:
+ diradd_inode_written(WK_DIRADD(wk), inodedep);
+ continue;
+
+ case D_FREEFRAG:
+ wk->wk_state |= COMPLETE;
+ if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(wk, 0);
+ continue;
+
+ case D_DIRREM:
+ wk->wk_state |= COMPLETE;
+ add_to_worklist(wk, 0);
+ continue;
+
+ case D_ALLOCDIRECT:
+ case D_ALLOCINDIR:
+ free_newblk(WK_NEWBLK(wk));
+ continue;
+
+ case D_JNEWBLK:
+ wk->wk_state |= COMPLETE;
+ free_jnewblk(WK_JNEWBLK(wk));
+ continue;
+
+ /*
+ * Save freed journal segments and add references on
+		 * the supplied list, which will delay their release
+ * until the cg bitmap is cleared on disk.
+ */
+ case D_JSEGDEP:
+ if (refhd == NULL)
+ free_jsegdep(WK_JSEGDEP(wk));
+ else
+ WORKLIST_INSERT(refhd, wk);
+ continue;
+
+ case D_JADDREF:
+ jaddref = WK_JADDREF(wk);
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ /*
+ * Transfer any jaddrefs to the list to be freed with
+ * the bitmap if we're handling a removed file.
+ */
+ if (refhd == NULL) {
+ wk->wk_state |= COMPLETE;
+ free_jaddref(jaddref);
+ } else
+ WORKLIST_INSERT(refhd, wk);
+ continue;
+
+ default:
+ panic("handle_bufwait: Unknown type %p(%s)",
+ wk, TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ }
+ return (freefile);
+}
+/*
* Called from within softdep_disk_write_complete above to restore
* in-memory inode block contents to their most up-to-date state. Note
* that this routine is always called from interrupt level with further
@@ -4498,12 +9087,17 @@ handle_written_inodeblock(inodedep, bp)
struct inodedep *inodedep;
struct buf *bp; /* buffer containing the inode block */
{
- struct worklist *wk, *filefree;
+ struct freefile *freefile;
struct allocdirect *adp, *nextadp;
struct ufs1_dinode *dp1 = NULL;
struct ufs2_dinode *dp2 = NULL;
+ struct workhead wkhd;
int hadchanges, fstype;
+ ino_t freelink;
+ LIST_INIT(&wkhd);
+ hadchanges = 0;
+ freefile = NULL;
if ((inodedep->id_state & IOSTARTED) == 0)
panic("handle_written_inodeblock: not started");
inodedep->id_state &= ~IOSTARTED;
@@ -4511,11 +9105,32 @@ handle_written_inodeblock(inodedep, bp)
fstype = UFS1;
dp1 = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
+ freelink = dp1->di_freelink;
} else {
fstype = UFS2;
dp2 = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
+ freelink = dp2->di_freelink;
+ }
+ /*
+	 * If we wrote a valid freelink pointer during the last write,
+ * record it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ if ((inon == NULL && freelink == 0) ||
+ (inon && inon->id_ino == freelink)) {
+ if (inon)
+ inon->id_state |= UNLINKPREV;
+ inodedep->id_state |= UNLINKNEXT;
+ } else
+ hadchanges = 1;
}
+ /* Leave this inodeblock dirty until it's in the list. */
+ if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED)
+ hadchanges = 1;
/*
* If we had to rollback the inode allocation because of
* bitmaps being incomplete, then simply restore it.
@@ -4524,6 +9139,7 @@ handle_written_inodeblock(inodedep, bp)
* corresponding updates written to disk.
*/
if (inodedep->id_savedino1 != NULL) {
+ hadchanges = 1;
if (fstype == UFS1)
*dp1 = *inodedep->id_savedino1;
else
@@ -4533,6 +9149,13 @@ handle_written_inodeblock(inodedep, bp)
if ((bp->b_flags & B_DELWRI) == 0)
stat_inode_bitmap++;
bdirty(bp);
+ /*
+		 * If the inode is clear here and GOINGAWAY, it will never
+ * be written. Process the bufwait and clear any pending
+ * work which may include the freefile.
+ */
+ if (inodedep->id_state & GOINGAWAY)
+ goto bufwait;
return (1);
}
inodedep->id_state |= COMPLETE;
@@ -4540,50 +9163,49 @@ handle_written_inodeblock(inodedep, bp)
* Roll forward anything that had to be rolled back before
* the inode could be updated.
*/
- hadchanges = 0;
for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED)
panic("handle_written_inodeblock: new entry");
if (fstype == UFS1) {
- if (adp->ad_lbn < NDADDR) {
- if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
+ if (adp->ad_offset < NDADDR) {
+ if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
panic("%s %s #%jd mismatch %d != %jd",
"handle_written_inodeblock:",
"direct pointer",
- (intmax_t)adp->ad_lbn,
- dp1->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ dp1->di_db[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
+ dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
} else {
- if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
+ if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
panic("%s: %s #%jd allocated as %d",
"handle_written_inodeblock",
"indirect pointer",
- (intmax_t)adp->ad_lbn - NDADDR,
- dp1->di_ib[adp->ad_lbn - NDADDR]);
- dp1->di_ib[adp->ad_lbn - NDADDR] =
+ (intmax_t)adp->ad_offset - NDADDR,
+ dp1->di_ib[adp->ad_offset - NDADDR]);
+ dp1->di_ib[adp->ad_offset - NDADDR] =
adp->ad_newblkno;
}
} else {
- if (adp->ad_lbn < NDADDR) {
- if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
+ if (adp->ad_offset < NDADDR) {
+ if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
panic("%s: %s #%jd %s %jd != %jd",
"handle_written_inodeblock",
"direct pointer",
- (intmax_t)adp->ad_lbn, "mismatch",
- (intmax_t)dp2->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset, "mismatch",
+ (intmax_t)dp2->di_db[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
+ dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
} else {
- if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
+ if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
panic("%s: %s #%jd allocated as %jd",
"handle_written_inodeblock",
"indirect pointer",
- (intmax_t)adp->ad_lbn - NDADDR,
+ (intmax_t)adp->ad_offset - NDADDR,
(intmax_t)
- dp2->di_ib[adp->ad_lbn - NDADDR]);
- dp2->di_ib[adp->ad_lbn - NDADDR] =
+ dp2->di_ib[adp->ad_offset - NDADDR]);
+ dp2->di_ib[adp->ad_offset - NDADDR] =
adp->ad_newblkno;
}
}
@@ -4595,13 +9217,13 @@ handle_written_inodeblock(inodedep, bp)
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED)
panic("handle_written_inodeblock: new entry");
- if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
+ if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
panic("%s: direct pointers #%jd %s %jd != %jd",
"handle_written_inodeblock",
- (intmax_t)adp->ad_lbn, "mismatch",
- (intmax_t)dp2->di_extb[adp->ad_lbn],
+ (intmax_t)adp->ad_offset, "mismatch",
+ (intmax_t)dp2->di_extb[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
+ dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
adp->ad_state &= ~UNDONE;
adp->ad_state |= ATTACHED;
hadchanges = 1;
@@ -4613,12 +9235,23 @@ handle_written_inodeblock(inodedep, bp)
*/
if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
panic("handle_written_inodeblock: bad size");
+ if (inodedep->id_savednlink > LINK_MAX)
+ panic("handle_written_inodeblock: Invalid link count "
+ "%d for inodedep %p", inodedep->id_savednlink, inodedep);
if (fstype == UFS1) {
+ if (dp1->di_nlink != inodedep->id_savednlink) {
+ dp1->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
if (dp1->di_size != inodedep->id_savedsize) {
dp1->di_size = inodedep->id_savedsize;
hadchanges = 1;
}
} else {
+ if (dp2->di_nlink != inodedep->id_savednlink) {
+ dp2->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
if (dp2->di_size != inodedep->id_savedsize) {
dp2->di_size = inodedep->id_savedsize;
hadchanges = 1;
@@ -4630,6 +9263,7 @@ handle_written_inodeblock(inodedep, bp)
}
inodedep->id_savedsize = -1;
inodedep->id_savedextsize = -1;
+ inodedep->id_savednlink = -1;
/*
* If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
@@ -4637,69 +9271,49 @@ handle_written_inodeblock(inodedep, bp)
*/
if (hadchanges)
bdirty(bp);
+bufwait:
/*
* Process any allocdirects that completed during the update.
*/
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
- handle_allocdirect_partdone(adp);
+ handle_allocdirect_partdone(adp, &wkhd);
if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
- handle_allocdirect_partdone(adp);
+ handle_allocdirect_partdone(adp, &wkhd);
/*
* Process deallocations that were held pending until the
* inode had been written to disk. Freeing of the inode
* is delayed until after all blocks have been freed to
* avoid creation of new <vfsid, inum, lbn> triples
- * before the old ones have been deleted.
+ * before the old ones have been deleted. Completely
+ * unlinked inodes are not processed until the unlinked
+ * inode list is written or the last reference is removed.
*/
- filefree = NULL;
- while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
- WORKLIST_REMOVE(wk);
- switch (wk->wk_type) {
-
- case D_FREEFILE:
- /*
- * We defer adding filefree to the worklist until
- * all other additions have been made to ensure
- * that it will be done after all the old blocks
- * have been freed.
- */
- if (filefree != NULL)
- panic("handle_written_inodeblock: filefree");
- filefree = wk;
- continue;
-
- case D_MKDIR:
- handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
- continue;
-
- case D_DIRADD:
- diradd_inode_written(WK_DIRADD(wk), inodedep);
- continue;
-
- case D_FREEBLKS:
- wk->wk_state |= COMPLETE;
- if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
- continue;
- /* -- fall through -- */
- case D_FREEFRAG:
- case D_DIRREM:
- add_to_worklist(wk);
- continue;
-
- case D_NEWDIRBLK:
- free_newdirblk(WK_NEWDIRBLK(wk));
- continue;
-
- default:
- panic("handle_written_inodeblock: Unknown type %s",
- TYPENAME(wk->wk_type));
- /* NOTREACHED */
+ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
+ freefile = handle_bufwait(inodedep, NULL);
+ if (freefile && !LIST_EMPTY(&wkhd)) {
+ WORKLIST_INSERT(&wkhd, &freefile->fx_list);
+ freefile = NULL;
}
}
- if (filefree != NULL) {
+ /*
+ * Move rolled forward dependency completions to the bufwait list
+ * now that those that were already written have been processed.
+ */
+ if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
+ panic("handle_written_inodeblock: bufwait but no changes");
+ jwork_move(&inodedep->id_bufwait, &wkhd);
+
+ if (freefile != NULL) {
+ /*
+ * If the inode is goingaway it was never written. Fake up
+ * the state here so free_inodedep() can succeed.
+ */
+ if (inodedep->id_state & GOINGAWAY)
+ inodedep->id_state |= COMPLETE | DEPCOMPLETE;
if (free_inodedep(inodedep) == 0)
- panic("handle_written_inodeblock: live inodedep");
- add_to_worklist(filefree);
+ panic("handle_written_inodeblock: live inodedep %p",
+ inodedep);
+ add_to_worklist(&freefile->fx_list, 0);
return (0);
}
@@ -4707,12 +9321,101 @@ handle_written_inodeblock(inodedep, bp)
* If no outstanding dependencies, free it.
*/
if (free_inodedep(inodedep) ||
- (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
- TAILQ_FIRST(&inodedep->id_extupdt) == 0))
+ (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
+ TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
+ TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
+ LIST_FIRST(&inodedep->id_bufwait) == 0))
return (0);
return (hadchanges);
}
+static int
+handle_written_indirdep(indirdep, bp, bpp)
+ struct indirdep *indirdep;
+ struct buf *bp;
+ struct buf **bpp;
+{
+ struct allocindir *aip;
+ int chgs;
+
+ if (indirdep->ir_state & GOINGAWAY)
+		panic("handle_written_indirdep: indirdep gone");
+ chgs = 0;
+ /*
+ * If there were rollbacks revert them here.
+ */
+ if (indirdep->ir_saveddata) {
+ bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
+ free(indirdep->ir_saveddata, M_INDIRDEP);
+ indirdep->ir_saveddata = 0;
+ chgs = 1;
+ }
+ indirdep->ir_state &= ~UNDONE;
+ indirdep->ir_state |= ATTACHED;
+ /*
+	 * the indirdep's pointer is not yet written. Otherwise
+ * the the indirdep's pointer is not yet written. Otherwise
+ * free them here.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
+ LIST_REMOVE(aip, ai_next);
+ if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
+ LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
+ ai_next);
+ continue;
+ }
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * Move allocindirs that have finished dependency processing from
+ * the done list to the write list after updating the pointers.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
+ handle_allocindir_partdone(aip);
+ if (aip == LIST_FIRST(&indirdep->ir_donehd))
+			panic("handle_written_indirdep: not gone");
+ chgs = 1;
+ }
+ /*
+ * If this indirdep has been detached from its newblk during
+ * I/O we need to keep this dep attached to the buffer so
+ * deallocate_dependencies can find it and properly resolve
+ * any outstanding dependencies.
+ */
+ if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
+ chgs = 1;
+ if ((bp->b_flags & B_DELWRI) == 0)
+ stat_indir_blk_ptrs++;
+ /*
+ * If there were no changes we can discard the savedbp and detach
+ * ourselves from the buf. We are only carrying completed pointers
+ * in this case.
+ */
+ if (chgs == 0) {
+ struct buf *sbp;
+
+ sbp = indirdep->ir_savebp;
+ sbp->b_flags |= B_INVAL | B_NOCACHE;
+ indirdep->ir_savebp = NULL;
+ if (*bpp != NULL)
+ panic("handle_written_indirdep: bp already exists.");
+ *bpp = sbp;
+ } else
+ bdirty(bp);
+ /*
+ * If there are no fresh dependencies and none waiting on writes
+ * we can free the indirdep.
+ */
+ if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
+ if (indirdep->ir_state & ONDEPLIST)
+ LIST_REMOVE(indirdep, ir_next);
+ free_indirdep(indirdep);
+ return (0);
+ }
+
+ return (chgs);
+}
+
/*
* Process a diradd entry after its dependent inode has been written.
* This routine must be called with splbio interrupts blocked.
@@ -4722,50 +9425,200 @@ diradd_inode_written(dap, inodedep)
struct diradd *dap;
struct inodedep *inodedep;
{
- struct pagedep *pagedep;
dap->da_state |= COMPLETE;
- if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
- if (dap->da_state & DIRCHG)
- pagedep = dap->da_previous->dm_pagedep;
- else
- pagedep = dap->da_pagedep;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
- }
+ complete_diradd(dap);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
- * Handle the completion of a mkdir dependency.
+ * Returns true if the bmsafemap will have rollbacks when written. Must
+ * only be called with lk and the buf lock on the cg held.
+ */
+static int
+bmsafemap_rollbacks(bmsafemap)
+ struct bmsafemap *bmsafemap;
+{
+
+ return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
+ !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
+}
+
+/*
+ * Complete a write to a bmsafemap structure. Roll forward any bitmap
+ * changes if it's not a background write. Set all written dependencies
+ * to DEPCOMPLETE and free the structure if possible.
+ */
+static int
+handle_written_bmsafemap(bmsafemap, bp)
+ struct bmsafemap *bmsafemap;
+ struct buf *bp;
+{
+ struct newblk *newblk;
+ struct inodedep *inodedep;
+ struct jaddref *jaddref, *jatmp;
+ struct jnewblk *jnewblk, *jntmp;
+ uint8_t *inosused;
+ uint8_t *blksfree;
+ struct cg *cgp;
+ struct fs *fs;
+ ino_t ino;
+ long bno;
+ int chgs;
+ int i;
+
+ if ((bmsafemap->sm_state & IOSTARTED) == 0)
+		panic("handle_written_bmsafemap: Not started");
+ chgs = 0;
+ bmsafemap->sm_state &= ~IOSTARTED;
+ /*
+ * Restore unwritten inode allocation pending jaddref writes.
+ */
+ if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ inosused = cg_inosused(cgp);
+ LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
+ ja_bmdeps, jatmp) {
+ if ((jaddref->ja_state & UNDONE) == 0)
+ continue;
+ ino = jaddref->ja_ino % fs->fs_ipg;
+ if (isset(inosused, ino))
+ panic("handle_written_bmsafemap: "
+ "re-allocated inode");
+ if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
+ if ((jaddref->ja_mode & IFMT) == IFDIR)
+ cgp->cg_cs.cs_ndir++;
+ cgp->cg_cs.cs_nifree--;
+ setbit(inosused, ino);
+ chgs = 1;
+ }
+ jaddref->ja_state &= ~UNDONE;
+ jaddref->ja_state |= ATTACHED;
+ free_jaddref(jaddref);
+ }
+ }
+ /*
+ * Restore any block allocations which are pending journal writes.
+ */
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ blksfree = cg_blksfree(cgp);
+ LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
+ jntmp) {
+ if ((jnewblk->jn_state & UNDONE) == 0)
+ continue;
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (bp->b_xflags & BX_BKGRDMARKER)
+ break;
+ if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
+ isclr(blksfree, bno + i))
+ panic("handle_written_bmsafemap: "
+ "re-allocated fragment");
+ clrbit(blksfree, bno + i);
+ chgs = 1;
+ }
+ jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
+ jnewblk->jn_state |= ATTACHED;
+ free_jnewblk(jnewblk);
+ }
+ }
+ while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
+ newblk->nb_state |= DEPCOMPLETE;
+ newblk->nb_state &= ~ONDEPLIST;
+ newblk->nb_bmsafemap = NULL;
+ LIST_REMOVE(newblk, nb_deps);
+ if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
+ handle_allocdirect_partdone(
+ WK_ALLOCDIRECT(&newblk->nb_list), NULL);
+ else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
+ handle_allocindir_partdone(
+ WK_ALLOCINDIR(&newblk->nb_list));
+ else if (newblk->nb_list.wk_type != D_NEWBLK)
+ panic("handle_written_bmsafemap: Unexpected type: %s",
+ TYPENAME(newblk->nb_list.wk_type));
+ }
+ while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
+ inodedep->id_state |= DEPCOMPLETE;
+ inodedep->id_state &= ~ONDEPLIST;
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_bmsafemap = NULL;
+ }
+ if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
+ LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
+ LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
+ LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
+ if (chgs)
+ bdirty(bp);
+ LIST_REMOVE(bmsafemap, sm_hash);
+ WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ return (0);
+ }
+ bdirty(bp);
+ return (1);
+}
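
/*
 * [Editorial sketch, not part of this change] A timeline inferred from
 * the UNDONE handling above: if the cylinder-group buffer is written
 * while the journal records for new allocations are still unwritten,
 * initiate_write_bmsafemap() backs those allocations out of the
 * on-disk copy and marks the jaddref/jnewblk UNDONE.  When the write
 * completes, the loops above roll each one forward again, e.g. for an
 * inode allocation:
 *
 *	setbit(inosused, ino);		restore the bitmap bit
 *	cgp->cg_cs.cs_nifree--;		restore the summary count
 *
 * and chgs is set so the buffer is redirtied and the true map
 * eventually reaches disk.
 */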
+
+/*
+ * Try to free a mkdir dependency.
*/
static void
-handle_written_mkdir(mkdir, type)
+complete_mkdir(mkdir)
struct mkdir *mkdir;
- int type;
{
struct diradd *dap;
- struct pagedep *pagedep;
- if (mkdir->md_state != type)
- panic("handle_written_mkdir: bad type");
+ if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
+ return;
+ LIST_REMOVE(mkdir, md_mkdirs);
dap = mkdir->md_diradd;
- dap->da_state &= ~type;
- if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
+ dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
dap->da_state |= DEPCOMPLETE;
- if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
- if (dap->da_state & DIRCHG)
- pagedep = dap->da_previous->dm_pagedep;
- else
- pagedep = dap->da_pagedep;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
+ complete_diradd(dap);
}
- LIST_REMOVE(mkdir, md_mkdirs);
WORKITEM_FREE(mkdir, D_MKDIR);
}
/*
+ * Handle the completion of a mkdir dependency.
+ */
+static void
+handle_written_mkdir(mkdir, type)
+ struct mkdir *mkdir;
+ int type;
+{
+
+ if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
+ panic("handle_written_mkdir: bad type");
+ mkdir->md_state |= COMPLETE;
+ complete_mkdir(mkdir);
+}
+
+static void
+free_pagedep(pagedep)
+ struct pagedep *pagedep;
+{
+ int i;
+
+ if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
+ return;
+ for (i = 0; i < DAHASHSZ; i++)
+ if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_dirremhd))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_pendinghd))
+ return;
+ LIST_REMOVE(pagedep, pd_hash);
+ WORKITEM_FREE(pagedep, D_PAGEDEP);
+}
+
+/*
* Called from within softdep_disk_write_complete above.
* A write operation was just completed. Removed inodes can
* now be freed and associated block pointers may be committed.
@@ -4790,8 +9643,11 @@ handle_written_filepage(pagedep, bp)
*/
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
LIST_REMOVE(dirrem, dm_next);
+ dirrem->dm_state |= COMPLETE;
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
+ ("handle_written_filepage: Journal entries not written."));
+ add_to_worklist(&dirrem->dm_list, 0);
}
/*
* Free any directory additions that have been committed.
@@ -4800,7 +9656,7 @@ handle_written_filepage(pagedep, bp)
*/
if ((pagedep->pd_state & NEWBLOCK) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
- free_diradd(dap);
+ free_diradd(dap, NULL);
/*
* Uncommitted directory entries must be restored.
*/
@@ -4845,7 +9701,8 @@ handle_written_filepage(pagedep, bp)
* Otherwise it will remain to track any new entries on
* the page in case they are fsync'ed.
*/
- if ((pagedep->pd_state & NEWBLOCK) == 0) {
+ if ((pagedep->pd_state & NEWBLOCK) == 0 &&
+ LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
@@ -4880,8 +9737,8 @@ softdep_load_inodeblock(ip)
*/
ip->i_effnlink = ip->i_nlink;
ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(UFSTOVFS(ip->i_ump),
- ip->i_number, 0, &inodedep) == 0) {
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0) {
FREE_LOCK(&lk);
return;
}
@@ -4908,11 +9765,26 @@ softdep_update_inodeblock(ip, bp, waitfor)
int waitfor; /* nonzero => update must be allowed */
{
struct inodedep *inodedep;
+ struct inoref *inoref;
struct worklist *wk;
struct mount *mp;
struct buf *ibp;
+ struct fs *fs;
int error;
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_fs;
+ /*
+ * Preserve the freelink that is on disk. clear_unlinked_inodedep()
+ * does not have access to the in-core ip so must write directly into
+ * the inode block buffer when setting freelink.
+ */
+ if (fs->fs_magic == FS_UFS1_MAGIC)
+ DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number))->di_freelink);
+ else
+ DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number))->di_freelink);
/*
* If the effective link count is not equal to the actual link
* count, then we must track the difference in an inodedep while
@@ -4920,8 +9792,8 @@ softdep_update_inodeblock(ip, bp, waitfor)
* if there is no existing inodedep, then there are no dependencies
* to track.
*/
- mp = UFSTOVFS(ip->i_ump);
ACQUIRE_LOCK(&lk);
+again:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
if (ip->i_effnlink != ip->i_nlink)
@@ -4931,6 +9803,20 @@ softdep_update_inodeblock(ip, bp, waitfor)
if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
/*
+ * If we're flushing all dependencies we must also move any waiting
+ * for journal writes onto the bufwait list prior to I/O.
+ */
+ if (waitfor) {
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto again;
+ }
+ }
+ }
+ /*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
*/
@@ -4945,10 +9831,12 @@ softdep_update_inodeblock(ip, bp, waitfor)
*/
merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
- handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
+ handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
+ NULL);
merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
if (!TAILQ_EMPTY(&inodedep->id_extupdt))
- handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
+ handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
+ NULL);
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
@@ -4971,11 +9859,11 @@ softdep_update_inodeblock(ip, bp, waitfor)
return;
}
retry:
- if ((inodedep->id_state & DEPCOMPLETE) != 0) {
+ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
FREE_LOCK(&lk);
return;
}
- ibp = inodedep->id_buf;
+ ibp = inodedep->id_bmsafemap->sm_buf;
ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
if (ibp == NULL) {
/*
@@ -5007,13 +9895,13 @@ merge_inode_lists(newlisthead, oldlisthead)
newadp = TAILQ_FIRST(newlisthead);
for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
- if (listadp->ad_lbn < newadp->ad_lbn) {
+ if (listadp->ad_offset < newadp->ad_offset) {
listadp = TAILQ_NEXT(listadp, ad_next);
continue;
}
TAILQ_REMOVE(newlisthead, newadp, ad_next);
TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
- if (listadp->ad_lbn == newadp->ad_lbn) {
+ if (listadp->ad_offset == newadp->ad_offset) {
allocdirect_merge(oldlisthead, newadp,
listadp);
listadp = newadp;
@@ -5036,6 +9924,7 @@ softdep_fsync(vp)
{
struct inodedep *inodedep;
struct pagedep *pagedep;
+ struct inoref *inoref;
struct worklist *wk;
struct diradd *dap;
struct mount *mp;
@@ -5052,17 +9941,25 @@ softdep_fsync(vp)
fs = ip->i_fs;
mp = vp->v_mount;
ACQUIRE_LOCK(&lk);
+restart:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
return (0);
}
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (!LIST_EMPTY(&inodedep->id_inowait) ||
- !LIST_EMPTY(&inodedep->id_bufwait) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
!TAILQ_EMPTY(&inodedep->id_newinoupdt))
- panic("softdep_fsync: pending ops");
+ panic("softdep_fsync: pending ops %p", inodedep);
for (error = 0, flushparent = 0; ; ) {
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
break;
@@ -5254,8 +10151,8 @@ int
softdep_sync_metadata(struct vnode *vp)
{
struct pagedep *pagedep;
- struct allocdirect *adp;
struct allocindir *aip;
+ struct newblk *newblk;
struct buf *bp, *nbp;
struct worklist *wk;
struct bufobj *bo;
@@ -5319,27 +10216,16 @@ loop:
switch (wk->wk_type) {
case D_ALLOCDIRECT:
- adp = WK_ALLOCDIRECT(wk);
- if (adp->ad_state & DEPCOMPLETE)
- continue;
- nbp = adp->ad_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
case D_ALLOCINDIR:
- aip = WK_ALLOCINDIR(wk);
- if (aip->ai_state & DEPCOMPLETE)
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ goto restart;
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- nbp = aip->ai_buf;
+ nbp = newblk->nb_bmsafemap->sm_buf;
nbp = getdirtybuf(nbp, &lk, waitfor);
if (nbp == NULL)
continue;
@@ -5355,10 +10241,17 @@ loop:
case D_INDIRDEP:
restart:
- LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
- if (aip->ai_state & DEPCOMPLETE)
+ LIST_FOREACH(aip,
+ &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
+ newblk = (struct newblk *)aip;
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ goto restart;
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- nbp = aip->ai_buf;
+ nbp = newblk->nb_bmsafemap->sm_buf;
nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
if (nbp == NULL)
goto restart;
@@ -5371,14 +10264,6 @@ loop:
}
continue;
- case D_INODEDEP:
- if ((error = flush_inodedep_deps(wk->wk_mp,
- WK_INODEDEP(wk)->id_ino)) != 0) {
- FREE_LOCK(&lk);
- break;
- }
- continue;
-
case D_PAGEDEP:
/*
* We are trying to sync a directory that may
@@ -5400,48 +10285,6 @@ loop:
}
continue;
- case D_MKDIR:
- /*
- * This case should never happen if the vnode has
- * been properly sync'ed. However, if this function
- * is used at a place where the vnode has not yet
- * been sync'ed, this dependency can show up. So,
- * rather than panic, just flush it.
- */
- nbp = WK_MKDIR(wk)->md_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
- case D_BMSAFEMAP:
- /*
- * This case should never happen if the vnode has
- * been properly sync'ed. However, if this function
- * is used at a place where the vnode has not yet
- * been sync'ed, this dependency can show up. So,
- * rather than panic, just flush it.
- */
- nbp = WK_BMSAFEMAP(wk)->sm_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
default:
panic("softdep_sync_metadata: Unknown type %s",
TYPENAME(wk->wk_type));
@@ -5489,7 +10332,8 @@ loop:
BO_LOCK(bo);
drain_output(vp);
BO_UNLOCK(bo);
- return (0);
+	return (ffs_update(vp, 1));
}
/*
@@ -5502,6 +10346,7 @@ flush_inodedep_deps(mp, ino)
ino_t ino;
{
struct inodedep *inodedep;
+ struct inoref *inoref;
int error, waitfor;
/*
@@ -5522,8 +10367,17 @@ flush_inodedep_deps(mp, ino)
return (error);
FREE_LOCK(&lk);
ACQUIRE_LOCK(&lk);
+restart:
if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
return (0);
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
@@ -5555,13 +10409,20 @@ flush_deplist(listhead, waitfor, errorp)
int *errorp;
{
struct allocdirect *adp;
+ struct newblk *newblk;
struct buf *bp;
mtx_assert(&lk, MA_OWNED);
TAILQ_FOREACH(adp, listhead, ad_next) {
- if (adp->ad_state & DEPCOMPLETE)
+ newblk = (struct newblk *)adp;
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ return (1);
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- bp = adp->ad_buf;
+ bp = newblk->nb_bmsafemap->sm_buf;
bp = getdirtybuf(bp, &lk, waitfor);
if (bp == NULL) {
if (waitfor == MNT_NOWAIT)
@@ -5582,6 +10443,101 @@ flush_deplist(listhead, waitfor, errorp)
}
/*
+ * Flush dependencies associated with an allocdirect block.
+ */
+static int
+flush_newblk_dep(vp, mp, lbn)
+ struct vnode *vp;
+ struct mount *mp;
+ ufs_lbn_t lbn;
+{
+ struct newblk *newblk;
+ struct bufobj *bo;
+ struct inode *ip;
+ struct buf *bp;
+ ufs2_daddr_t blkno;
+ int error;
+
+ error = 0;
+ bo = &vp->v_bufobj;
+ ip = VTOI(vp);
+ blkno = DIP(ip, i_db[lbn]);
+ if (blkno == 0)
+ panic("flush_newblk_dep: Missing block");
+ ACQUIRE_LOCK(&lk);
+ /*
+ * Loop until all dependencies related to this block are satisfied.
+ * We must be careful to restart after each sleep in case a write
+ * completes some part of this process for us.
+ */
+ for (;;) {
+ if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
+ FREE_LOCK(&lk);
+ break;
+ }
+ if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
+			panic("flush_newblk_dep: Bad newblk %p", newblk);
+ /*
+ * Flush the journal.
+ */
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ continue;
+ }
+ /*
+ * Write the bitmap dependency.
+ */
+ if ((newblk->nb_state & DEPCOMPLETE) == 0) {
+ bp = newblk->nb_bmsafemap->sm_buf;
+ bp = getdirtybuf(bp, &lk, MNT_WAIT);
+ if (bp == NULL)
+ continue;
+ FREE_LOCK(&lk);
+ error = bwrite(bp);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
+ continue;
+ }
+ /*
+ * Write the buffer.
+ */
+ FREE_LOCK(&lk);
+ BO_LOCK(bo);
+ bp = gbincore(bo, lbn);
+ if (bp != NULL) {
+ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
+ LK_INTERLOCK, BO_MTX(bo));
+ if (error == ENOLCK) {
+ ACQUIRE_LOCK(&lk);
+ continue; /* Slept, retry */
+ }
+ if (error != 0)
+ break; /* Failed */
+ if (bp->b_flags & B_DELWRI) {
+ bremfree(bp);
+ error = bwrite(bp);
+ if (error)
+ break;
+ } else
+ BUF_UNLOCK(bp);
+ } else
+ BO_UNLOCK(bo);
+ /*
+ * We have to wait for the direct pointers to
+ * point at the newdirblk before the dependency
+ * will go away.
+ */
+ error = ffs_update(vp, MNT_WAIT);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
+ }
+ return (error);
+}
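
/*
 * [Editorial usage note] flush_pagedep_deps() below invokes
 * flush_newblk_dep(vp, mp, 0) to push a new directory's first block
 * (holding "." and "..") to stable storage before the directory's
 * name is committed in its parent, replacing the old scheme of two
 * ffs_syncvnode() passes over the whole vnode.
 */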
+
+/*
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
* Called with splbio blocked.
*/
@@ -5592,16 +10548,16 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
struct diraddhd *diraddhdp;
{
struct inodedep *inodedep;
+ struct inoref *inoref;
struct ufsmount *ump;
struct diradd *dap;
struct vnode *vp;
- struct bufobj *bo;
int error = 0;
struct buf *bp;
ino_t inum;
- struct worklist *wk;
ump = VFSTOUFS(mp);
+restart:
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
/*
* Flush ourselves if this directory entry
@@ -5609,7 +10565,7 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
*/
if (dap->da_state & MKDIR_PARENT) {
FREE_LOCK(&lk);
- if ((error = ffs_update(pvp, 1)) != 0)
+ if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
break;
ACQUIRE_LOCK(&lk);
/*
@@ -5623,84 +10579,52 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
/*
* A newly allocated directory must have its "." and
* ".." entries written out before its name can be
- * committed in its parent. We do not want or need
- * the full semantics of a synchronous ffs_syncvnode as
- * that may end up here again, once for each directory
- * level in the filesystem. Instead, we push the blocks
- * and wait for them to clear. We have to fsync twice
- * because the first call may choose to defer blocks
- * that still have dependencies, but deferral will
- * happen at most once.
+ * committed in its parent.
*/
inum = dap->da_newinum;
+ if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
+ panic("flush_pagedep_deps: lost inode1");
+ /*
+ * Wait for any pending journal adds to complete so we don't
+ * cause rollbacks while syncing.
+ */
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(&lk);
if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
FFSV_FORCEINSMQ)))
break;
- if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
- (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
- vput(vp);
- break;
- }
- bo = &vp->v_bufobj;
- BO_LOCK(bo);
- drain_output(vp);
+ error = flush_newblk_dep(vp, mp, 0);
/*
- * If first block is still dirty with a D_MKDIR
- * dependency then it needs to be written now.
+ * If we still have the dependency we might need to
+ * update the vnode to sync the new link count to
+ * disk.
*/
- for (;;) {
- error = 0;
- bp = gbincore(bo, 0);
- if (bp == NULL)
- break; /* First block not present */
- error = BUF_LOCK(bp,
- LK_EXCLUSIVE |
- LK_SLEEPFAIL |
- LK_INTERLOCK,
- BO_MTX(bo));
- BO_LOCK(bo);
- if (error == ENOLCK)
- continue; /* Slept, retry */
- if (error != 0)
- break; /* Failed */
- if ((bp->b_flags & B_DELWRI) == 0) {
- BUF_UNLOCK(bp);
- break; /* Buffer not dirty */
- }
- for (wk = LIST_FIRST(&bp->b_dep);
- wk != NULL;
- wk = LIST_NEXT(wk, wk_list))
- if (wk->wk_type == D_MKDIR)
- break;
- if (wk == NULL)
- BUF_UNLOCK(bp); /* Dependency gone */
- else {
- /*
- * D_MKDIR dependency remains,
- * must write buffer to stable
- * storage.
- */
- BO_UNLOCK(bo);
- bremfree(bp);
- error = bwrite(bp);
- BO_LOCK(bo);
- }
- break;
- }
- BO_UNLOCK(bo);
+ if (error == 0 && dap == LIST_FIRST(diraddhdp))
+ error = ffs_update(vp, MNT_WAIT);
vput(vp);
if (error != 0)
- break; /* Flushing of first block failed */
+ break;
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
*/
if (dap != LIST_FIRST(diraddhdp))
continue;
- if (dap->da_state & MKDIR_BODY)
- panic("flush_pagedep_deps: MKDIR_BODY");
+ if (dap->da_state & MKDIR_BODY) {
+ inodedep_lookup(UFSTOVFS(ump), inum, 0,
+ &inodedep);
+ panic("flush_pagedep_deps: MKDIR_BODY "
+ "inodedep %p dap %p vp %p",
+ inodedep, dap, vp);
+ }
}
/*
* Flush the inode on which the directory entry depends.
@@ -5719,8 +10643,8 @@ retry:
* If the inode still has bitmap dependencies,
* push them to disk.
*/
- if ((inodedep->id_state & DEPCOMPLETE) == 0) {
- bp = inodedep->id_buf;
+ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
+ bp = inodedep->id_bmsafemap->sm_buf;
bp = getdirtybuf(bp, &lk, MNT_WAIT);
if (bp == NULL)
goto retry;
@@ -5733,24 +10657,29 @@ retry:
}
/*
* If the inode is still sitting in a buffer waiting
- * to be written, push it to disk.
+ * to be written or waiting for the link count to be
+ * adjusted update it here to flush it to disk.
*/
- FREE_LOCK(&lk);
- if ((error = bread(ump->um_devvp,
- fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
- (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
- brelse(bp);
- break;
+ if (dap == LIST_FIRST(diraddhdp)) {
+ FREE_LOCK(&lk);
+ if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
+ FFSV_FORCEINSMQ)))
+ break;
+ error = ffs_update(vp, MNT_WAIT);
+ vput(vp);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
}
- if ((error = bwrite(bp)) != 0)
- break;
- ACQUIRE_LOCK(&lk);
/*
* If we have failed to get rid of all the dependencies
* then something is seriously wrong.
*/
- if (dap == LIST_FIRST(diraddhdp))
- panic("flush_pagedep_deps: flush failed");
+ if (dap == LIST_FIRST(diraddhdp)) {
+ inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
+ panic("flush_pagedep_deps: failed to flush "
+ "inodedep %p ino %d dap %p", inodedep, inum, dap);
+ }
}
if (error)
ACQUIRE_LOCK(&lk);
@@ -5828,6 +10757,7 @@ softdep_request_cleanup(fs, vp)
return (0);
UFS_UNLOCK(ump);
ACQUIRE_LOCK(&lk);
+ process_removes(vp);
if (ump->softdep_on_worklist > 0 &&
process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
stat_worklist_push += 1;
@@ -6100,10 +11030,15 @@ softdep_count_dependencies(bp, wantcount)
int wantcount;
{
struct worklist *wk;
+ struct bmsafemap *bmsafemap;
struct inodedep *inodedep;
struct indirdep *indirdep;
+ struct freeblks *freeblks;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct dirrem *dirrem;
+ struct newblk *newblk;
+ struct mkdir *mkdir;
struct diradd *dap;
int i, retval;
@@ -6132,6 +11067,12 @@ softdep_count_dependencies(bp, wantcount)
if (!wantcount)
goto out;
}
+ if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
+ /* Add reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
continue;
case D_INDIRDEP:
@@ -6147,6 +11088,14 @@ softdep_count_dependencies(bp, wantcount)
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
+ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
+ if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
+ /* Journal remove ref dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ }
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
@@ -6159,14 +11108,62 @@ softdep_count_dependencies(bp, wantcount)
continue;
case D_BMSAFEMAP:
+ bmsafemap = WK_BMSAFEMAP(wk);
+ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
+ /* Add reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
+ /* Allocate block dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_FREEBLKS:
+ freeblks = WK_FREEBLKS(wk);
+ if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
+ /* Freeblk journal dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
case D_ALLOCDIRECT:
case D_ALLOCINDIR:
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk) {
+ /* Journal allocate dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
case D_MKDIR:
+ mkdir = WK_MKDIR(wk);
+ if (mkdir->md_jaddref) {
+ /* Journal reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_FREEWORK:
+ case D_FREEDEP:
+ case D_JSEGDEP:
+ case D_JSEG:
+ case D_SBDEP:
/* never a dependency on these blocks */
continue;
default:
- panic("softdep_check_for_rollback: Unexpected type %s",
+ panic("softdep_count_dependencies: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
@@ -6382,6 +11379,45 @@ softdep_error(func, error)
#ifdef DDB
+static void
+inodedep_print(struct inodedep *inodedep, int verbose)
+{
+ db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
+ " saveino %p\n",
+ inodedep, inodedep->id_fs, inodedep->id_state,
+ (intmax_t)inodedep->id_ino,
+ (intmax_t)fsbtodb(inodedep->id_fs,
+ ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
+ inodedep->id_nlinkdelta, inodedep->id_savednlink,
+ inodedep->id_savedino1);
+
+ if (verbose == 0)
+ return;
+
+ db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
+ "mkdiradd %p\n",
+ LIST_FIRST(&inodedep->id_pendinghd),
+ LIST_FIRST(&inodedep->id_bufwait),
+ LIST_FIRST(&inodedep->id_inowait),
+ TAILQ_FIRST(&inodedep->id_inoreflst),
+ inodedep->id_mkdiradd);
+ db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
+ TAILQ_FIRST(&inodedep->id_inoupdt),
+ TAILQ_FIRST(&inodedep->id_newinoupdt),
+ TAILQ_FIRST(&inodedep->id_extupdt),
+ TAILQ_FIRST(&inodedep->id_newextupdt));
+}
+
+DB_SHOW_COMMAND(inodedep, db_show_inodedep)
+{
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+	inodedep_print((struct inodedep *)addr, 1);
+}
+
DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
struct inodedep_hashhead *inodedephd;
@@ -6395,15 +11431,62 @@ DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
LIST_FOREACH(inodedep, inodedephd, id_hash) {
if (fs != NULL && fs != inodedep->id_fs)
continue;
- db_printf("%p fs %p st %x ino %jd inoblk %jd\n",
- inodedep, inodedep->id_fs, inodedep->id_state,
- (intmax_t)inodedep->id_ino,
- (intmax_t)fsbtodb(inodedep->id_fs,
- ino_to_fsba(inodedep->id_fs, inodedep->id_ino)));
+ inodedep_print(inodedep, 0);
}
}
}
+DB_SHOW_COMMAND(worklist, db_show_worklist)
+{
+ struct worklist *wk;
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+ wk = (struct worklist *)addr;
+	db_printf("worklist: %p type %s state 0x%X\n",
+ wk, TYPENAME(wk->wk_type), wk->wk_state);
+}
+
+DB_SHOW_COMMAND(workhead, db_show_workhead)
+{
+ struct workhead *wkhd;
+ struct worklist *wk;
+ int i;
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+ wkhd = (struct workhead *)addr;
+ wk = LIST_FIRST(wkhd);
+	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
+		db_printf("worklist: %p type %s state 0x%X\n",
+		    wk, TYPENAME(wk->wk_type), wk->wk_state);
+	if (i == 100)
+		db_printf("workhead overflow\n");
+}
+
+DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
+{
+ struct jaddref *jaddref;
+ struct diradd *diradd;
+ struct mkdir *mkdir;
+
+ LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
+ diradd = mkdir->md_diradd;
+ db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
+ mkdir, mkdir->md_state, diradd, diradd->da_state);
+ if ((jaddref = mkdir->md_jaddref) != NULL)
+ db_printf(" jaddref %p jaddref state 0x%X",
+ jaddref, jaddref->ja_state);
+ db_printf("\n");
+ }
+}
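
/*
 * [Editorial usage sketch, assuming a kernel built with options DDB]
 * The commands defined above hang off the debugger's "show" table and
 * are invoked from the debugger prompt, for example (the addresses
 * are hypothetical):
 *
 *	db> show inodedeps
 *	db> show inodedep 0xc42d1e00
 *	db> show workhead 0xc42d1e58
 *	db> show mkdirs
 *
 * "show inodedep" prints the verbose form of the line that
 * "show inodedeps" prints for every entry.
 */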
+
#endif /* DDB */
#endif /* SOFTUPDATES */
diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c
index e34bc1372a2e..e2460a36be2d 100644
--- a/sys/ufs/ffs/ffs_subr.c
+++ b/sys/ufs/ffs/ffs_subr.c
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
#ifndef _KERNEL
#include <ufs/ufs/dinode.h>
#include <ufs/ffs/fs.h>
-#include "fsck.h"
#else
#include <sys/systm.h>
#include <sys/lock.h>
@@ -223,7 +222,38 @@ ffs_isblock(fs, cp, h)
mask = 0x01 << (h & 0x7);
return ((cp[h >> 3] & mask) == mask);
default:
+#ifdef _KERNEL
panic("ffs_isblock");
+#endif
+ break;
+ }
+ return (0);
+}
+
+/*
+ * check if a block is free
+ */
+int
+ffs_isfreeblock(fs, cp, h)
+ struct fs *fs;
+ u_char *cp;
+ ufs1_daddr_t h;
+{
+
+ switch ((int)fs->fs_frag) {
+ case 8:
+ return (cp[h] == 0);
+ case 4:
+ return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+ case 2:
+ return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+ case 1:
+ return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+ default:
+#ifdef _KERNEL
+ panic("ffs_isfreeblock");
+#endif
+ break;
}
return (0);
}
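
/*
 * [Editorial worked example] With fs_frag == 4 each byte of the map
 * packs two 4-bit block maps, so testing block h == 5 reduces to the
 * high nibble of byte 2:
 *
 *	cp[5 >> 1] & (0x0f << ((5 & 0x1) << 2))  ==  cp[2] & 0xf0
 *
 * ffs_isfreeblock() is the complement of ffs_isblock() above: it is
 * true only when every map bit for the block is clear, whereas
 * ffs_isblock() requires them all to be set.
 */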
@@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h)
cp[h >> 3] &= ~(0x01 << (h & 0x7));
return;
default:
+#ifdef _KERNEL
panic("ffs_clrblock");
+#endif
+ break;
}
}
@@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h)
cp[h >> 3] |= (0x01 << (h & 0x7));
return;
default:
+#ifdef _KERNEL
panic("ffs_setblock");
+#endif
+ break;
+ }
+}
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ */
+void
+ffs_clusteracct(fs, cgp, blkno, cnt)
+ struct fs *fs;
+ struct cg *cgp;
+ ufs1_daddr_t blkno;
+ int cnt;
+{
+ int32_t *sump;
+ int32_t *lp;
+ u_char *freemapp, *mapp;
+ int i, start, end, forw, back, map, bit;
+
+ if (fs->fs_contigsumsize <= 0)
+ return;
+ freemapp = cg_clustersfree(cgp);
+ sump = cg_clustersum(cgp);
+ /*
+ * Allocate or clear the actual block.
+ */
+ if (cnt > 0)
+ setbit(freemapp, blkno);
+ else
+ clrbit(freemapp, blkno);
+ /*
+ * Find the size of the cluster going forward.
+ */
+ start = blkno + 1;
+ end = start + fs->fs_contigsumsize;
+ if (end >= cgp->cg_nclusterblks)
+ end = cgp->cg_nclusterblks;
+ mapp = &freemapp[start / NBBY];
+ map = *mapp++;
+ bit = 1 << (start % NBBY);
+ for (i = start; i < end; i++) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != (NBBY - 1)) {
+ bit <<= 1;
+ } else {
+ map = *mapp++;
+ bit = 1;
+ }
+ }
+ forw = i - start;
+ /*
+ * Find the size of the cluster going backward.
+ */
+ start = blkno - 1;
+ end = start - fs->fs_contigsumsize;
+ if (end < 0)
+ end = -1;
+ mapp = &freemapp[start / NBBY];
+ map = *mapp--;
+ bit = 1 << (start % NBBY);
+ for (i = start; i > end; i--) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != 0) {
+ bit >>= 1;
+ } else {
+ map = *mapp--;
+ bit = 1 << (NBBY - 1);
+ }
}
+ back = start - i;
+ /*
+ * Account for old cluster and the possibly new forward and
+ * back clusters.
+ */
+ i = back + forw + 1;
+ if (i > fs->fs_contigsumsize)
+ i = fs->fs_contigsumsize;
+ sump[i] += cnt;
+ if (back > 0)
+ sump[back] -= cnt;
+ if (forw > 0)
+ sump[forw] -= cnt;
+ /*
+ * Update cluster summary information.
+ */
+ lp = &sump[fs->fs_contigsumsize];
+ for (i = fs->fs_contigsumsize; i > 0; i--)
+ if (*lp-- > 0)
+ break;
+ fs->fs_maxcluster[cgp->cg_cgx] = i;
}
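
/*
 * [Editorial worked example] Freeing block b (cnt == 1) when b-1 is
 * already free (back == 1) and b+1..b+2 are free (forw == 2) merges
 * three runs into one cluster of length 4, so the accounting above
 * performs (assuming fs_contigsumsize >= 4):
 *
 *	sump[back + forw + 1] += cnt;		sump[4] += 1
 *	sump[back] -= cnt;			sump[1] -= 1
 *	sump[forw] -= cnt;			sump[2] -= 1
 *
 * Allocation (cnt == -1) applies the same arithmetic in reverse.
 */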
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 8aa9f9c53a4c..e40336863248 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -79,7 +79,6 @@ static int ffs_reload(struct mount *, struct thread *);
static int ffs_mountfs(struct vnode *, struct mount *, struct thread *);
static void ffs_oldfscompat_read(struct fs *, struct ufsmount *,
ufs2_daddr_t);
-static void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
static void ffs_ifree(struct ufsmount *ump, struct inode *ip);
static vfs_init_t ffs_init;
static vfs_uninit_t ffs_uninit;
@@ -299,7 +298,8 @@ ffs_mount(struct mount *mp)
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if ((mp->mnt_flag & MNT_FORCE) ||
- ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+ ((fs->fs_flags &
+ (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf("WARNING: %s was not %s\n",
fs->fs_fsmnt, "properly dismounted");
@@ -307,6 +307,9 @@ ffs_mount(struct mount *mp)
printf(
"WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n",
fs->fs_fsmnt);
+ if (fs->fs_flags & FS_SUJ)
+ printf(
+"WARNING: Forced mount will invalidated journal contents\n");
return (EPERM);
}
}
@@ -330,17 +333,18 @@ ffs_mount(struct mount *mp)
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
MNT_IUNLOCK(mp);
- fs->fs_clean = 0;
- if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
- vn_finished_write(mp);
- return (error);
- }
+ fs->fs_mtime = time_second;
/* check to see if we need to start softdep */
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, td->td_ucred))){
vn_finished_write(mp);
return (error);
}
+ fs->fs_clean = 0;
+ if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
+ vn_finished_write(mp);
+ return (error);
+ }
if (fs->fs_snapinum[0] != 0)
ffs_snapshot_mount(mp);
vn_finished_write(mp);
@@ -665,7 +669,6 @@ ffs_mountfs(devvp, mp, td)
if (mp->mnt_iosize_max > MAXPHYS)
mp->mnt_iosize_max = MAXPHYS;
- devvp->v_bufobj.bo_private = cp;
devvp->v_bufobj.bo_ops = &ffs_ops;
fs = NULL;
@@ -706,7 +709,7 @@ ffs_mountfs(devvp, mp, td)
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if (ronly || (mp->mnt_flag & MNT_FORCE) ||
- ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+ ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf(
"WARNING: %s was not properly dismounted\n",
@@ -715,6 +718,9 @@ ffs_mountfs(devvp, mp, td)
printf(
"WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n",
fs->fs_fsmnt);
+ if (fs->fs_flags & FS_SUJ)
+ printf(
+"WARNING: Forced mount will invalidated journal contents\n");
error = EPERM;
goto out;
}
@@ -897,6 +903,7 @@ ffs_mountfs(devvp, mp, td)
*/
bzero(fs->fs_fsmnt, MAXMNTLEN);
strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
+ mp->mnt_stat.f_iosize = fs->fs_bsize;
if( mp->mnt_flag & MNT_ROOTFS) {
/*
@@ -908,6 +915,7 @@ ffs_mountfs(devvp, mp, td)
}
if (ronly == 0) {
+ fs->fs_mtime = time_second;
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, cred)) != 0) {
free(fs->fs_csp, M_UFSMNT);
@@ -938,7 +946,6 @@ ffs_mountfs(devvp, mp, td)
* This would all happen while the filesystem was busy/not
* available, so would effectively be "atomic".
*/
- mp->mnt_stat.f_iosize = fs->fs_bsize;
(void) ufs_extattr_autostart(mp, td);
#endif /* !UFS_EXTATTR_AUTOSTART */
#endif /* !UFS_EXTATTR */
@@ -1038,7 +1045,7 @@ ffs_oldfscompat_read(fs, ump, sblockloc)
* XXX - Parts get retired eventually.
* Unfortunately new bits get added.
*/
-static void
+void
ffs_oldfscompat_write(fs, ump)
struct fs *fs;
struct ufsmount *ump;
@@ -1133,6 +1140,7 @@ ffs_unmount(mp, mntflags)
fs->fs_pendinginodes = 0;
}
UFS_UNLOCK(ump);
+ softdep_unmount(mp);
if (fs->fs_ronly == 0) {
fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
error = ffs_sbupdate(ump, MNT_WAIT, 0);
@@ -1574,16 +1582,6 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
DIP_SET(ip, i_gen, ip->i_gen);
}
}
- /*
- * Ensure that uid and gid are correct. This is a temporary
- * fix until fsck has been changed to do the update.
- */
- if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
- fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
- ip->i_uid = ip->i_din1->di_ouid; /* XXX */
- ip->i_gid = ip->i_din1->di_ogid; /* XXX */
- } /* XXX */
-
#ifdef MAC
if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
/*
@@ -1727,6 +1725,8 @@ ffs_sbupdate(mp, waitfor, suspended)
}
fs->fs_fmod = 0;
fs->fs_time = time_second;
+ if (fs->fs_flags & FS_DOSOFTDEP)
+ softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp);
bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
if (suspended)
@@ -1868,9 +1868,6 @@ ffs_bufwrite(struct buf *bp)
}
BO_UNLOCK(bp->b_bufobj);
- /* Mark the buffer clean */
- bundirty(bp);
-
/*
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
@@ -1911,9 +1908,16 @@ ffs_bufwrite(struct buf *bp)
newbp->b_flags &= ~B_INVAL;
#ifdef SOFTUPDATES
- /* move over the dependencies */
- if (!LIST_EMPTY(&bp->b_dep))
- softdep_move_dependencies(bp, newbp);
+ /*
+ * Move over the dependencies. If there are rollbacks,
+ * leave the parent buffer dirtied as it will need to
+ * be written again.
+ */
+ if (LIST_EMPTY(&bp->b_dep) ||
+ softdep_move_dependencies(bp, newbp) == 0)
+ bundirty(bp);
+#else
+ bundirty(bp);
#endif
/*
@@ -1926,7 +1930,10 @@ ffs_bufwrite(struct buf *bp)
*/
bqrelse(bp);
bp = newbp;
- }
+ } else
+ /* Mark the buffer clean */
+ bundirty(bp);
+
/* Let the normal bufwrite do the rest for us */
normal_write:
@@ -1940,6 +1947,7 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
struct vnode *vp;
int error;
struct buf *tbp;
+ int nocopy;
vp = bo->__bo_vnode;
if (bp->b_iocmd == BIO_WRITE) {
@@ -1947,8 +1955,9 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
(bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("ffs_geom_strategy: bad I/O");
- bp->b_flags &= ~B_VALIDSUSPWRT;
- if ((vp->v_vflag & VV_COPYONWRITE) &&
+ nocopy = bp->b_flags & B_NOCOPY;
+ bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
+ if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
vp->v_rdev->si_snapdata != NULL) {
if ((bp->b_flags & B_CLUSTER) != 0) {
runningbufwakeup(bp);
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 464a7613e162..e6617cbcdfa8 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -225,6 +225,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor)
wait = (waitfor == MNT_WAIT);
lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
bo = &vp->v_bufobj;
+ ip->i_flag &= ~IN_NEEDSYNC;
/*
* Flush all dirty buffers associated with a vnode.
diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h
index 5452e2be6de2..e863b961c620 100644
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@@ -340,7 +340,9 @@ struct fs {
u_int32_t fs_avgfilesize; /* expected average file size */
u_int32_t fs_avgfpdir; /* expected # of files per directory */
int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */
- int32_t fs_sparecon32[26]; /* reserved for future constants */
+ ufs_time_t fs_mtime; /* Last mount or fsck time. */
+ int32_t fs_sujfree; /* SUJ free list */
+ int32_t fs_sparecon32[23]; /* reserved for future constants */
int32_t fs_flags; /* see FS_ flags below */
int32_t fs_contigsumsize; /* size of cluster summary array */
int32_t fs_maxsymlinklen; /* max length of an internal symlink */
@@ -408,12 +410,13 @@ CTASSERT(sizeof(struct fs) == 1376);
#define FS_UNCLEAN 0x0001 /* filesystem not clean at mount */
#define FS_DOSOFTDEP 0x0002 /* filesystem using soft dependencies */
#define FS_NEEDSFSCK 0x0004 /* filesystem needs sync fsck before mount */
-#define FS_INDEXDIRS 0x0008 /* kernel supports indexed directories */
+#define FS_SUJ 0x0008 /* Filesystem using softupdate journal */
#define FS_ACLS 0x0010 /* file system has POSIX.1e ACLs enabled */
#define FS_MULTILABEL 0x0020 /* file system is MAC multi-label */
#define FS_GJOURNAL 0x0040 /* gjournaled file system */
#define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */
#define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */
+#define FS_INDEXDIRS 0x0200 /* kernel supports indexed directories */
/*
* Macros to access bits in the fs_active array.
@@ -603,7 +606,31 @@ struct cg {
? (fs)->fs_bsize \
: (fragroundup(fs, blkoff(fs, (size)))))
-
+/*
+ * Indirect lbns are aligned on NDADDR addresses where single indirects
+ * are the negated address of the lowest lbn reachable, double indirects
+ * are this lbn - 1 and triple indirects are this lbn - 2. This yields
+ * an unusual bit order to determine level.
+ */
+static inline int
+lbn_level(ufs_lbn_t lbn)
+{
+ if (lbn >= 0)
+		return (0);
+ switch (lbn & 0x3) {
+ case 0:
+ return (0);
+ case 1:
+ break;
+ case 2:
+ return (2);
+ case 3:
+ return (1);
+ default:
+ break;
+ }
+ return (-1);
+}
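
/*
 * [Editorial worked examples, assuming NDADDR == 12 and NINDIR a
 * multiple of four] The first single indirect has lbn -NDADDR, so
 * lbn_level(-12) == 0 (-12 & 0x3 == 0); the double indirect at -13
 * gives level 1 (-13 & 0x3 == 3) and the triple at -14 gives level 2
 * (-14 & 0x3 == 2).  Data lbns (>= 0) are level 0 as well.
 */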
/*
* Number of inodes in a secondary storage block/fragment.
*/
@@ -615,6 +642,108 @@ struct cg {
*/
#define NINDIR(fs) ((fs)->fs_nindir)
+/*
+ * Softdep journal record format.
+ */
+
+#define JOP_ADDREF 1 /* Add a reference to an inode. */
+#define JOP_REMREF 2 /* Remove a reference from an inode. */
+#define JOP_NEWBLK 3 /* Allocate a block. */
+#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */
+#define	JOP_MVREF	5	/* Move a reference from one offset to another. */
+#define JOP_TRUNC 6 /* Partial truncation record. */
+
+#define JREC_SIZE 32 /* Record and segment header size. */
+
+#define SUJ_MIN (4 * 1024 * 1024) /* Minimum journal size */
+#define SUJ_MAX (32 * 1024 * 1024) /* Maximum journal size */
+#define SUJ_FILE ".sujournal" /* Journal file name */
+
+/*
+ * Size of the segment record header. There is at most one for each disk
+ * block in the journal. The segment header is followed by an array of
+ * records. fsck depends on the first element in each record being 'op'
+ * and the second being 'ino'. Segments may span multiple disk blocks but
+ * the header is present on each.
+ */
+struct jsegrec {
+ uint64_t jsr_seq; /* Our sequence number */
+ uint64_t jsr_oldest; /* Oldest valid sequence number */
+ uint16_t jsr_cnt; /* Count of valid records */
+ uint16_t jsr_blocks; /* Count of DEV_BSIZE blocks. */
+	uint32_t jsr_crc;		/* 32-bit CRC of the valid space */
+ ufs_time_t jsr_time; /* timestamp for mount instance */
+};
+
+/*
+ * Reference record. Records a single link count modification.
+ */
+struct jrefrec {
+ uint32_t jr_op;
+ ino_t jr_ino;
+ ino_t jr_parent;
+ uint16_t jr_nlink;
+ uint16_t jr_mode;
+ off_t jr_diroff;
+ uint64_t jr_unused;
+};
+
+/*
+ * Move record. Records a reference moving within a directory block. The
+ * nlink is unchanged but we must search both locations.
+ */
+struct jmvrec {
+ uint32_t jm_op;
+ ino_t jm_ino;
+ ino_t jm_parent;
+ uint16_t jm_unused;
+ off_t jm_oldoff;
+ off_t jm_newoff;
+};
+
+/*
+ * Block record. A set of frags or a tree of blocks starting at an
+ * indirect is freed, or a set of frags is allocated.
+ */
+struct jblkrec {
+ uint32_t jb_op;
+ uint32_t jb_ino;
+ ufs2_daddr_t jb_blkno;
+ ufs_lbn_t jb_lbn;
+ uint16_t jb_frags;
+ uint16_t jb_oldfrags;
+ uint32_t jb_unused;
+};
+
+/*
+ * Truncation record. Records a partial truncation so that it may be
+ * completed later.
+ */
+struct jtrncrec {
+ uint32_t jt_op;
+ uint32_t jt_ino;
+ off_t jt_size;
+ uint32_t jt_extsize;
+ uint32_t jt_pad[3];
+};
+
+union jrec {
+ struct jsegrec rec_jsegrec;
+ struct jrefrec rec_jrefrec;
+ struct jmvrec rec_jmvrec;
+ struct jblkrec rec_jblkrec;
+ struct jtrncrec rec_jtrncrec;
+};
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct jsegrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jrefrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jmvrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jblkrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE);
+CTASSERT(sizeof(union jrec) == JREC_SIZE);
+#endif
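
/*
 * [Editorial note, derived from the sizes above] JREC_SIZE is 32
 * bytes, so a DEV_BSIZE (512-byte) journal block holds 16 records;
 * because every disk block begins with a jsegrec header, at most 15
 * of them carry payload records.
 */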
+
extern int inside[], around[];
extern u_char *fragtbl[];
diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h
index b00183bcfd2c..5d8a8691b170 100644
--- a/sys/ufs/ffs/softdep.h
+++ b/sys/ufs/ffs/softdep.h
@@ -94,22 +94,29 @@
* The ONWORKLIST flag shows whether the structure is currently linked
* onto a worklist.
*/
-#define ATTACHED 0x0001
-#define UNDONE 0x0002
-#define COMPLETE 0x0004
-#define DEPCOMPLETE 0x0008
-#define MKDIR_PARENT 0x0010 /* diradd & mkdir only */
-#define MKDIR_BODY 0x0020 /* diradd & mkdir only */
-#define RMDIR 0x0040 /* dirrem only */
-#define DIRCHG 0x0080 /* diradd & dirrem only */
-#define GOINGAWAY 0x0100 /* indirdep only */
-#define IOSTARTED 0x0200 /* inodedep & pagedep only */
-#define SPACECOUNTED 0x0400 /* inodedep only */
-#define NEWBLOCK 0x0800 /* pagedep only */
-#define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */
-#define UFS1FMT 0x2000 /* indirdep only */
-#define EXTDATA 0x4000 /* allocdirect only */
-#define ONWORKLIST 0x8000
+#define ATTACHED 0x000001
+#define UNDONE 0x000002
+#define COMPLETE 0x000004
+#define DEPCOMPLETE 0x000008
+#define MKDIR_PARENT 0x000010 /* diradd, mkdir, jaddref, jsegdep only */
+#define MKDIR_BODY 0x000020 /* diradd, mkdir, jaddref only */
+#define RMDIR 0x000040 /* dirrem only */
+#define DIRCHG 0x000080 /* diradd, dirrem only */
+#define GOINGAWAY 0x000100 /* indirdep, jremref only */
+#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */
+#define SPACECOUNTED 0x000400 /* inodedep only */
+#define NEWBLOCK 0x000800 /* pagedep, jaddref only */
+#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */
+#define UFS1FMT 0x002000 /* indirdep only */
+#define EXTDATA 0x004000 /* allocdirect only */
+#define ONWORKLIST 0x008000
+#define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */
+#define ONDEPLIST 0x020000 /* Structure is on a dependency list. */
+#define UNLINKED 0x040000 /* inodedep has been unlinked. */
+#define UNLINKNEXT 0x080000 /* inodedep has valid di_freelink */
+#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */
+#define UNLINKONLIST 0x200000 /* inodedep is in the unlinked list on disk */
+#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV)
#define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE)
@@ -135,25 +142,38 @@
* and the macros below changed to use it.
*/
struct worklist {
- struct mount *wk_mp; /* Mount we live in */
LIST_ENTRY(worklist) wk_list; /* list of work requests */
- unsigned short wk_type; /* type of request */
- unsigned short wk_state; /* state flags */
+ struct mount *wk_mp; /* Mount we live in */
+ unsigned int wk_type:8, /* type of request */
+ wk_state:24; /* state flags */
};
#define WK_DATA(wk) ((void *)(wk))
#define WK_PAGEDEP(wk) ((struct pagedep *)(wk))
#define WK_INODEDEP(wk) ((struct inodedep *)(wk))
#define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk))
+#define WK_NEWBLK(wk) ((struct newblk *)(wk))
#define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk))
#define WK_INDIRDEP(wk) ((struct indirdep *)(wk))
#define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk))
#define WK_FREEFRAG(wk) ((struct freefrag *)(wk))
#define WK_FREEBLKS(wk) ((struct freeblks *)(wk))
+#define WK_FREEWORK(wk) ((struct freework *)(wk))
#define WK_FREEFILE(wk) ((struct freefile *)(wk))
#define WK_DIRADD(wk) ((struct diradd *)(wk))
#define WK_MKDIR(wk) ((struct mkdir *)(wk))
#define WK_DIRREM(wk) ((struct dirrem *)(wk))
#define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk))
+#define WK_JADDREF(wk) ((struct jaddref *)(wk))
+#define WK_JREMREF(wk) ((struct jremref *)(wk))
+#define WK_JMVREF(wk) ((struct jmvref *)(wk))
+#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk))
+#define WK_JSEG(wk) ((struct jseg *)(wk))
+#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk))
+#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk))
+#define WK_FREEDEP(wk) ((struct freedep *)(wk))
+#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk))
+#define WK_SBDEP(wk) ((struct sbdep *)wk)
+#define WK_JTRUNC(wk) ((struct jtrunc *)(wk))
/*
* Various types of lists
@@ -165,6 +185,15 @@ LIST_HEAD(inodedephd, inodedep);
LIST_HEAD(allocindirhd, allocindir);
LIST_HEAD(allocdirecthd, allocdirect);
TAILQ_HEAD(allocdirectlst, allocdirect);
+LIST_HEAD(indirdephd, indirdep);
+LIST_HEAD(jaddrefhd, jaddref);
+LIST_HEAD(jremrefhd, jremref);
+LIST_HEAD(jmvrefhd, jmvref);
+LIST_HEAD(jnewblkhd, jnewblk);
+LIST_HEAD(jfreeblkhd, jfreeblk);
+LIST_HEAD(freeworkhd, freework);
+TAILQ_HEAD(jseglst, jseg);
+TAILQ_HEAD(inoreflst, inoref);
/*
* The "pagedep" structure tracks the various dependencies related to
@@ -192,9 +221,11 @@ struct pagedep {
LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */
ino_t pd_ino; /* associated file */
ufs_lbn_t pd_lbn; /* block within file */
+ struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */
struct dirremhd pd_dirremhd; /* dirrem's waiting for page */
struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */
struct diraddhd pd_pendinghd; /* directory entries awaiting write */
+ struct jmvrefhd pd_jmvrefhd; /* Dependent journal writes. */
};
/*
@@ -248,13 +279,18 @@ struct inodedep {
struct worklist id_list; /* buffer holding inode block */
# define id_state id_list.wk_state /* inode dependency state */
LIST_ENTRY(inodedep) id_hash; /* hashed lookup */
+ TAILQ_ENTRY(inodedep) id_unlinked; /* Unlinked but ref'd inodes */
struct fs *id_fs; /* associated filesystem */
ino_t id_ino; /* dependent inode */
nlink_t id_nlinkdelta; /* saved effective link count */
+ nlink_t id_savednlink; /* Link saved during rollback */
LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */
- struct buf *id_buf; /* related bmsafemap (if pending) */
+ struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */
+ struct diradd *id_mkdiradd; /* diradd for a mkdir. */
+ struct inoreflst id_inoreflst; /* Inode reference adjustments. */
long id_savedextsize; /* ext size saved during rollback */
off_t id_savedsize; /* file size saved during rollback */
+ struct dirremhd id_dirremhd; /* Removals pending. */
struct workhead id_pendinghd; /* entries awaiting directory write */
struct workhead id_bufwait; /* operations after inode written */
struct workhead id_inowait; /* operations waiting inode update */
@@ -271,23 +307,6 @@ struct inodedep {
#define id_savedino2 id_un.idu_savedino2
/*
- * A "newblk" structure is attached to a bmsafemap structure when a block
- * or fragment is allocated from a cylinder group. Its state is set to
- * DEPCOMPLETE when its cylinder group map is written. It is consumed by
- * an associated allocdirect or allocindir allocation which will attach
- * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag
- * is not set (i.e., its cylinder group map has not been written).
- */
-struct newblk {
- LIST_ENTRY(newblk) nb_hash; /* hashed lookup */
- struct fs *nb_fs; /* associated filesystem */
- int nb_state; /* state of bitmap dependency */
- ufs2_daddr_t nb_newblkno; /* allocated block number */
- LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */
- struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */
-};
-
-/*
* A "bmsafemap" structure maintains a list of dependency structures
* that depend on the update of a particular cylinder group map.
* It has lists for newblks, allocdirects, allocindirs, and inodedeps.
@@ -299,11 +318,41 @@ struct newblk {
*/
struct bmsafemap {
struct worklist sm_list; /* cylgrp buffer */
+# define sm_state sm_list.wk_state
+ int sm_cg;
+ LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */
struct buf *sm_buf; /* associated buffer */
struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */
+ struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */
struct allocindirhd sm_allocindirhd; /* allocindir deps */
+ struct allocindirhd sm_allocindirwr; /* writing allocindir deps */
struct inodedephd sm_inodedephd; /* inodedep deps */
+ struct inodedephd sm_inodedepwr; /* writing inodedep deps */
struct newblkhd sm_newblkhd; /* newblk deps */
+ struct newblkhd sm_newblkwr; /* writing newblk deps */
+ struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */
+ struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */
+};
+
+/*
+ * A "newblk" structure is attached to a bmsafemap structure when a block
+ * or fragment is allocated from a cylinder group. Its state is set to
+ * DEPCOMPLETE when its cylinder group map is written. It is converted to
+ * an allocdirect or allocindir allocation once the allocator calls the
+ * appropriate setup function.
+ */
+struct newblk {
+ struct worklist nb_list;
+# define nb_state nb_list.wk_state
+ LIST_ENTRY(newblk) nb_hash; /* hashed lookup */
+ LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblks */
+ struct jnewblk *nb_jnewblk; /* New block journal entry. */
+ struct bmsafemap *nb_bmsafemap;/* cylgrp dep (if pending) */
+ struct freefrag *nb_freefrag; /* fragment to be freed (if any) */
+ struct indirdephd nb_indirdeps; /* Children indirect blocks. */
+ struct workhead nb_newdirblk; /* dir block to notify when written */
+ struct workhead nb_jwork; /* Journal work pending. */
+ ufs2_daddr_t nb_newblkno; /* new value of block pointer */
};
/*
@@ -334,20 +383,18 @@ struct bmsafemap {
* and inodedep->id_pendinghd lists.
*/
struct allocdirect {
- struct worklist ad_list; /* buffer holding block */
-# define ad_state ad_list.wk_state /* block pointer state */
+ struct newblk ad_block; /* Common block logic */
+# define ad_state ad_block.nb_list.wk_state /* block pointer state */
TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */
- ufs_lbn_t ad_lbn; /* block within file */
- ufs2_daddr_t ad_newblkno; /* new value of block pointer */
- ufs2_daddr_t ad_oldblkno; /* old value of block pointer */
- long ad_newsize; /* size of new block */
- long ad_oldsize; /* size of old block */
- LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */
- struct buf *ad_buf; /* cylgrp buffer (if pending) */
struct inodedep *ad_inodedep; /* associated inodedep */
- struct freefrag *ad_freefrag; /* fragment to be freed (if any) */
- struct workhead ad_newdirblk; /* dir block to notify when written */
+ ufs2_daddr_t ad_oldblkno; /* old value of block pointer */
+ int ad_offset; /* Pointer offset in parent. */
+ long ad_newsize; /* size of new block */
+ long ad_oldsize; /* size of old block */
};
+#define ad_newblkno ad_block.nb_newblkno
+#define ad_freefrag ad_block.nb_freefrag
+#define ad_newdirblk ad_block.nb_newdirblk
/*
* A single "indirdep" structure manages all allocation dependencies for
@@ -369,10 +416,14 @@ struct allocdirect {
struct indirdep {
struct worklist ir_list; /* buffer holding indirect block */
# define ir_state ir_list.wk_state /* indirect block pointer state */
- caddr_t ir_saveddata; /* buffer cache contents */
+ LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */
+ caddr_t ir_saveddata; /* buffer cache contents */
struct buf *ir_savebp; /* buffer holding safe copy */
+ struct allocindirhd ir_completehd; /* waiting for indirdep complete */
+ struct allocindirhd ir_writehd; /* Waiting for the pointer write. */
struct allocindirhd ir_donehd; /* done waiting to update safecopy */
struct allocindirhd ir_deplisthd; /* allocindir deps for this block */
+ struct workhead ir_jwork; /* Journal work pending. */
};
/*
@@ -389,16 +440,25 @@ struct indirdep {
* can then be freed as it is no longer applicable.
*/
struct allocindir {
- struct worklist ai_list; /* buffer holding indirect block */
-# define ai_state ai_list.wk_state /* indirect block pointer state */
+ struct newblk ai_block; /* Common block area */
+# define ai_state ai_block.nb_list.wk_state /* indirect pointer state */
LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */
- int ai_offset; /* pointer offset in indirect block */
- ufs2_daddr_t ai_newblkno; /* new block pointer value */
- ufs2_daddr_t ai_oldblkno; /* old block pointer value */
- struct freefrag *ai_freefrag; /* block to be freed when complete */
struct indirdep *ai_indirdep; /* address of associated indirdep */
- LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */
- struct buf *ai_buf; /* cylgrp buffer (if pending) */
+ ufs2_daddr_t ai_oldblkno; /* old value of block pointer */
+ int ai_offset; /* Pointer offset in parent. */
+};
+#define ai_newblkno ai_block.nb_newblkno
+#define ai_freefrag ai_block.nb_freefrag
+#define ai_newdirblk ai_block.nb_newdirblk
+
+/*
+ * The allblk union is used to size the newblk structure on allocation so
+ * that it may be any one of three types.
+ */
+union allblk {
+ struct allocindir ab_allocindir;
+ struct allocdirect ab_allocdirect;
+ struct newblk ab_newblk;
};
/*
@@ -406,14 +466,13 @@ struct allocindir {
* allocated fragment is replaced with a larger fragment, rather than extended.
* The "freefrag" structure is constructed and attached when the replacement
* block is first allocated. It is processed after the inode claiming the
- * bigger block that replaces it has been written to disk. Note that the
- * ff_state field is is used to store the uid, so may lose data. However,
- * the uid is used only in printing an error message, so is not critical.
- * Keeping it in a short keeps the data structure down to 32 bytes.
+ * bigger block that replaces it has been written to disk.
*/
struct freefrag {
struct worklist ff_list; /* id_inowait or delayed worklist */
-# define ff_state ff_list.wk_state /* owning user; should be uid_t */
+# define ff_state ff_list.wk_state
+ struct jfreefrag *ff_jfreefrag; /* Associated journal entry. */
+ struct workhead ff_jwork; /* Journal work pending. */
ufs2_daddr_t ff_blkno; /* fragment physical block number */
long ff_fragsize; /* size of fragment being deleted */
ino_t ff_inum; /* owning inode number */
@@ -423,20 +482,57 @@ struct freefrag {
* A "freeblks" structure is attached to an "inodedep" when the
* corresponding file's length is reduced to zero. It records all
* the information needed to free the blocks of a file after its
- * zero'ed inode has been written to disk.
+ * zero'ed inode has been written to disk. The actual work is done
+ * by child freework structures which are responsible for individual
+ * inode pointers while freeblks is responsible for retiring the
+ * entire operation when it is complete and holding common members.
*/
struct freeblks {
struct worklist fb_list; /* id_inowait or delayed worklist */
# define fb_state fb_list.wk_state /* inode and dirty block state */
+ struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */
+ struct workhead fb_freeworkhd; /* Work items pending */
+ struct workhead fb_jwork; /* Journal work pending */
ino_t fb_previousinum; /* inode of previous owner of blocks */
uid_t fb_uid; /* uid of previous owner of blocks */
struct vnode *fb_devvp; /* filesystem device vnode */
- long fb_oldextsize; /* previous ext data size */
- off_t fb_oldsize; /* previous file size */
ufs2_daddr_t fb_chkcnt; /* used to check cnt of blks released */
- ufs2_daddr_t fb_dblks[NDADDR]; /* direct blk ptrs to deallocate */
- ufs2_daddr_t fb_iblks[NIADDR]; /* indirect blk ptrs to deallocate */
- ufs2_daddr_t fb_eblks[NXADDR]; /* indirect blk ptrs to deallocate */
+ int fb_ref; /* Children outstanding. */
+};
+
+/*
+ * A "freework" structure handles the release of a tree of blocks or a single
+ * block. Each indirect block in a tree is allocated its own freework
+ * structure so that the indrect block may be freed only when all of its
+ * children are freed. In this way we enforce the rule that an allocated
+ * block must have a valid path to a root that is journaled. Each child
+ * block acquires a reference and when the ref hits zero the parent ref
+ * is decremented. If there is no parent the freeblks ref is decremented.
+ */
+struct freework {
+ struct worklist fw_list;
+# define fw_state fw_list.wk_state
+ LIST_ENTRY(freework) fw_next; /* Queue for freeblks. */
+ struct freeblks *fw_freeblks; /* Root of operation. */
+ struct freework *fw_parent; /* Parent indirect. */
+ ufs2_daddr_t fw_blkno; /* Our block #. */
+ ufs_lbn_t fw_lbn; /* Original lbn before free. */
+ int fw_frags; /* Number of frags. */
+ int fw_ref; /* Number of children out. */
+ int fw_off; /* Current working position. */
+ struct workhead fw_jwork; /* Journal work pending. */
+};
+
+/*
+ * A "freedep" structure is allocated to track the completion of a bitmap
+ * write for a freework. One freedep may cover many freed blocks so long
+ * as they reside in the same cylinder group. When the cg is written
+ * the freedep decrements the ref on the freework which may permit it
+ * to be freed as well.
+ */
+struct freedep {
+ struct worklist fd_list;
+ struct freework *fd_freework; /* Parent freework. */
};
/*
@@ -450,6 +546,7 @@ struct freefile {
mode_t fx_mode; /* mode of inode */
ino_t fx_oldinum; /* inum of the unlinked file */
struct vnode *fx_devvp; /* filesystem device vnode */
+ struct workhead fx_jwork; /* journal work pending. */
};
/*
@@ -482,12 +579,11 @@ struct freefile {
* than zero.
*
* The overlaying of da_pagedep and da_previous is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. If a
- * da_previous entry is present, the pointer to its pagedep is available
- * in the associated dirrem entry. If the DIRCHG flag is set, the
- * da_previous entry is valid; if not set the da_pagedep entry is valid.
- * The DIRCHG flag never changes; it is set when the structure is created
- * if appropriate and is never cleared.
+ * structure down. If a da_previous entry is present, the pointer to its
+ * pagedep is available in the associated dirrem entry. If the DIRCHG flag
+ * is set, the da_previous entry is valid; if not set the da_pagedep entry
+ * is valid. The DIRCHG flag never changes; it is set when the structure
+ * is created if appropriate and is never cleared.
*/
struct diradd {
struct worklist da_list; /* id_inowait or id_pendinghd list */
@@ -499,6 +595,7 @@ struct diradd {
struct dirrem *dau_previous; /* entry being replaced in dir change */
struct pagedep *dau_pagedep; /* pagedep dependency for addition */
} da_un;
+ struct workhead da_jwork; /* Journal work awaiting completion. */
};
#define da_previous da_un.dau_previous
#define da_pagedep da_un.dau_pagedep
@@ -525,12 +622,13 @@ struct diradd {
* mkdir structures that reference it. The deletion would be faster if the
* diradd structure were simply augmented to have two pointers that referenced
* the associated mkdir's. However, this would increase the size of the diradd
- * structure from 32 to 64-bits to speed a very infrequent operation.
+ * structure to speed a very infrequent operation.
*/
struct mkdir {
struct worklist md_list; /* id_inowait or buffer holding dir */
# define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */
struct diradd *md_diradd; /* associated diradd */
+ struct jaddref *md_jaddref; /* dependent jaddref. */
struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */
LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */
};
@@ -542,20 +640,19 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
* list of the pagedep for the directory page that contains the entry.
* It is processed after the directory page with the deleted entry has
* been written to disk.
- *
- * The overlaying of dm_pagedep and dm_dirinum is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. It works
- * because they are never used concurrently.
*/
struct dirrem {
struct worklist dm_list; /* delayed worklist */
# define dm_state dm_list.wk_state /* state of the old directory entry */
LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */
+ LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */
+ struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */
ino_t dm_oldinum; /* inum of the removed dir entry */
union {
struct pagedep *dmu_pagedep; /* pagedep dependency for remove */
ino_t dmu_dirinum; /* parent inode number (for rmdir) */
} dm_un;
+ struct workhead dm_jwork; /* Journal work awaiting completion. */
};
#define dm_pagedep dm_un.dmu_pagedep
#define dm_dirinum dm_un.dmu_dirinum
@@ -577,9 +674,200 @@ struct dirrem {
* blocks using a similar scheme with the allocindir structures. Rather
* than adding this level of complexity, we simply write those newly
* allocated indirect blocks synchronously as such allocations are rare.
+ * For a new directory, the . and .. links are tracked with a mkdir
+ * rather than a pagedep; the newdirblk tracks that mkdir so it can be
+ * released once the directory block is written. A workhead is used
+ * to simplify canceling a mkdir that is removed by a subsequent dirrem.
*/
struct newdirblk {
struct worklist db_list; /* id_inowait or pg_newdirblk */
# define db_state db_list.wk_state /* unused */
struct pagedep *db_pagedep; /* associated pagedep */
+ struct workhead db_mkdir;
+};
+
+/*
+ * The inoref structure holds the elements common to jaddref and jremref
+ * so they may easily be queued in-order on the inodedep.
+ */
+struct inoref {
+ struct worklist if_list;
+# define if_state if_list.wk_state
+ TAILQ_ENTRY(inoref) if_deps; /* Links for inodedep. */
+ struct jsegdep *if_jsegdep;
+ off_t if_diroff; /* Directory offset. */
+ ino_t if_ino; /* Inode number. */
+ ino_t if_parent; /* Parent inode number. */
+ nlink_t if_nlink; /* nlink before addition. */
+ uint16_t if_mode; /* File mode, needed for IFMT. */
+};
+
+/*
+ * A "jaddref" structure tracks a new reference (link count) on an inode
+ * and prevents the link count increase and bitmap allocation until a
+ * journal entry can be written. Once the journal entry is written,
+ * the inode is put on the pendinghd of the bmsafemap and a diradd or
+ * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE
+ * flag is used to indicate that all of the required information for writing
+ * the journal entry is present. MKDIR_BODY and MKDIR_PARENT are used to
+ * differentiate . and .. links from regular file names. NEWBLOCK indicates
+ * a bitmap is still pending. If a new reference is canceled by a delete
+ * prior to writing the journal the jaddref write is canceled and the
+ * structure persists to prevent any disk-visible changes until it is
+ * ultimately released when the file is freed or the link is dropped again.
+ */
+struct jaddref {
+ struct inoref ja_ref;
+# define ja_list ja_ref.if_list /* Journal pending or jseg entries. */
+# define ja_state ja_ref.if_list.wk_state
+ LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */
+ union {
+ struct diradd *jau_diradd; /* Pending diradd. */
+ struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */
+ } ja_un;
+};
+#define ja_diradd ja_un.jau_diradd
+#define ja_mkdir ja_un.jau_mkdir
+#define ja_diroff ja_ref.if_diroff
+#define ja_ino ja_ref.if_ino
+#define ja_parent ja_ref.if_parent
+#define ja_mode ja_ref.if_mode
+
+/*
+ * A "jremref" structure tracks a removed reference (unlink) on an
+ * inode and prevents the directory remove from proceeding until the
+ * journal entry is written. Once the journal has been written the remove
+ * may proceed as normal.
+ */
+struct jremref {
+ struct inoref jr_ref;
+# define jr_list jr_ref.if_list /* Journal pending or jseg entries. */
+# define jr_state jr_ref.if_list.wk_state
+ LIST_ENTRY(jremref) jr_deps; /* Links for pagedep. */
+ struct dirrem *jr_dirrem; /* Back pointer to dirrem. */
+};
+
+struct jmvref {
+ struct worklist jm_list;
+ LIST_ENTRY(jmvref) jm_deps;
+ struct pagedep *jm_pagedep;
+ ino_t jm_parent;
+ ino_t jm_ino;
+ off_t jm_oldoff;
+ off_t jm_newoff;
+};
+
+/*
+ * A "jnewblk" structure tracks a newly allocated block or fragment and
+ * prevents the direct or indirect block pointer as well as the cg bitmap
+ * from being written until it is logged. After it is logged the jsegdep
+ * is attached to the allocdirect or allocindir until the operation is
+ * completed or reverted. If the operation is reverted prior to the journal
+ * write the jnewblk structure is maintained to prevent the bitmaps from
+ * reaching the disk. Ultimately the jnewblk structure will be passed
+ * to the free routine as the in memory cg is modified back to the free
+ * state at which time it can be released.
+ */
+struct jnewblk {
+ struct worklist jn_list;
+# define jn_state jn_list.wk_state
+ struct jsegdep *jn_jsegdep;
+ LIST_ENTRY(jnewblk) jn_deps; /* All jnewblks on bmsafemap */
+ struct newblk *jn_newblk;
+ ino_t jn_ino;
+ ufs_lbn_t jn_lbn;
+ ufs2_daddr_t jn_blkno;
+ int jn_oldfrags;
+ int jn_frags;
+};
+
+/*
+ * A "jfreeblk" structure tracks the journal write for freeing a block
+ * or tree of blocks. The block pointer must not be cleared in the inode
+ * or indirect prior to the jfreeblk being written.
+ */
+struct jfreeblk {
+ struct worklist jf_list;
+# define jf_state jf_list.wk_state
+ struct jsegdep *jf_jsegdep;
+ struct freeblks *jf_freeblks;
+ LIST_ENTRY(jfreeblk) jf_deps;
+ ino_t jf_ino;
+ ufs_lbn_t jf_lbn;
+ ufs2_daddr_t jf_blkno;
+ int jf_frags;
+};
+
+/*
+ * A "jfreefrag" tracks the freeing of a single block when a fragment is
+ * extended or an indirect page is replaced. It is not part of a larger
+ * freeblks operation.
+ */
+struct jfreefrag {
+ struct worklist fr_list;
+# define fr_state fr_list.wk_state
+ struct jsegdep *fr_jsegdep;
+ struct freefrag *fr_freefrag;
+ ino_t fr_ino;
+ ufs_lbn_t fr_lbn;
+ ufs2_daddr_t fr_blkno;
+ int fr_frags;
+};
+
+/*
+ * A "jtrunc" journals the intent to truncate an inode to a non-zero
+ * value. This is done synchronously prior to the synchronous partial
+ * truncation process. The jsegdep is not released until the truncation
+ * is complete and the truncated inode is fsync'd.
+ */
+struct jtrunc {
+ struct worklist jt_list;
+ struct jsegdep *jt_jsegdep;
+ ino_t jt_ino;
+ off_t jt_size;
+ int jt_extsize;
+};
+
+/*
+ * A "jsegdep" structure tracks a single reference to a written journal
+ * segment so the journal space can be reclaimed when all dependencies
+ * have been written.
+ */
+struct jsegdep {
+ struct worklist jd_list;
+# define jd_state jd_list.wk_state
+ struct jseg *jd_seg;
+};
+
+/*
+ * A "jseg" structure contains all of the journal records written in a
+ * single disk write. jaddref and jremref structures are linked into
+ * js_entries so they may be completed when the write completes. The
+ * js_deps array contains as many entries as there are ref counts to
+ * reduce the number of allocations required per journal write to one.
+ */
+struct jseg {
+ struct worklist js_list; /* b_deps link for journal */
+# define js_state js_list.wk_state
+ struct workhead js_entries; /* Entries awaiting write */
+ TAILQ_ENTRY(jseg) js_next;
+ struct jblocks *js_jblocks; /* Back pointer to block/seg list */
+ struct buf *js_buf; /* Buffer while unwritten */
+ uint64_t js_seq;
+ int js_size; /* Allocated size in bytes */
+ int js_cnt; /* Total items allocated */
+ int js_refs; /* Count of items pending completion */
+};
+
+/*
+ * A 'sbdep' structure tracks the head of the unlinked inode list and
+ * superblock writes. This makes sure the superblock is always pointing at
+ * the first possible unlinked inode for the suj recovery process. If a
+ * block write completes and we discover a new head is available the buf
+ * is dirtied and the dep is kept.
+ */
+struct sbdep {
+ struct worklist sb_list; /* b_dep linkage */
+ struct fs *sb_fs; /* Filesystem pointer within buf. */
+ struct ufsmount *sb_ump;
};
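
The freework comment above describes a reference-count cascade: each child
decrements its parent when it completes, and a root with no parent
decrements the freeblks. A minimal sketch of that rule, assuming a
hypothetical handle_complete_freeblks() helper (the real logic lives in
ffs_softdep.c):

	static void
	freework_release(struct freework *fw)
	{
		struct freeblks *fb = fw->fw_freeblks;

		if (--fw->fw_ref > 0)
			return;			/* children still outstanding */
		if (fw->fw_parent != NULL)
			freework_release(fw->fw_parent); /* cascade up the tree */
		else if (--fb->fb_ref == 0)
			handle_complete_freeblks(fb);	/* hypothetical: retire op */
	}
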
diff --git a/sys/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h
index 7f9e7c56496e..c75257c8e62d 100644
--- a/sys/ufs/ufs/dinode.h
+++ b/sys/ufs/ufs/dinode.h
@@ -146,7 +146,8 @@ struct ufs2_dinode {
ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */
ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */
u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */
- int64_t di_spare[2]; /* 240: Reserved; currently unused */
+ ino_t di_freelink; /* 240: SUJ: Next unlinked inode. */
+ uint32_t di_spare[3]; /* 244: Reserved; currently unused */
};
/*
@@ -167,9 +168,7 @@ struct ufs2_dinode {
struct ufs1_dinode {
u_int16_t di_mode; /* 0: IFMT, permissions; see below. */
int16_t di_nlink; /* 2: File link count. */
- union {
- u_int16_t oldids[2]; /* 4: Ffs: old user and group ids. */
- } di_u;
+ ino_t di_freelink; /* 4: SUJ: Next unlinked inode. */
u_int64_t di_size; /* 8: File byte count. */
int32_t di_atime; /* 16: Last access time. */
int32_t di_atimensec; /* 20: Last access time. */
@@ -186,7 +185,5 @@ struct ufs1_dinode {
u_int32_t di_gid; /* 116: File group. */
u_int64_t di_modrev; /* 120: i_modrev for NFSv4 */
};
-#define di_ogid di_u.oldids[1]
-#define di_ouid di_u.oldids[0]
#endif /* _UFS_UFS_DINODE_H_ */
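
The repurposed di_freelink field turns the on-disk inodes into a singly
linked list of unlinked-but-referenced files. A sketch of how a recovery
pass might walk that chain, assuming hypothetical read_dinode() and
clear_unlinked_inode() helpers (this is not fsck_ffs code):

	void
	walk_unlinked(ino_t head)
	{
		struct ufs2_dinode dip;
		ino_t ino, next;

		for (ino = head; ino != 0; ino = next) {
			read_dinode(ino, &dip);		/* hypothetical: load dinode */
			next = dip.di_freelink;
			clear_unlinked_inode(ino);	/* hypothetical: finish removal */
		}
	}

The list head is recorded in the superblock; the sbdep structure above
keeps that pointer valid across superblock writes.
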
diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h
index 565580e60460..295b12975e25 100644
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@@ -120,7 +120,7 @@ struct inode {
#define IN_CHANGE 0x0002 /* Inode change time update request. */
#define IN_UPDATE 0x0004 /* Modification time update request. */
#define IN_MODIFIED 0x0008 /* Inode has been modified. */
-#define IN_RENAME 0x0010 /* Inode is being renamed. */
+#define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */
#define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */
#define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */
#define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the
@@ -175,6 +175,7 @@ struct indir {
/* Determine if soft dependencies are being done */
#define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
#define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
+#define DOINGSUJ(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_SUJ)
/* This overlays the fid structure (see mount.h). */
struct ufid {
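
DOINGSUJ() layers on DOINGSOFTDEP(): a SUJ mount is a soft-updates mount
whose metadata changes are additionally journaled, so callers typically
test the flags in that order. An illustrative sketch only, assuming vp is
a locked UFS vnode:

	if (DOINGSUJ(vp)) {
		/* journaled soft updates: write the intent record first */
	} else if (DOINGSOFTDEP(vp)) {
		/* classic soft updates dependency ordering */
	} else {
		/* synchronous (or MNTK_ASYNC) metadata writes */
	}
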
diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c
index c85fdc8980f6..d7c1d0ddb821 100644
--- a/sys/ufs/ufs/ufs_dirhash.c
+++ b/sys/ufs/ufs/ufs_dirhash.c
@@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$");
static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
-static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
-
static int ufs_mindirhashsize = DIRBLKSIZ * 5;
SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
&ufs_mindirhashsize,
diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h
index b2e4a9757305..6658b663fb14 100644
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *);
int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *,
struct buf *, int *, int *);
int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **);
-int ufs_checkpath(ino_t, struct inode *, struct ucred *);
+int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *);
void ufs_dirbad(struct inode *, doff_t, char *);
int ufs_dirbadentry(struct vnode *, struct direct *, int);
int ufs_dirempty(struct inode *, ino_t, struct ucred *);
@@ -66,9 +66,11 @@ int ufs_extwrite(struct vop_write_args *);
void ufs_makedirentry(struct inode *, struct componentname *,
struct direct *);
int ufs_direnter(struct vnode *, struct vnode *, struct direct *,
- struct componentname *, struct buf *);
+ struct componentname *, struct buf *, int);
int ufs_dirremove(struct vnode *, struct inode *, int, int);
int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int);
+int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *,
+ ino_t *);
int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *);
int ufs_inactive(struct vop_inactive_args *);
int ufs_init(struct vfsconf *);
@@ -81,19 +83,33 @@ vfs_root_t ufs_root;
int ufs_uninit(struct vfsconf *);
int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **);
+#include <sys/sysctl.h>
+SYSCTL_DECL(_vfs_ufs);
+
/*
* Soft update function prototypes.
*/
int softdep_setup_directory_add(struct buf *, struct inode *, off_t,
ino_t, struct buf *, int);
-void softdep_change_directoryentry_offset(struct inode *, caddr_t,
- caddr_t, caddr_t, int);
+void softdep_change_directoryentry_offset(struct buf *, struct inode *,
+ caddr_t, caddr_t, caddr_t, int);
void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int);
void softdep_setup_directory_change(struct buf *, struct inode *,
struct inode *, ino_t, int);
void softdep_change_linkcnt(struct inode *);
void softdep_releasefile(struct inode *);
int softdep_slowdown(struct vnode *);
+void softdep_setup_create(struct inode *, struct inode *);
+void softdep_setup_dotdot_link(struct inode *, struct inode *);
+void softdep_setup_link(struct inode *, struct inode *);
+void softdep_setup_mkdir(struct inode *, struct inode *);
+void softdep_setup_rmdir(struct inode *, struct inode *);
+void softdep_setup_unlink(struct inode *, struct inode *);
+void softdep_revert_create(struct inode *, struct inode *);
+void softdep_revert_dotdot_link(struct inode *, struct inode *);
+void softdep_revert_link(struct inode *, struct inode *);
+void softdep_revert_mkdir(struct inode *, struct inode *);
+void softdep_revert_rmdir(struct inode *, struct inode *);
/*
* Flags to low-level allocation routines. The low 16-bits are reserved
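
The new softdep_setup_*()/softdep_revert_*() pairs replace the single
softdep_change_linkcnt() notification: the setup call declares the intent
before the directory write, and the revert call undoes it if the write
fails. A sketch of the caller-side bracket, mirroring ufs_link() further
below (error handling trimmed for illustration):

	ip->i_effnlink++;
	ip->i_nlink++;
	DIP_SET(ip, i_nlink, ip->i_nlink);
	ip->i_flag |= IN_CHANGE;
	if (DOINGSOFTDEP(vp))
		softdep_setup_link(dp, ip);		/* declare the intent */
	error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
	if (error) {
		ip->i_effnlink--;
		ip->i_nlink--;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_flag |= IN_CHANGE;
		if (DOINGSOFTDEP(vp))
			softdep_revert_link(dp, ip);	/* undo the intent */
	}
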
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
index b0247e77d0d8..0030c5264bd1 100644
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
/* true if old FS format...*/
#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0)
-static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *,
- ino_t *);
-
static int
ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred,
struct thread *td)
@@ -189,11 +186,11 @@ ufs_lookup(ap)
} */ *ap;
{
- return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
+ return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
}
-static int
-ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
+int
+ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
ino_t *dd_ino)
{
struct inode *dp; /* inode for directory being searched */
@@ -524,6 +521,8 @@ notfound:
return (ENOENT);
found:
+ if (dd_ino != NULL)
+ *dd_ino = ino;
if (numdirpasses == 2)
nchstats.ncs_pass2++;
/*
@@ -546,11 +545,6 @@ found:
if ((flags & ISLASTCN) && nameiop == LOOKUP)
dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1);
- if (dd_ino != NULL) {
- *dd_ino = ino;
- return (0);
- }
-
/*
* If deleting, and at end of pathname, return
* parameters which can be used to remove file.
@@ -558,17 +552,6 @@ found:
if (nameiop == DELETE && (flags & ISLASTCN)) {
if (flags & LOCKPARENT)
ASSERT_VOP_ELOCKED(vdp, __FUNCTION__);
- if ((error = VFS_VGET(vdp->v_mount, ino,
- LK_EXCLUSIVE, &tdp)) != 0)
- return (error);
-
- error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
- if (error) {
- vput(tdp);
- return (error);
- }
-
-
/*
* Return pointer to current entry in dp->i_offset,
* and distance past previous entry (if there
@@ -585,6 +568,16 @@ found:
dp->i_count = 0;
else
dp->i_count = dp->i_offset - prevoff;
+ if (dd_ino != NULL)
+ return (0);
+ if ((error = VFS_VGET(vdp->v_mount, ino,
+ LK_EXCLUSIVE, &tdp)) != 0)
+ return (error);
+ error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
+ if (error) {
+ vput(tdp);
+ return (error);
+ }
if (dp->i_number == ino) {
VREF(vdp);
*vpp = vdp;
@@ -616,6 +609,8 @@ found:
dp->i_offset = i_offset;
if (dp->i_number == ino)
return (EISDIR);
+ if (dd_ino != NULL)
+ return (0);
if ((error = VFS_VGET(vdp->v_mount, ino,
LK_EXCLUSIVE, &tdp)) != 0)
return (error);
@@ -650,6 +645,8 @@ found:
cnp->cn_flags |= SAVENAME;
return (0);
}
+ if (dd_ino != NULL)
+ return (0);
/*
* Step through the translation in the name. We do not `vput' the
@@ -681,7 +678,7 @@ found:
* to the inode we looked up before vdp lock was
* dropped.
*/
- error = ufs_lookup_(pdp, NULL, cnp, &ino1);
+ error = ufs_lookup_ino(pdp, NULL, cnp, &ino1);
if (error) {
vput(tdp);
return (error);
@@ -704,6 +701,14 @@ found:
vn_lock(vdp, LK_UPGRADE | LK_RETRY);
else /* if (ltype == LK_SHARED) */
vn_lock(vdp, LK_DOWNGRADE | LK_RETRY);
+ /*
+ * Relocking for the "." case may have left us with
+ * a reclaimed vnode.
+ */
+ if (vdp->v_iflag & VI_DOOMED) {
+ vrele(vdp);
+ return (ENOENT);
+ }
}
*vpp = vdp;
} else {
@@ -825,12 +830,13 @@ ufs_makedirentry(ip, cnp, newdirp)
* soft dependency code).
*/
int
-ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
+ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename)
struct vnode *dvp;
struct vnode *tvp;
struct direct *dirp;
struct componentname *cnp;
struct buf *newdirbp;
+ int isrename;
{
struct ucred *cr;
struct thread *td;
@@ -903,22 +909,28 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
blkoff += DIRBLKSIZ;
}
if (softdep_setup_directory_add(bp, dp, dp->i_offset,
- dirp->d_ino, newdirbp, 1) == 0) {
- bdwrite(bp);
+ dirp->d_ino, newdirbp, 1))
+ dp->i_flag |= IN_NEEDSYNC;
+ if (newdirbp)
+ bdwrite(newdirbp);
+ bdwrite(bp);
+ if ((dp->i_flag & IN_NEEDSYNC) == 0)
return (UFS_UPDATE(dvp, 0));
- }
- /* We have just allocated a directory block in an
- * indirect block. Rather than tracking when it gets
- * claimed by the inode, we simply do a VOP_FSYNC
- * now to ensure that it is there (in case the user
- * does a future fsync). Note that we have to unlock
- * the inode for the entry that we just entered, as
- * the VOP_FSYNC may need to lock other inodes which
- * can lead to deadlock if we also hold a lock on
- * the newly entered node.
+ /*
+ * We have just allocated a directory block in an
+ * indirect block. We must prevent holes from being
+ * created in the directory if directory entries are
+ * written out of order. To accomplish this we
+ * fsync when we extend a directory into indirects.
+ * During rename it's not safe to drop the tvp lock
+ * so sync must be delayed until it is.
+ *
+ * This synchronous step could be removed if fsck and
+ * the kernel were taught to fill in sparse
+ * directories rather than panic.
*/
- if ((error = bwrite(bp)))
- return (error);
+ if (isrename)
+ return (0);
if (tvp != NULL)
VOP_UNLOCK(tvp, 0);
error = VOP_FSYNC(dvp, MNT_WAIT, td);
@@ -1007,7 +1019,7 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
dp->i_offset + ((char *)ep - dirbuf));
#endif
if (DOINGSOFTDEP(dvp))
- softdep_change_directoryentry_offset(dp, dirbuf,
+ softdep_change_directoryentry_offset(bp, dp, dirbuf,
(caddr_t)nep, (caddr_t)ep, dsize);
else
bcopy((caddr_t)nep, (caddr_t)ep, dsize);
@@ -1059,6 +1071,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
(void) softdep_setup_directory_add(bp, dp,
dp->i_offset + (caddr_t)ep - dirbuf,
dirp->d_ino, newdirbp, 0);
+ if (newdirbp != NULL)
+ bdwrite(newdirbp);
bdwrite(bp);
} else {
if (DOINGASYNC(dvp)) {
@@ -1076,7 +1090,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
* lock other inodes which can lead to deadlock if we also hold a
* lock on the newly entered node.
*/
- if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
+ if (isrename == 0 && error == 0 &&
+ dp->i_endoff && dp->i_endoff < dp->i_size) {
if (tvp != NULL)
VOP_UNLOCK(tvp, 0);
#ifdef UFS_DIRHASH
@@ -1117,6 +1132,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
dp = VTOI(dvp);
+ /*
+ * Adjust the link count early so softdep can block if necessary.
+ */
+ if (ip) {
+ ip->i_effnlink--;
+ if (DOINGSOFTDEP(dvp)) {
+ softdep_setup_unlink(dp, ip);
+ } else {
+ ip->i_nlink--;
+ DIP_SET(ip, i_nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ }
+ }
if (flags & DOWHITEOUT) {
/*
* Whiteout entry: set d_ino to WINO.
@@ -1146,6 +1174,9 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, rep, dp->i_offset);
#endif
+ if (ip && rep->d_ino != ip->i_number)
+ panic("ufs_dirremove: ip %d does not match dirent ino %d\n",
+ ip->i_number, rep->d_ino);
if (dp->i_count == 0) {
/*
* First entry in block: set d_ino to zero.
@@ -1164,31 +1195,20 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
dp->i_offset & ~(DIRBLKSIZ - 1));
#endif
out:
+ error = 0;
if (DOINGSOFTDEP(dvp)) {
- if (ip) {
- ip->i_effnlink--;
- softdep_change_linkcnt(ip);
+ if (ip)
softdep_setup_remove(bp, dp, ip, isrmdir);
- }
- if (softdep_slowdown(dvp)) {
+ if (softdep_slowdown(dvp))
error = bwrite(bp);
- } else {
+ else
bdwrite(bp);
- error = 0;
- }
} else {
- if (ip) {
- ip->i_effnlink--;
- ip->i_nlink--;
- DIP_SET(ip, i_nlink, ip->i_nlink);
- ip->i_flag |= IN_CHANGE;
- }
if (flags & DOWHITEOUT)
error = bwrite(bp);
- else if (DOINGASYNC(dvp) && dp->i_count != 0) {
+ else if (DOINGASYNC(dvp) && dp->i_count != 0)
bdwrite(bp);
- error = 0;
- } else
+ else
error = bwrite(bp);
}
dp->i_flag |= IN_CHANGE | IN_UPDATE;
@@ -1221,6 +1241,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
struct vnode *vdp = ITOV(dp);
int error;
+ /*
+ * Drop the link before we lock the buf so softdep can block if
+ * necessary.
+ */
+ oip->i_effnlink--;
+ if (DOINGSOFTDEP(vdp)) {
+ softdep_setup_unlink(dp, oip);
+ } else {
+ oip->i_nlink--;
+ DIP_SET(oip, i_nlink, oip->i_nlink);
+ oip->i_flag |= IN_CHANGE;
+ }
+
error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp);
if (error)
return (error);
@@ -1232,15 +1265,10 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
ep->d_ino = newinum;
if (!OFSFMT(vdp))
ep->d_type = newtype;
- oip->i_effnlink--;
if (DOINGSOFTDEP(vdp)) {
- softdep_change_linkcnt(oip);
softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
bdwrite(bp);
} else {
- oip->i_nlink--;
- DIP_SET(oip, i_nlink, oip->i_nlink);
- oip->i_flag |= IN_CHANGE;
if (DOINGASYNC(vdp)) {
bdwrite(bp);
error = 0;
@@ -1355,25 +1383,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino)
/*
* Check if source directory is in the path of the target directory.
- * Target is supplied locked, source is unlocked.
- * The target is always vput before returning.
*/
int
-ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
+ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino)
{
- struct vnode *vp, *vp1;
+ struct mount *mp;
+ struct vnode *tvp, *vp, *vp1;
int error;
ino_t dd_ino;
- vp = ITOV(target);
- if (target->i_number == source_ino) {
- error = EEXIST;
- goto out;
- }
- error = 0;
+ vp = tvp = ITOV(target);
+ mp = vp->v_mount;
+ *wait_ino = 0;
+ if (target->i_number == source_ino)
+ return (EEXIST);
+ if (target->i_number == parent_ino)
+ return (0);
if (target->i_number == ROOTINO)
- goto out;
-
+ return (0);
+ error = 0;
for (;;) {
error = ufs_dir_dd_ino(vp, cred, &dd_ino);
if (error != 0)
@@ -1384,9 +1412,13 @@ ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
}
if (dd_ino == ROOTINO)
break;
- error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1);
- if (error != 0)
+ if (dd_ino == parent_ino)
break;
+ error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1);
+ if (error != 0) {
+ *wait_ino = dd_ino;
+ break;
+ }
/* Recheck that ".." still points to vp1 after relock of vp */
error = ufs_dir_dd_ino(vp, cred, &dd_ino);
if (error != 0) {
@@ -1398,14 +1430,14 @@ ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
vput(vp1);
continue;
}
- vput(vp);
+ if (vp != tvp)
+ vput(vp);
vp = vp1;
}
-out:
if (error == ENOTDIR)
- printf("checkpath: .. not a directory\n");
- if (vp != NULL)
+ panic("checkpath: .. not a directory\n");
+ if (vp != tvp)
vput(vp);
return (error);
}
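
The reworked ufs_checkpath() no longer sleeps for vnode locks while the
rename locks are held; on contention it returns with *wait_ino set and
lets the caller back off. A sketch of the resulting caller protocol,
mirroring ufs_rename() below (unlock_rename_locks() is a hypothetical
stand-in for the individual VOP_UNLOCK calls):

	error = ufs_checkpath(ino, fdp->i_number, tdp, cred, &wait_ino);
	if (wait_ino != 0) {
		unlock_rename_locks();		/* drop everything we hold */
		if (VFS_VGET(mp, wait_ino, LK_SHARED, &nvp) == 0)
			vput(nvp);		/* acquire/release: wait, don't spin */
		goto relock;			/* restart the whole lock dance */
	}
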
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 9d4d93dbc8fe..f8d45cfceb8e 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -114,6 +114,8 @@ static vop_close_t ufsfifo_close;
static vop_kqfilter_t ufsfifo_kqfilter;
static vop_pathconf_t ufsfifo_pathconf;
+SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
+
/*
* A virgin directory (no blushing please).
*/
@@ -974,6 +976,9 @@ ufs_link(ap)
error = EXDEV;
goto out;
}
+ if (VTOI(tdvp)->i_effnlink < 2)
+ panic("ufs_link: Bad link count %d on parent",
+ VTOI(tdvp)->i_effnlink);
ip = VTOI(vp);
if ((nlink_t)ip->i_nlink >= LINK_MAX) {
error = EMLINK;
@@ -988,11 +993,11 @@ ufs_link(ap)
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vp))
- softdep_change_linkcnt(ip);
+ softdep_setup_link(VTOI(tdvp), ip);
error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp)));
if (!error) {
ufs_makedirentry(ip, cnp, &newdir);
- error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL);
+ error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
}
if (error) {
@@ -1001,7 +1006,7 @@ ufs_link(ap)
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vp))
- softdep_change_linkcnt(ip);
+ softdep_revert_link(VTOI(tdvp), ip);
}
out:
return (error);
@@ -1043,7 +1048,7 @@ ufs_whiteout(ap)
newdir.d_namlen = cnp->cn_namelen;
bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
newdir.d_type = DT_WHT;
- error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL);
+ error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0);
break;
case DELETE:
@@ -1062,6 +1067,11 @@ ufs_whiteout(ap)
return (error);
}
+static volatile int rename_restarts;
+SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD,
+ __DEVOLATILE(int *, &rename_restarts), 0,
+ "Times rename had to restart due to lock contention");
+
/*
* Rename system call.
* rename("foo", "bar");
@@ -1101,111 +1111,183 @@ ufs_rename(ap)
struct vnode *tdvp = ap->a_tdvp;
struct vnode *fvp = ap->a_fvp;
struct vnode *fdvp = ap->a_fdvp;
+ struct vnode *nvp;
struct componentname *tcnp = ap->a_tcnp;
struct componentname *fcnp = ap->a_fcnp;
struct thread *td = fcnp->cn_thread;
- struct inode *ip, *xp, *dp;
+ struct inode *fip, *tip, *tdp, *fdp;
struct direct newdir;
- int doingdirectory = 0, oldparent = 0, newparent = 0;
+ off_t endoff;
+ int doingdirectory, newparent;
int error = 0, ioflag;
- ino_t fvp_ino;
+ struct mount *mp;
+ ino_t ino;
#ifdef INVARIANTS
if ((tcnp->cn_flags & HASBUF) == 0 ||
(fcnp->cn_flags & HASBUF) == 0)
panic("ufs_rename: no name");
#endif
+ endoff = 0;
+ mp = tdvp->v_mount;
+ VOP_UNLOCK(tdvp, 0);
+ if (tvp && tvp != tdvp)
+ VOP_UNLOCK(tvp, 0);
/*
* Check for cross-device rename.
*/
if ((fvp->v_mount != tdvp->v_mount) ||
(tvp && (fvp->v_mount != tvp->v_mount))) {
error = EXDEV;
-abortit:
- if (tdvp == tvp)
- vrele(tdvp);
- else
- vput(tdvp);
- if (tvp)
- vput(tvp);
- vrele(fdvp);
+ mp = NULL;
+ goto releout;
+ }
+ error = vfs_busy(mp, 0);
+ if (error) {
+ mp = NULL;
+ goto releout;
+ }
+relock:
+ /*
+ * We need to acquire 2 to 4 locks depending on whether tvp is NULL
+ * and fdvp and tdvp are the same directory. Subsequently we need
+ * to double-check all paths and in the directory rename case we
+ * need to verify that we are not creating a directory loop. To
+ * handle this we acquire all but fdvp using non-blocking
+ * acquisitions. If we fail to acquire any lock in the path we will
+ * drop all held locks, acquire the new lock in a blocking fashion,
+ * and then release it and restart the rename. This acquire/release
+ * step ensures that we do not spin on a lock waiting for release.
+ */
+ error = vn_lock(fdvp, LK_EXCLUSIVE);
+ if (error)
+ goto releout;
+ if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+ VOP_UNLOCK(fdvp, 0);
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto releout;
+ VOP_UNLOCK(tdvp, 0);
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
+ }
+ /*
+ * Re-resolve fvp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
+ if (error) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ goto releout;
+ }
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+ if (error) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if (error != EBUSY)
+ goto releout;
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+ if (error != 0)
+ goto releout;
+ VOP_UNLOCK(nvp, 0);
vrele(fvp);
- return (error);
+ fvp = nvp;
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
}
-
+ vrele(fvp);
+ fvp = nvp;
+ /*
+ * Re-resolve tvp and acquire the vnode lock if present.
+ */
+ error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino);
+ if (error != 0 && error != EJUSTRETURN) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ VOP_UNLOCK(fvp, 0);
+ goto releout;
+ }
+ /*
+ * If tvp disappeared we just carry on.
+ */
+ if (error == EJUSTRETURN && tvp != NULL) {
+ vrele(tvp);
+ tvp = NULL;
+ }
+ /*
+ * Get the tvp ino if the lookup succeeded. We may have to restart
+ * if the non-blocking acquire fails.
+ */
+ if (error == 0) {
+ nvp = NULL;
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+ if (tvp)
+ vrele(tvp);
+ tvp = nvp;
+ if (error) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ VOP_UNLOCK(fvp, 0);
+ if (error != EBUSY)
+ goto releout;
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+ if (error != 0)
+ goto releout;
+ VOP_UNLOCK(nvp, 0);
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
+ }
+ }
+ fdp = VTOI(fdvp);
+ fip = VTOI(fvp);
+ tdp = VTOI(tdvp);
+ tip = NULL;
+ if (tvp)
+ tip = VTOI(tvp);
if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
(VTOI(tdvp)->i_flags & APPEND))) {
error = EPERM;
- goto abortit;
+ goto unlockout;
}
-
/*
* Renaming a file to itself has no effect. The upper layers should
- * not call us in that case. Temporarily just warn if they do.
+ * not call us in that case. However, things could change after
+ * we drop the locks above.
*/
if (fvp == tvp) {
- printf("ufs_rename: fvp == tvp (can't happen)\n");
error = 0;
- goto abortit;
+ goto unlockout;
}
-
- if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
- goto abortit;
- dp = VTOI(fdvp);
- ip = VTOI(fvp);
- if (ip->i_nlink >= LINK_MAX) {
- VOP_UNLOCK(fvp, 0);
+ doingdirectory = 0;
+ newparent = 0;
+ ino = fip->i_number;
+ if (fip->i_nlink >= LINK_MAX) {
error = EMLINK;
- goto abortit;
+ goto unlockout;
}
- if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
- || (dp->i_flags & APPEND)) {
- VOP_UNLOCK(fvp, 0);
+ if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
+ || (fdp->i_flags & APPEND)) {
error = EPERM;
- goto abortit;
+ goto unlockout;
}
- if ((ip->i_mode & IFMT) == IFDIR) {
+ if ((fip->i_mode & IFMT) == IFDIR) {
/*
* Avoid ".", "..", and aliases of "." for obvious reasons.
*/
if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
- dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
- (ip->i_flag & IN_RENAME)) {
- VOP_UNLOCK(fvp, 0);
+ fdp == fip ||
+ (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
error = EINVAL;
- goto abortit;
+ goto unlockout;
}
- ip->i_flag |= IN_RENAME;
- oldparent = dp->i_number;
+ if (fdp->i_number != tdp->i_number)
+ newparent = tdp->i_number;
doingdirectory = 1;
}
- vrele(fdvp);
-
- /*
- * When the target exists, both the directory
- * and target vnodes are returned locked.
- */
- dp = VTOI(tdvp);
- xp = NULL;
- if (tvp)
- xp = VTOI(tvp);
-
- /*
- * 1) Bump link count while we're moving stuff
- * around. If we crash somewhere before
- * completing our work, the link count
- * may be wrong, but correctable.
- */
- ip->i_effnlink++;
- ip->i_nlink++;
- DIP_SET(ip, i_nlink, ip->i_nlink);
- ip->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(fvp))
- softdep_change_linkcnt(ip);
- if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) |
- DOINGASYNC(fvp)))) != 0) {
- VOP_UNLOCK(fvp, 0);
- goto bad;
+ if (fvp->v_mountedhere != NULL || (tvp && tvp->v_mountedhere != NULL)) {
+ error = EXDEV;
+ goto unlockout;
}
/*
@@ -1214,35 +1296,55 @@ abortit:
* directory hierarchy above the target, as this would
* orphan everything below the source directory. Also
* the user must have write permission in the source so
- * as to be able to change "..". We must repeat the call
- * to namei, as the parent directory is unlocked by the
- * call to checkpath().
+ * as to be able to change "..".
*/
- error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
- fvp_ino = ip->i_number;
- VOP_UNLOCK(fvp, 0);
- if (oldparent != dp->i_number)
- newparent = dp->i_number;
if (doingdirectory && newparent) {
- if (error) /* write access check above */
- goto bad;
- if (xp != NULL)
- vput(tvp);
- error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred);
+ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
if (error)
- goto out;
+ goto unlockout;
+ error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
+ &ino);
+ /*
+ * We encountered a lock that we have to wait for. Unlock
+ * everything else and VGET before restarting.
+ */
+ if (ino) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(fvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if (tvp)
+ VOP_UNLOCK(tvp, 0);
+ error = VFS_VGET(mp, ino, LK_SHARED, &nvp);
+ if (error == 0)
+ vput(nvp);
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
+ }
+ if (error)
+ goto unlockout;
if ((tcnp->cn_flags & SAVESTART) == 0)
panic("ufs_rename: lost to startdir");
- VREF(tdvp);
- error = relookup(tdvp, &tvp, tcnp);
- if (error)
- goto out;
- vrele(tdvp);
- dp = VTOI(tdvp);
- xp = NULL;
- if (tvp)
- xp = VTOI(tvp);
}
+ if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 ||
+ tdp->i_effnlink == 0)
+ panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp);
+
+ /*
+ * 1) Bump link count while we're moving stuff
+ * around. If we crash somewhere before
+ * completing our work, the link count
+ * may be wrong, but correctable.
+ */
+ fip->i_effnlink++;
+ fip->i_nlink++;
+ DIP_SET(fip, i_nlink, fip->i_nlink);
+ fip->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(fvp))
+ softdep_setup_link(tdp, fip);
+ error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)));
+ if (error)
+ goto bad;
+
/*
* 2) If target doesn't exist, link the target
* to the source and unlink the source.
@@ -1250,52 +1352,37 @@ abortit:
* entry to reference the source inode and
* expunge the original entry's existence.
*/
- if (xp == NULL) {
- if (dp->i_dev != ip->i_dev)
+ if (tip == NULL) {
+ if (tdp->i_dev != fip->i_dev)
panic("ufs_rename: EXDEV");
- /*
- * Account for ".." in new directory.
- * When source and destination have the same
- * parent we don't fool with the link count.
- */
if (doingdirectory && newparent) {
- if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+ /*
+ * Account for ".." in new directory.
+ * When source and destination have the same
+ * parent we don't adjust the link count. The
+ * actual link modification is completed when
+ * .. is rewritten below.
+ */
+ if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
error = EMLINK;
goto bad;
}
- dp->i_effnlink++;
- dp->i_nlink++;
- DIP_SET(dp, i_nlink, dp->i_nlink);
- dp->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(tdvp))
- softdep_change_linkcnt(dp);
- error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
- DOINGASYNC(tdvp)));
- if (error)
- goto bad;
}
- ufs_makedirentry(ip, tcnp, &newdir);
- error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL);
- if (error) {
- if (doingdirectory && newparent) {
- dp->i_effnlink--;
- dp->i_nlink--;
- DIP_SET(dp, i_nlink, dp->i_nlink);
- dp->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(tdvp))
- softdep_change_linkcnt(dp);
- (void)UFS_UPDATE(tdvp, 1);
- }
+ ufs_makedirentry(fip, tcnp, &newdir);
+ error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1);
+ if (error)
goto bad;
- }
- vput(tdvp);
+ /* Setup tdvp for directory compaction if needed. */
+ if (tdp->i_count && tdp->i_endoff &&
+ tdp->i_endoff < tdp->i_size)
+ endoff = tdp->i_endoff;
} else {
- if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+ if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev)
panic("ufs_rename: EXDEV");
/*
* Short circuit rename(foo, foo).
*/
- if (xp->i_number == ip->i_number)
+ if (tip->i_number == fip->i_number)
panic("ufs_rename: same file");
/*
* If the parent directory is "sticky", then the caller
@@ -1303,7 +1390,7 @@ abortit:
* destination of the rename. This implements append-only
* directories.
*/
- if ((dp->i_mode & S_ISTXT) &&
+ if ((tdp->i_mode & S_ISTXT) &&
VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
error = EPERM;
@@ -1314,9 +1401,9 @@ abortit:
* to it. Also, ensure source and target are compatible
* (both directories, or both not directories).
*/
- if ((xp->i_mode&IFMT) == IFDIR) {
- if ((xp->i_effnlink > 2) ||
- !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
+ if ((tip->i_mode & IFMT) == IFDIR) {
+ if ((tip->i_effnlink > 2) ||
+ !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) {
error = ENOTEMPTY;
goto bad;
}
@@ -1329,20 +1416,30 @@ abortit:
error = EISDIR;
goto bad;
}
- error = ufs_dirrewrite(dp, xp, ip->i_number,
- IFTODT(ip->i_mode),
- (doingdirectory && newparent) ? newparent : doingdirectory);
- if (error)
- goto bad;
if (doingdirectory) {
if (!newparent) {
- dp->i_effnlink--;
+ tdp->i_effnlink--;
if (DOINGSOFTDEP(tdvp))
- softdep_change_linkcnt(dp);
+ softdep_change_linkcnt(tdp);
}
- xp->i_effnlink--;
+ tip->i_effnlink--;
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(xp);
+ softdep_change_linkcnt(tip);
+ }
+ error = ufs_dirrewrite(tdp, tip, fip->i_number,
+ IFTODT(fip->i_mode),
+ (doingdirectory && newparent) ? newparent : doingdirectory);
+ if (error) {
+ if (doingdirectory) {
+ if (!newparent) {
+ tdp->i_effnlink++;
+ if (DOINGSOFTDEP(tdvp))
+ softdep_change_linkcnt(tdp);
+ }
+ tip->i_effnlink++;
+ if (DOINGSOFTDEP(tvp))
+ softdep_change_linkcnt(tip);
+ }
}
if (doingdirectory && !DOINGSOFTDEP(tvp)) {
/*
@@ -1357,115 +1454,107 @@ abortit:
* them now.
*/
if (!newparent) {
- dp->i_nlink--;
- DIP_SET(dp, i_nlink, dp->i_nlink);
- dp->i_flag |= IN_CHANGE;
+ tdp->i_nlink--;
+ DIP_SET(tdp, i_nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
}
- xp->i_nlink--;
- DIP_SET(xp, i_nlink, xp->i_nlink);
- xp->i_flag |= IN_CHANGE;
+ tip->i_nlink--;
+ DIP_SET(tip, i_nlink, tip->i_nlink);
+ tip->i_flag |= IN_CHANGE;
ioflag = IO_NORMAL;
if (!DOINGASYNC(tvp))
ioflag |= IO_SYNC;
+ /* Don't go to bad here as the new link exists. */
if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag,
tcnp->cn_cred, tcnp->cn_thread)) != 0)
- goto bad;
+ goto unlockout;
}
- vput(tdvp);
- vput(tvp);
- xp = NULL;
}
/*
- * 3) Unlink the source.
+ * 3) Unlink the source. We have to resolve the path again to
+ * fixup the directory offset and count for ufs_dirremove.
*/
- fcnp->cn_flags &= ~MODMASK;
- fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
- if ((fcnp->cn_flags & SAVESTART) == 0)
- panic("ufs_rename: lost from startdir");
- VREF(fdvp);
- error = relookup(fdvp, &fvp, fcnp);
- if (error == 0)
- vrele(fdvp);
- if (fvp != NULL) {
- xp = VTOI(fvp);
- dp = VTOI(fdvp);
- } else {
- /*
- * From name has disappeared. IN_RENAME is not sufficient
- * to protect against directory races due to timing windows,
- * so we have to remove the panic. XXX the only real way
- * to solve this issue is at a much higher level. By the
- * time we hit ufs_rename() it's too late.
- */
-#if 0
- if (doingdirectory)
- panic("ufs_rename: lost dir entry");
-#endif
- vrele(ap->a_fvp);
- return (0);
+ if (fdvp == tdvp) {
+ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
+ if (error)
+ panic("ufs_rename: from entry went away!");
+ if (ino != fip->i_number)
+ panic("ufs_rename: ino mismatch %d != %d\n", ino,
+ fip->i_number);
}
/*
- * Ensure that the directory entry still exists and has not
- * changed while the new name has been entered. If the source is
- * a file then the entry may have been unlinked or renamed. In
- * either case there is no further work to be done. If the source
- * is a directory then it cannot have been rmdir'ed; the IN_RENAME
- * flag ensures that it cannot be moved by another rename or removed
- * by a rmdir.
+ * If the source is a directory with a
+ * new parent, the link count of the old
+ * parent directory must be decremented
+ * and ".." set to point to the new parent.
*/
- if (xp != ip) {
- /*
- * From name resolves to a different inode. IN_RENAME is
- * not sufficient protection against timing window races
- * so we can't panic here. XXX the only real way
- * to solve this issue is at a much higher level. By the
- * time we hit ufs_rename() it's too late.
- */
-#if 0
- if (doingdirectory)
- panic("ufs_rename: lost dir entry");
-#endif
- } else {
+ if (doingdirectory && newparent) {
/*
- * If the source is a directory with a
- * new parent, the link count of the old
- * parent directory must be decremented
- * and ".." set to point to the new parent.
+ * If tip exists we simply use its link, otherwise we must
+ * add a new one.
*/
- if (doingdirectory && newparent) {
- xp->i_offset = mastertemplate.dot_reclen;
- ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0);
- cache_purge(fdvp);
+ if (tip == NULL) {
+ tdp->i_effnlink++;
+ tdp->i_nlink++;
+ DIP_SET(tdp, i_nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(tdvp))
+ softdep_setup_dotdot_link(tdp, fip);
+ error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
+ DOINGASYNC(tdvp)));
+ /* Don't go to bad here as the new link exists. */
+ if (error)
+ goto unlockout;
}
- error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
- xp->i_flag &= ~IN_RENAME;
- }
- if (dp)
- vput(fdvp);
- if (xp)
- vput(fvp);
- vrele(ap->a_fvp);
+ fip->i_offset = mastertemplate.dot_reclen;
+ ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0);
+ cache_purge(fdvp);
+ }
+ error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0);
+
+unlockout:
+ vput(fdvp);
+ vput(fvp);
+ if (tvp)
+ vput(tvp);
+ /*
+ * If compaction or fsync was requested do it now that other locks
+ * are no longer needed.
+ */
+ if (error == 0 && endoff != 0) {
+#ifdef UFS_DIRHASH
+ if (tdp->i_dirhash != NULL)
+ ufsdirhash_dirtrunc(tdp, endoff);
+#endif
+ UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred,
+ td);
+ }
+ if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
+ error = VOP_FSYNC(tdvp, MNT_WAIT, td);
+ vput(tdvp);
+ if (mp)
+ vfs_unbusy(mp);
return (error);
bad:
- if (xp)
- vput(ITOV(xp));
- vput(ITOV(dp));
-out:
- if (doingdirectory)
- ip->i_flag &= ~IN_RENAME;
- if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
- ip->i_effnlink--;
- ip->i_nlink--;
- DIP_SET(ip, i_nlink, ip->i_nlink);
- ip->i_flag |= IN_CHANGE;
- ip->i_flag &= ~IN_RENAME;
- if (DOINGSOFTDEP(fvp))
- softdep_change_linkcnt(ip);
- vput(fvp);
- } else
- vrele(fvp);
+ fip->i_effnlink--;
+ fip->i_nlink--;
+ DIP_SET(fip, i_nlink, fip->i_nlink);
+ fip->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(fvp))
+ softdep_revert_link(tdp, fip);
+ goto unlockout;
+
+releout:
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp)
+ vrele(tvp);
+ if (mp)
+ vfs_unbusy(mp);
+
return (error);
}
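
For readers tracking the new control flow: ufs_rename() now unwinds through three labels. unlockout is reached with the vnodes still locked, so vput() drops both the lock and the reference; releout is reached holding only references, so vrele() is used; and bad rolls back the optimistic link-count bump on the from-inode before falling into unlockout. Below is a minimal user-space sketch of the same goto-unwind idiom; struct res and the acquire()/lock_res() helpers are hypothetical stand-ins, not kernel interfaces.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical resource standing in for a referenced/locked vnode. */
struct res {
	int locked;
};

static struct res *acquire(void) { return (calloc(1, sizeof(struct res))); }
static int lock_res(struct res *r) { r->locked = 1; return (0); }
static void unlock_release(struct res *r) { r->locked = 0; free(r); } /* vput */
static void release(struct res *r) { free(r); }                      /* vrele */

static int
do_work(struct res *a, struct res *b)
{
	int error = 0;

	/* Failure before locking: only references held, so "releout". */
	if (lock_res(a) != 0 || lock_res(b) != 0) {
		error = -1;
		goto releout;
	}
	/* Fallible work; on failure undo side effects, then unlock. */
	if (0 /* some failure */) {
		error = -1;
		goto bad;
	}

unlockout:
	unlock_release(a);
	unlock_release(b);
	return (error);

bad:
	/* Roll back any optimistic state changes here, as above. */
	goto unlockout;

releout:
	release(a);
	release(b);
	return (error);
}

int
main(void)
{
	return (do_work(acquire(), acquire()));
}
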
@@ -1767,8 +1856,7 @@ ufs_mkdir(ap)
ip->i_effnlink = 2;
ip->i_nlink = 2;
DIP_SET(ip, i_nlink, 2);
- if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+
if (cnp->cn_flags & ISWHITEOUT) {
ip->i_flags |= UF_OPAQUE;
DIP_SET(ip, i_flags, ip->i_flags);
@@ -1784,8 +1872,8 @@ ufs_mkdir(ap)
DIP_SET(dp, i_nlink, dp->i_nlink);
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(dvp))
- softdep_change_linkcnt(dp);
- error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
+ softdep_setup_mkdir(dp, ip);
+ error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
if (error)
goto bad;
#ifdef MAC
@@ -1863,7 +1951,7 @@ ufs_mkdir(ap)
else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp))))
goto bad;
ufs_makedirentry(ip, cnp, &newdir);
- error = ufs_direnter(dvp, tvp, &newdir, cnp, bp);
+ error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0);
bad:
if (error == 0) {
@@ -1873,8 +1961,6 @@ bad:
dp->i_nlink--;
DIP_SET(dp, i_nlink, dp->i_nlink);
dp->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(dvp))
- softdep_change_linkcnt(dp);
/*
* No need to do an explicit VOP_TRUNCATE here, vrele will
* do this for us because we set the link count to 0.
@@ -1884,7 +1970,8 @@ bad:
DIP_SET(ip, i_nlink, 0);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+ softdep_revert_mkdir(dp, ip);
+
vput(tvp);
}
out:
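
The ufs_mkdir() hunks above rely on the classic UFS directory link-count rule: a fresh directory starts at nlink 2 (its own "." plus the name in the parent), and the parent gains one link because the child's ".." points back at it. A toy user-space model of that accounting, assuming nothing beyond the rule itself:

#include <assert.h>
#include <stdio.h>

/* Toy model, not kernel code: nlink bookkeeping for directories. */
struct toy_dir {
	int nlink;
};

static void
toy_mkdir(struct toy_dir *parent, struct toy_dir *child)
{
	child->nlink = 2;	/* "." plus the parent's entry for it */
	parent->nlink++;	/* the child's ".." points at the parent */
}

int
main(void)
{
	struct toy_dir root = { .nlink = 2 };	/* empty directory */
	struct toy_dir a, b;

	toy_mkdir(&root, &a);
	toy_mkdir(&root, &b);
	assert(root.nlink == 4);		/* 2 + two subdirectories */
	assert(a.nlink == 2 && b.nlink == 2);
	printf("root nlink = %d\n", root.nlink);
	return (0);
}

This is also why the error path above decrements the parent only once and zeroes the child's count before vput() reclaims it.
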
@@ -1920,10 +2007,13 @@ ufs_rmdir(ap)
* tries to remove a locally mounted on directory).
*/
error = 0;
- if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) {
+ if (ip->i_effnlink < 2) {
error = EINVAL;
goto out;
}
+ if (dp->i_effnlink < 3)
+ panic("ufs_dirrem: Bad link count %d on parent",
+ dp->i_effnlink);
if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
error = ENOTEMPTY;
goto out;
@@ -1947,18 +2037,14 @@ ufs_rmdir(ap)
*/
dp->i_effnlink--;
ip->i_effnlink--;
- if (DOINGSOFTDEP(vp)) {
- softdep_change_linkcnt(dp);
- softdep_change_linkcnt(ip);
- }
+ if (DOINGSOFTDEP(vp))
+ softdep_setup_rmdir(dp, ip);
error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
if (error) {
dp->i_effnlink++;
ip->i_effnlink++;
- if (DOINGSOFTDEP(vp)) {
- softdep_change_linkcnt(dp);
- softdep_change_linkcnt(ip);
- }
+ if (DOINGSOFTDEP(vp))
+ softdep_revert_rmdir(dp, ip);
goto out;
}
cache_purge(dvp);
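
ufs_rmdir() now decrements both effective link counts before attempting the removal and restores them, alongside a softdep_revert_rmdir() notification, when ufs_dirremove() fails. Below is a hedged user-space sketch of that optimistic-update-with-rollback pattern; setup() and revert() are placeholders for the softdep_setup_rmdir()/softdep_revert_rmdir() hooks:

#include <stdio.h>

struct node {
	int effnlink;	/* effective link count, as in struct inode */
};

/* Placeholder notification hooks. */
static void setup(struct node *dp, struct node *ip) { (void)dp; (void)ip; }
static void revert(struct node *dp, struct node *ip) { (void)dp; (void)ip; }

/* Stand-in for ufs_dirremove(); fails when asked to. */
static int
remove_entry(int fail)
{
	return (fail ? -1 : 0);
}

static int
toy_rmdir(struct node *dp, struct node *ip, int fail)
{
	int error;

	dp->effnlink--;		/* parent loses the child's ".." link */
	ip->effnlink--;		/* child loses its name in the parent */
	setup(dp, ip);
	error = remove_entry(fail);
	if (error) {
		dp->effnlink++;	/* undo both optimistic decrements */
		ip->effnlink++;
		revert(dp, ip);
	}
	return (error);
}

int
main(void)
{
	struct node dp = { 3 }, ip = { 2 };

	toy_rmdir(&dp, &ip, 1);	/* a failing removal leaves counts intact */
	printf("dp %d ip %d\n", dp.effnlink, ip.effnlink);
	return (0);
}
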
@@ -2464,6 +2550,9 @@ ufs_makeinode(mode, dvp, vpp, cnp)
if ((mode & IFMT) == 0)
mode |= IFREG;
+ if (VTOI(dvp)->i_effnlink < 2)
+ panic("ufs_makeinode: Bad link count %d on parent",
+ VTOI(dvp)->i_effnlink);
error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
if (error)
return (error);
@@ -2539,7 +2628,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
ip->i_nlink = 1;
DIP_SET(ip, i_nlink, 1);
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+ softdep_setup_create(VTOI(dvp), ip);
if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) {
ip->i_mode &= ~ISGID;
@@ -2579,7 +2668,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
}
#endif /* !UFS_ACL */
ufs_makedirentry(ip, cnp, &newdir);
- error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL);
+ error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0);
if (error)
goto bad;
*vpp = tvp;
@@ -2595,7 +2684,7 @@ bad:
DIP_SET(ip, i_nlink, 0);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+ softdep_revert_create(VTOI(dvp), ip);
vput(tvp);
return (error);
}
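
Several hunks above add sanity panics on the parent's effective link count: any live directory must have effnlink >= 2 ("." plus its own name), and a parent about to lose a subdirectory must still have effnlink >= 3, the extra link being the child's "..". The same invariants expressed as plain assertions in a standalone sketch:

#include <assert.h>

struct toy_dir {
	int effnlink;
};

/* A parent receiving a new entry must itself still be linked. */
static void
check_parent_for_create(const struct toy_dir *dp)
{
	assert(dp->effnlink >= 2);
}

/* A parent losing a subdirectory still holds that child's ".." link. */
static void
check_parent_for_rmdir(const struct toy_dir *dp)
{
	assert(dp->effnlink >= 3);
}

int
main(void)
{
	struct toy_dir dp = { .effnlink = 3 };

	check_parent_for_create(&dp);
	check_parent_for_rmdir(&dp);
	return (0);
}
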
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
index 83f9af06b59d..d5669179dac2 100644
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@@ -57,6 +57,10 @@ struct ucred;
struct uio;
struct vnode;
struct ufs_extattr_per_mount;
+struct jblocks;
+struct inodedep;
+
+TAILQ_HEAD(inodedeplst, inodedep);
/* This structure describes the UFS specific mount structure data. */
struct ufsmount {
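
For readers unfamiliar with <sys/queue.h>: TAILQ_HEAD(inodedeplst, inodedep) above declares the list-head type used by the new softdep_unlinked field, while struct inodedep itself is only forward-declared here and defined in the softdep code. A self-contained user-space example of the same macro pairing, with struct item as a stand-in element type:

#include <sys/queue.h>
#include <stdio.h>

struct item {
	int ino;
	TAILQ_ENTRY(item) link;		/* embedded linkage fields */
};
TAILQ_HEAD(itemlst, item);		/* declares "struct itemlst" */

int
main(void)
{
	struct itemlst head = TAILQ_HEAD_INITIALIZER(head);
	struct item a = { .ino = 3 }, b = { .ino = 7 };
	struct item *ip;

	TAILQ_INSERT_TAIL(&head, &a, link);
	TAILQ_INSERT_TAIL(&head, &b, link);
	TAILQ_FOREACH(ip, &head, link)
		printf("inode %d\n", ip->ino);
	return (0);
}
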
@@ -75,6 +79,11 @@ struct ufsmount {
long um_numindirdeps; /* outstanding indirdeps */
struct workhead softdep_workitem_pending; /* softdep work queue */
struct worklist *softdep_worklist_tail; /* Tail pointer for above */
+ struct workhead softdep_journal_pending; /* journal work queue */
+ struct worklist *softdep_journal_tail; /* Tail pointer for above */
+ struct jblocks *softdep_jblocks; /* Journal block information */
+ struct inodedeplst softdep_unlinked; /* Unlinked inodes */
+ int softdep_on_journal; /* Items on the journal list */
int softdep_on_worklist; /* Items on the worklist */
int softdep_on_worklist_inprogress; /* Busy items on worklist */
int softdep_deps; /* Total dependency count */
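
The new journal fields mirror the existing worklist trio: a workhead queue, an explicit tail pointer so new items can be appended in constant time, and a count of queued items. Below is a sketch of that shape, assuming (as with the existing pair of fields) a LIST-style head that only supports O(1) insertion at the front; struct work and struct toy_queue are illustrative types, not the kernel's definitions:

#include <sys/queue.h>
#include <stdio.h>

struct work {
	int id;
	LIST_ENTRY(work) list;
};
LIST_HEAD(toy_workhead, work);

struct toy_queue {
	struct toy_workhead head;	/* cf. softdep_journal_pending */
	struct work *tail;		/* cf. softdep_journal_tail */
	int count;			/* cf. softdep_on_journal */
};

/* Append in FIFO order despite LIST only inserting at the head. */
static void
append(struct toy_queue *q, struct work *w)
{
	if (q->tail == NULL)
		LIST_INSERT_HEAD(&q->head, w, list);
	else
		LIST_INSERT_AFTER(q->tail, w, list);
	q->tail = w;
	q->count++;
}

int
main(void)
{
	struct toy_queue q = { LIST_HEAD_INITIALIZER(q.head), NULL, 0 };
	struct work a = { .id = 1 }, b = { .id = 2 }, *w;

	append(&q, &a);
	append(&q, &b);
	LIST_FOREACH(w, &q.head, list)
		printf("work %d\n", w->id);
	printf("queued: %d\n", q.count);
	return (0);
}
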