author     Kirk McKusick <mckusick@FreeBSD.org>    2010-04-28 05:33:59 +0000
committer  Kirk McKusick <mckusick@FreeBSD.org>    2010-04-28 05:33:59 +0000
commit     a4bf5fb987611aeb78c422312b63b185e39982d7 (patch)
tree       a65d36ab57a1e076de7e7a1d78add642fbd7062e /sys/ufs
parent     509210970393a1a8cd8a65d5340dc4bed069fa68 (diff)
parent     b641222476732f1f99d2362f093b79bbe088d764 (diff)
Update to current version of head.

Notes:
    svn path=/projects/quota64/; revision=207307
Diffstat (limited to 'sys/ufs')
-rw-r--r--  sys/ufs/ffs/ffs_alloc.c      252
-rw-r--r--  sys/ufs/ffs/ffs_balloc.c      13
-rw-r--r--  sys/ufs/ffs/ffs_extern.h      24
-rw-r--r--  sys/ufs/ffs/ffs_inode.c      132
-rw-r--r--  sys/ufs/ffs/ffs_snapshot.c    66
-rw-r--r--  sys/ufs/ffs/ffs_softdep.c   7289
-rw-r--r--  sys/ufs/ffs/ffs_subr.c       130
-rw-r--r--  sys/ufs/ffs/ffs_vfsops.c      69
-rw-r--r--  sys/ufs/ffs/ffs_vnops.c        1
-rw-r--r--  sys/ufs/ffs/fs.h             135
-rw-r--r--  sys/ufs/ffs/softdep.h        446
-rw-r--r--  sys/ufs/ufs/dinode.h           9
-rw-r--r--  sys/ufs/ufs/inode.h            3
-rw-r--r--  sys/ufs/ufs/ufs_dirhash.c      2
-rw-r--r--  sys/ufs/ufs/ufs_extern.h      24
-rw-r--r--  sys/ufs/ufs/ufs_lookup.c     192
-rw-r--r--  sys/ufs/ufs/ufs_vnops.c      583
-rw-r--r--  sys/ufs/ufs/ufsmount.h         9
18 files changed, 7576 insertions, 1803 deletions
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index 7bf117719726..b1f7ba0127f7 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -94,24 +94,24 @@ __FBSDID("$FreeBSD$");
#include <ufs/ffs/ffs_extern.h>
typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
- int size);
+ int size, int rsize);
-static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int);
+static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
static ufs2_daddr_t
- ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t);
+ ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
#ifdef INVARIANTS
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
-static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
-static void ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *,
- ufs1_daddr_t, int);
+static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int,
+ int);
static ino_t ffs_dirpref(struct inode *);
static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
int, int);
static void ffs_fserr(struct fs *, ino_t, char *);
static ufs2_daddr_t ffs_hashalloc
- (struct inode *, u_int, ufs2_daddr_t, int, allocfcn_t *);
-static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int);
+ (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
+static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
+ int);
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
@@ -188,7 +188,7 @@ retry:
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
- bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
+ bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
if (bno > 0) {
delta = btodb(size);
if (ip->i_flag & IN_SPACECOUNTED) {
@@ -387,16 +387,12 @@ retry:
panic("ffs_realloccg: bad optim");
/* NOTREACHED */
}
- bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
+ bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize,
- ip->i_number);
- if (nsize < request)
- ffs_blkfree(ump, fs, ip->i_devvp,
- bno + numfrags(fs, nsize),
- (long)(request - nsize), ip->i_number);
+ ip->i_number, NULL);
delta = btodb(nsize - osize);
if (ip->i_flag & IN_SPACECOUNTED) {
UFS_LOCK(ump);
@@ -487,6 +483,14 @@ ffs_reallocblks(ap)
if (doreallocblks == 0)
return (ENOSPC);
+ /*
+ * We can't wait in softdep prealloc as it may fsync and recurse
+ * here. Instead we simply fail to reallocate blocks if this
+ * rare condition arises.
+ */
+ if (DOINGSOFTDEP(ap->a_vp))
+ if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
+ return (ENOSPC);
if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1)
return (ffs_reallocblks_ufs1(ap));
return (ffs_reallocblks_ufs2(ap));
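
The softdep_prealloc() hook used above is new in this patch (its prototype is added to ffs_extern.h further down). Reading the call sites, it gives soft updates a chance to make journal resources available before an operation begins: the ordinary allocation paths may sleep for it, while the reallocation path must not. Condensed from ffs_balloc.c and ffs_alloc.c in this same diff:

    /* Ordinary allocation (ffs_balloc_ufs1/ufs2): safe to wait. */
    if (DOINGSOFTDEP(vp))
        softdep_prealloc(vp, MNT_WAIT);

    /*
     * Reallocation (ffs_reallocblks): waiting could fsync and recurse
     * back into this routine, so the optimization is abandoned instead.
     */
    if (DOINGSOFTDEP(ap->a_vp))
        if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
            return (ENOSPC);
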
@@ -587,7 +591,7 @@ ffs_reallocblks_ufs1(ap)
* Search the block map looking for an allocation of the desired size.
*/
if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
- len, ffs_clusteralloc)) == 0) {
+ len, len, ffs_clusteralloc)) == 0) {
UFS_UNLOCK(ump);
goto fail;
}
@@ -673,7 +677,7 @@ ffs_reallocblks_ufs1(ap)
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
@@ -795,7 +799,7 @@ ffs_reallocblks_ufs2(ap)
* Search the block map looking for an allocation of the desired size.
*/
if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
- len, ffs_clusteralloc)) == 0) {
+ len, len, ffs_clusteralloc)) == 0) {
UFS_UNLOCK(ump);
goto fail;
}
@@ -881,7 +885,7 @@ ffs_reallocblks_ufs2(ap)
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ip->i_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
@@ -969,7 +973,7 @@ ffs_valloc(pvp, mode, cred, vpp)
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
- ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode,
+ ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
(allocfcn_t *)ffs_nodealloccg);
if (ino == 0)
goto noinodes;
@@ -1278,11 +1282,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap)
*/
/*VARARGS5*/
static ufs2_daddr_t
-ffs_hashalloc(ip, cg, pref, size, allocator)
+ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
struct inode *ip;
u_int cg;
ufs2_daddr_t pref;
- int size; /* size for data blocks, mode for inodes */
+ int size; /* Search size for data blocks, mode for inodes */
+ int rsize; /* Real allocated size. */
allocfcn_t *allocator;
{
struct fs *fs;
@@ -1298,7 +1303,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
/*
* 1: preferred cylinder group
*/
- result = (*allocator)(ip, cg, pref, size);
+ result = (*allocator)(ip, cg, pref, size, rsize);
if (result)
return (result);
/*
@@ -1308,7 +1313,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
- result = (*allocator)(ip, cg, 0, size);
+ result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
}
@@ -1319,7 +1324,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
- result = (*allocator)(ip, cg, 0, size);
+ result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
cg++;
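
For context, ffs_hashalloc() probes cylinder groups in three phases. The loop headers are elided from the hunks above; the sketch below assumes they follow the long-standing FFS pattern, with only the rsize plumbing being new in this patch:

    u_int icg = cg;        /* remember the starting group */

    /* 1: the preferred cylinder group. */
    result = (*allocator)(ip, cg, pref, size, rsize);
    if (result)
        return (result);
    /* 2: quadratic rehash -- the probe offset doubles each step. */
    for (i = 1; i < fs->fs_ncg; i *= 2) {
        cg += i;
        if (cg >= fs->fs_ncg)
            cg -= fs->fs_ncg;
        result = (*allocator)(ip, cg, 0, size, rsize);
        if (result)
            return (result);
    }
    /* 3: brute force, starting two groups past the starting group. */
    cg = (icg + 2) % fs->fs_ncg;
    for (i = 2; i < fs->fs_ncg; i++) {
        result = (*allocator)(ip, cg, 0, size, rsize);
        if (result)
            return (result);
        cg++;
        if (cg == fs->fs_ncg)
            cg = 0;
    }
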
@@ -1401,7 +1406,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize)
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
+ frags, numfrags(fs, osize));
bdwrite(bp);
return (bprev);
@@ -1419,11 +1425,12 @@ fail:
* and if it is, allocate it.
*/
static ufs2_daddr_t
-ffs_alloccg(ip, cg, bpref, size)
+ffs_alloccg(ip, cg, bpref, size, rsize)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int size;
+ int rsize;
{
struct fs *fs;
struct cg *cgp;
@@ -1451,7 +1458,7 @@ ffs_alloccg(ip, cg, bpref, size)
cgp->cg_old_time = cgp->cg_time = time_second;
if (size == fs->fs_bsize) {
UFS_LOCK(ump);
- blkno = ffs_alloccgblk(ip, bp, bpref);
+ blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
@@ -1475,21 +1482,14 @@ ffs_alloccg(ip, cg, bpref, size)
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
UFS_LOCK(ump);
- blkno = ffs_alloccgblk(ip, bp, bpref);
- bno = dtogd(fs, blkno);
- for (i = frags; i < fs->fs_frag; i++)
- setbit(blksfree, bno + i);
- i = fs->fs_frag - frags;
- cgp->cg_cs.cs_nffree += i;
- fs->fs_cstotal.cs_nffree += i;
- fs->fs_cs(fs, cg).cs_nffree += i;
- fs->fs_fmod = 1;
- cgp->cg_frsum[i]++;
+ blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (blkno);
}
+ KASSERT(size == rsize,
+ ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
if (bno < 0)
goto fail;
@@ -1507,7 +1507,7 @@ ffs_alloccg(ip, cg, bpref, size)
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
bdwrite(bp);
return (blkno);
@@ -1529,10 +1529,11 @@ fail:
* blocks may be fragmented by the routine that allocates them.
*/
static ufs2_daddr_t
-ffs_alloccgblk(ip, bp, bpref)
+ffs_alloccgblk(ip, bp, bpref, size)
struct inode *ip;
struct buf *bp;
ufs2_daddr_t bpref;
+ int size;
{
struct fs *fs;
struct cg *cgp;
@@ -1540,6 +1541,7 @@ ffs_alloccgblk(ip, bp, bpref)
ufs1_daddr_t bno;
ufs2_daddr_t blkno;
u_int8_t *blksfree;
+ int i;
fs = ip->i_fs;
ump = ip->i_ump;
@@ -1567,16 +1569,32 @@ ffs_alloccgblk(ip, bp, bpref)
gotit:
blkno = fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, (long)blkno);
- ffs_clusteracct(ump, fs, cgp, blkno, -1);
+ ffs_clusteracct(fs, cgp, blkno, -1);
cgp->cg_cs.cs_nbfree--;
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
fs->fs_fmod = 1;
blkno = cgbase(fs, cgp->cg_cgx) + bno;
+ /*
+ * If the caller didn't want the whole block, free the frags here.
+ */
+ size = numfrags(fs, size);
+ if (size != fs->fs_frag) {
+ bno = dtogd(fs, blkno);
+ for (i = size; i < fs->fs_frag; i++)
+ setbit(blksfree, bno + i);
+ i = fs->fs_frag - size;
+ cgp->cg_cs.cs_nffree += i;
+ fs->fs_cstotal.cs_nffree += i;
+ fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
+ fs->fs_fmod = 1;
+ cgp->cg_frsum[i]++;
+ }
/* XXX Fixme. */
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
+ size, 0);
UFS_LOCK(ump);
return (blkno);
}
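
With the new size argument, ffs_alloccgblk() itself hands back the unwanted tail of a full block as free fragments, replacing the identical bookkeeping deleted from ffs_alloccg() above. A worked example, assuming a hypothetical geometry:

    /*
     * fs_bsize = 16384, fs_fsize = 2048, so fs_frag = 8.  A caller
     * requesting rsize = 6144 bytes still takes a whole block, but
     * size = numfrags(fs, 6144) = 3, so frags 3..7 are set free:
     *
     *     cs_nffree += 8 - 3 = 5;    five loose frags returned
     *     cg_frsum[5]++;             one run of exactly 5 free frags
     */
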
@@ -1589,11 +1607,12 @@ gotit:
* take the first one that we find following bpref.
*/
static ufs2_daddr_t
-ffs_clusteralloc(ip, cg, bpref, len)
+ffs_clusteralloc(ip, cg, bpref, len, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int len;
+ int unused;
{
struct fs *fs;
struct cg *cgp;
@@ -1689,7 +1708,7 @@ ffs_clusteralloc(ip, cg, bpref, len)
len = blkstofrags(fs, len);
UFS_LOCK(ump);
for (i = 0; i < len; i += fs->fs_frag)
- if (ffs_alloccgblk(ip, bp, bno + i) != bno + i)
+ if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
panic("ffs_clusteralloc: lost block");
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
@@ -1713,11 +1732,12 @@ fail:
* inode in the specified cylinder group.
*/
static ufs2_daddr_t
-ffs_nodealloccg(ip, cg, ipref, mode)
+ffs_nodealloccg(ip, cg, ipref, mode, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t ipref;
int mode;
+ int unused;
{
struct fs *fs;
struct cg *cgp;
@@ -1820,28 +1840,6 @@ gotit:
}
/*
- * check if a block is free
- */
-static int
-ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
-{
-
- switch ((int)fs->fs_frag) {
- case 8:
- return (cp[h] == 0);
- case 4:
- return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
- case 2:
- return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
- case 1:
- return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
- default:
- panic("ffs_isfreeblock");
- }
- return (0);
-}
-
-/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
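
ffs_isfreeblock() is removed from this file but not from the tree: a prototype for it is added to ffs_extern.h later in this diff, and the diffstat shows ffs_subr.c growing by 130 lines, so the body presumably moves there. For reference, the deleted switch decodes the per-fragment free bitmap, in which one block occupies fs_frag consecutive bits:

    /*
     * Example for fs_frag = 4: block h occupies one nibble of byte
     * h >> 1.  For h = 5 that is byte 2, shift (5 & 0x1) << 2 = 4,
     * mask 0x0f << 4 = 0xf0 -- the block is free only when all four
     * fragment bits under that mask are zero:
     *
     *     (cp[2] & 0xf0) == 0
     */
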
@@ -1849,14 +1847,16 @@ ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
* block reassembly is checked.
*/
void
-ffs_blkfree(ump, fs, devvp, bno, size, inum)
+ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
+ struct workhead *dephd;
{
+ struct mount *mp;
struct cg *cgp;
struct buf *bp;
ufs1_daddr_t fragno, cgbno;
@@ -1923,7 +1923,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
panic("ffs_blkfree: freeing free block");
}
ffs_setblock(fs, blksfree, fragno);
- ffs_clusteracct(ump, fs, cgp, fragno, 1);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -1963,7 +1963,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
cgp->cg_cs.cs_nffree -= fs->fs_frag;
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
- ffs_clusteracct(ump, fs, cgp, fragno, 1);
+ ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
@@ -1972,6 +1972,10 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
+ mp = UFSTOVFS(ump);
+ if (mp->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+ softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
+ numfrags(fs, size), dephd);
bdwrite(bp);
}
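
ffs_blkfree() now takes a struct workhead pointer so SU+J callers can hand over journal work items that softdep_setup_blkfree() ties to the cylinder-group buffer write; the hook fires only on soft-updates mounts and only when freeing through the device vnode (snapshot vnodes are VREG and bypass it). Nearly every call site in this patch passes NULL; a journaling caller would presumably look like this hypothetical sketch:

    struct workhead wkhd;    /* hypothetical journaling caller */

    LIST_INIT(&wkhd);
    /* ... journal dependencies appended to wkhd elsewhere ... */
    ffs_blkfree(ump, fs, devvp, bno, size, inum, &wkhd);
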
@@ -2042,7 +2046,8 @@ ffs_vfree(pvp, ino, mode)
return (0);
}
ip = VTOI(pvp);
- return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode));
+ return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode,
+ NULL));
}
/*
@@ -2050,12 +2055,13 @@ ffs_vfree(pvp, ino, mode)
* The specified inode is placed back in the free map.
*/
int
-ffs_freefile(ump, fs, devvp, ino, mode)
+ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ino_t ino;
int mode;
+ struct workhead *wkhd;
{
struct cg *cgp;
struct buf *bp;
@@ -2112,6 +2118,9 @@ ffs_freefile(ump, fs, devvp, ino, mode)
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
+ if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+ softdep_setup_inofree(UFSTOVFS(ump), bp,
+ ino + cg * fs->fs_ipg, wkhd);
bdwrite(bp);
return (0);
}
@@ -2226,101 +2235,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz)
}
/*
- * Update the cluster map because of an allocation or free.
- *
- * Cnt == 1 means free; cnt == -1 means allocating.
- */
-void
-ffs_clusteracct(ump, fs, cgp, blkno, cnt)
- struct ufsmount *ump;
- struct fs *fs;
- struct cg *cgp;
- ufs1_daddr_t blkno;
- int cnt;
-{
- int32_t *sump;
- int32_t *lp;
- u_char *freemapp, *mapp;
- int i, start, end, forw, back, map, bit;
-
- mtx_assert(UFS_MTX(ump), MA_OWNED);
-
- if (fs->fs_contigsumsize <= 0)
- return;
- freemapp = cg_clustersfree(cgp);
- sump = cg_clustersum(cgp);
- /*
- * Allocate or clear the actual block.
- */
- if (cnt > 0)
- setbit(freemapp, blkno);
- else
- clrbit(freemapp, blkno);
- /*
- * Find the size of the cluster going forward.
- */
- start = blkno + 1;
- end = start + fs->fs_contigsumsize;
- if (end >= cgp->cg_nclusterblks)
- end = cgp->cg_nclusterblks;
- mapp = &freemapp[start / NBBY];
- map = *mapp++;
- bit = 1 << (start % NBBY);
- for (i = start; i < end; i++) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != (NBBY - 1)) {
- bit <<= 1;
- } else {
- map = *mapp++;
- bit = 1;
- }
- }
- forw = i - start;
- /*
- * Find the size of the cluster going backward.
- */
- start = blkno - 1;
- end = start - fs->fs_contigsumsize;
- if (end < 0)
- end = -1;
- mapp = &freemapp[start / NBBY];
- map = *mapp--;
- bit = 1 << (start % NBBY);
- for (i = start; i > end; i--) {
- if ((map & bit) == 0)
- break;
- if ((i & (NBBY - 1)) != 0) {
- bit >>= 1;
- } else {
- map = *mapp--;
- bit = 1 << (NBBY - 1);
- }
- }
- back = start - i;
- /*
- * Account for old cluster and the possibly new forward and
- * back clusters.
- */
- i = back + forw + 1;
- if (i > fs->fs_contigsumsize)
- i = fs->fs_contigsumsize;
- sump[i] += cnt;
- if (back > 0)
- sump[back] -= cnt;
- if (forw > 0)
- sump[forw] -= cnt;
- /*
- * Update cluster summary information.
- */
- lp = &sump[fs->fs_contigsumsize];
- for (i = fs->fs_contigsumsize; i > 0; i--)
- if (*lp-- > 0)
- break;
- fs->fs_maxcluster[cgp->cg_cgx] = i;
-}
-
-/*
* Fserr prints the name of a filesystem with an error diagnostic.
*
* The form of the error message is:
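
Like ffs_isfreeblock() above, ffs_clusteracct() leaves this file; the prototype added to ffs_extern.h below drops the struct ufsmount argument, matching the updated callers earlier in this diff. Since the body is deleted here, a worked example of its summary arithmetic may help:

    /*
     * Freeing (cnt = 1) a block whose neighbours already form free
     * clusters of back = 2 and forw = 1 blocks, with
     * fs_contigsumsize >= 4:
     *
     *     i = back + forw + 1 = 4;
     *     sump[4] += 1;        new merged 4-block cluster
     *     sump[2] -= 1;        old backward cluster absorbed
     *     sump[1] -= 1;        old forward cluster absorbed
     *
     * fs_maxcluster[cg] is then recomputed as the largest i for
     * which sump[i] > 0.
     */
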
@@ -2540,7 +2454,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
#endif /* DEBUG */
while (cmd.size > 0) {
if ((error = ffs_freefile(ump, fs, ump->um_devvp,
- cmd.value, filetype)))
+ cmd.value, filetype, NULL)))
break;
cmd.size -= 1;
cmd.value += 1;
@@ -2568,7 +2482,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
if (blksize > blkcnt)
blksize = blkcnt;
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
- blksize * fs->fs_fsize, ROOTINO);
+ blksize * fs->fs_fsize, ROOTINO, NULL);
blkno += blksize;
blkcnt -= blksize;
blksize = fs->fs_frag;
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index a12f96e60d0e..6d5f27c1f306 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -120,6 +120,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
if (lbn < 0)
return (EFBIG);
+ if (DOINGSOFTDEP(vp))
+ softdep_prealloc(vp, MNT_WAIT);
/*
* If the next write will extend the file into a new block,
* and the file is currently composed of a fragment
@@ -418,6 +420,8 @@ fail:
* slow, running out of disk space is not expected to be a common
* occurrence. The error return from fsync is ignored as we already
* have an error to return to the user.
+ *
+ * XXX Still have to journal the free below
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -473,7 +477,7 @@ fail:
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
}
return (error);
}
@@ -515,6 +519,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
if (lbn < 0)
return (EFBIG);
+ if (DOINGSOFTDEP(vp))
+ softdep_prealloc(vp, MNT_WAIT);
+
/*
* Check for allocating external data.
*/
@@ -930,6 +937,8 @@ fail:
* slow, running out of disk space is not expected to be a common
* occurrence. The error return from fsync is ignored as we already
* have an error to return to the user.
+ *
+ * XXX Still have to journal the free below
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@@ -985,7 +994,7 @@ fail:
*/
for (blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
}
return (error);
}
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index 7e32ced2ebe2..7011623749ba 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -47,6 +47,7 @@ struct ucred;
struct vnode;
struct vop_fsync_args;
struct vop_reallocblks_args;
+struct workhead;
int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int,
struct ucred *, ufs2_daddr_t *);
@@ -56,20 +57,23 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size,
struct ucred *a_cred, int a_flags, struct buf **a_bpp);
int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
- ufs2_daddr_t, long, ino_t);
+ ufs2_daddr_t, long, ino_t, struct workhead *);
ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
void ffs_bdflush(struct bufobj *, struct buf *);
int ffs_copyonwrite(struct vnode *, struct buf *);
int ffs_flushfiles(struct mount *, int, struct thread *);
void ffs_fragacct(struct fs *, int, int32_t [], int);
int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t,
- int);
+ int, struct workhead *);
int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
+int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
int ffs_mountroot(void);
+void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
int ffs_reallocblks(struct vop_reallocblks_args *);
int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
ufs2_daddr_t, int, int, int, struct ucred *, struct buf **);
@@ -103,12 +107,14 @@ extern struct vop_vector ffs_fifoops2;
int softdep_check_suspend(struct mount *, struct vnode *,
int, int, int, int);
+int softdep_complete_trunc(struct vnode *, void *);
void softdep_get_depcounts(struct mount *, int *, int *);
void softdep_initialize(void);
void softdep_uninitialize(void);
int softdep_mount(struct vnode *, struct mount *, struct fs *,
struct ucred *);
-void softdep_move_dependencies(struct buf *, struct buf *);
+void softdep_unmount(struct mount *);
+int softdep_move_dependencies(struct buf *, struct buf *);
int softdep_flushworklist(struct mount *, int *, struct thread *);
int softdep_flushfiles(struct mount *, int, struct thread *);
void softdep_update_inodeblock(struct inode *, struct buf *, int);
@@ -117,7 +123,8 @@ void softdep_freefile(struct vnode *, ino_t, int);
int softdep_request_cleanup(struct fs *, struct vnode *);
void softdep_setup_freeblocks(struct inode *, off_t, int);
void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t);
-void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t);
+void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t,
+ int, int);
void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t,
ufs2_daddr_t, long, long, struct buf *);
void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t,
@@ -126,11 +133,20 @@ void softdep_setup_allocindir_meta(struct buf *, struct inode *,
struct buf *, int, ufs2_daddr_t);
void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t,
struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *);
+void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int,
+ struct workhead *);
+void softdep_setup_inofree(struct mount *, struct buf *, ino_t,
+ struct workhead *);
+void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
+void *softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
void softdep_fsync_mountdev(struct vnode *);
int softdep_sync_metadata(struct vnode *);
int softdep_process_worklist(struct mount *, int);
int softdep_fsync(struct vnode *);
int softdep_waitidle(struct mount *);
+int softdep_prealloc(struct vnode *, int);
+int softdep_journal_lookup(struct mount *, struct vnode **);
+
int ffs_rdonly(struct inode *);
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index b2f906730121..3b6983258b93 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -92,15 +92,6 @@ ffs_update(vp, waitfor)
fs = ip->i_fs;
if (fs->fs_ronly)
return (0);
- /*
- * Ensure that uid and gid are correct. This is a temporary
- * fix until fsck has been changed to do the update.
- */
- if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
- fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
- ip->i_din1->di_ouid = ip->i_uid; /* XXX */
- ip->i_din1->di_ogid = ip->i_gid; /* XXX */
- } /* XXX */
error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, NOCRED, &bp);
if (error) {
@@ -160,6 +151,7 @@ ffs_truncate(vp, length, flags, cred, td)
ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
ufs2_daddr_t count, blocksreleased = 0, datablocks;
+ void *cookie;
struct bufobj *bo;
struct fs *fs;
struct buf *bp;
@@ -173,11 +165,14 @@ ffs_truncate(vp, length, flags, cred, td)
fs = ip->i_fs;
ump = ip->i_ump;
bo = &vp->v_bufobj;
+ cookie = NULL;
ASSERT_VOP_LOCKED(vp, "ffs_truncate");
if (length < 0)
return (EINVAL);
+ if (length > fs->fs_maxfilesize)
+ return (EFBIG);
/*
* Historically clients did not have to specify which data
* they were truncating. So, if not specified, we assume
@@ -192,6 +187,7 @@ ffs_truncate(vp, length, flags, cred, td)
* (e.g., the file is being unlinked), then pick it off with
* soft updates below.
*/
+ allerror = 0;
needextclean = 0;
softdepslowdown = DOINGSOFTDEP(vp) && softdep_slowdown(vp);
extblocks = 0;
@@ -212,6 +208,8 @@ ffs_truncate(vp, length, flags, cred, td)
panic("ffs_truncate: partial trunc of extdata");
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
return (error);
+ if (DOINGSUJ(vp))
+ cookie = softdep_setup_trunc(vp, length, flags);
osize = ip->i_din2->di_extsize;
ip->i_din2->di_blocks -= extblocks;
#ifdef QUOTA
@@ -227,19 +225,19 @@ ffs_truncate(vp, length, flags, cred, td)
}
ip->i_flag |= IN_CHANGE;
if ((error = ffs_update(vp, 1)))
- return (error);
+ goto out;
for (i = 0; i < NXADDR; i++) {
if (oldblks[i] == 0)
continue;
ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i],
- sblksize(fs, osize, i), ip->i_number);
+ sblksize(fs, osize, i), ip->i_number, NULL);
}
}
}
- if ((flags & IO_NORMAL) == 0)
- return (0);
- if (length > fs->fs_maxfilesize)
- return (EFBIG);
+ if ((flags & IO_NORMAL) == 0) {
+ error = 0;
+ goto out;
+ }
if (vp->v_type == VLNK &&
(ip->i_size < vp->v_mount->mnt_maxsymlinklen ||
datablocks == 0)) {
@@ -253,24 +251,52 @@ ffs_truncate(vp, length, flags, cred, td)
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
- return (ffs_update(vp, 1));
+ error = ffs_update(vp, 1);
+ goto out;
}
if (ip->i_size == length) {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
- return (ffs_update(vp, 0));
+ error = ffs_update(vp, 0);
+ goto out;
}
if (fs->fs_ronly)
panic("ffs_truncate: read-only filesystem");
#ifdef QUOTA
error = getinoquota(ip);
if (error)
- return (error);
+ goto out;
#endif
if ((ip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(vp);
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+ osize = ip->i_size;
+ /*
+ * Lengthen the size of the file. We must ensure that the
+ * last byte of the file is allocated. Since the smallest
+ * value of osize is 0, length will be at least 1.
+ */
+ if (osize < length) {
+ vnode_pager_setsize(vp, length);
+ flags |= BA_CLRBUF;
+ error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
+ if (error) {
+ vnode_pager_setsize(vp, osize);
+ goto out;
+ }
+ ip->i_size = length;
+ DIP_SET(ip, i_size, length);
+ if (bp->b_bufsize == fs->fs_bsize)
+ bp->b_flags |= B_CLUSTEROK;
+ if (flags & IO_SYNC)
+ bwrite(bp);
+ else
+ bawrite(bp);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ error = ffs_update(vp, 1);
+ goto out;
+ }
if (DOINGSOFTDEP(vp)) {
if (length > 0 || softdepslowdown) {
/*
@@ -283,11 +309,18 @@ ffs_truncate(vp, length, flags, cred, td)
* so that it will have no data structures left.
*/
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
- return (error);
+ goto out;
UFS_LOCK(ump);
if (ip->i_flag & IN_SPACECOUNTED)
fs->fs_pendingblocks -= datablocks;
UFS_UNLOCK(ump);
+ /*
+ * We have to journal the truncation before we change
+ * any blocks so we don't leave the file partially
+ * truncated.
+ */
+ if (DOINGSUJ(vp) && cookie == NULL)
+ cookie = softdep_setup_trunc(vp, length, flags);
} else {
#ifdef QUOTA
(void) chkdq(ip, -datablocks, NOCRED, 0);
@@ -301,33 +334,9 @@ ffs_truncate(vp, length, flags, cred, td)
OFF_TO_IDX(lblktosize(fs, -extblocks)));
vnode_pager_setsize(vp, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
- return (ffs_update(vp, 0));
- }
- }
- osize = ip->i_size;
- /*
- * Lengthen the size of the file. We must ensure that the
- * last byte of the file is allocated. Since the smallest
- * value of osize is 0, length will be at least 1.
- */
- if (osize < length) {
- vnode_pager_setsize(vp, length);
- flags |= BA_CLRBUF;
- error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
- if (error) {
- vnode_pager_setsize(vp, osize);
- return (error);
+ error = ffs_update(vp, 0);
+ goto out;
}
- ip->i_size = length;
- DIP_SET(ip, i_size, length);
- if (bp->b_bufsize == fs->fs_bsize)
- bp->b_flags |= B_CLUSTEROK;
- if (flags & IO_SYNC)
- bwrite(bp);
- else
- bawrite(bp);
- ip->i_flag |= IN_CHANGE | IN_UPDATE;
- return (ffs_update(vp, 1));
}
/*
* Shorten the size of the file. If the file is not being
@@ -345,9 +354,8 @@ ffs_truncate(vp, length, flags, cred, td)
lbn = lblkno(fs, length);
flags |= BA_CLRBUF;
error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
- if (error) {
- return (error);
- }
+ if (error)
+ goto out;
/*
* When we are doing soft updates and the UFS_BALLOC
* above fills in a direct block hole with a full sized
@@ -359,7 +367,7 @@ ffs_truncate(vp, length, flags, cred, td)
if (DOINGSOFTDEP(vp) && lbn < NDADDR &&
fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
(error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
- return (error);
+ goto out;
ip->i_size = length;
DIP_SET(ip, i_size, length);
size = blksize(fs, ip, lbn);
@@ -405,7 +413,13 @@ ffs_truncate(vp, length, flags, cred, td)
DIP_SET(ip, i_db[i], 0);
}
ip->i_flag |= IN_CHANGE | IN_UPDATE;
- allerror = ffs_update(vp, 1);
+ /*
+ * When doing soft updates journaling we must preserve the size along
+ * with the old pointers until they are freed, or we might not
+ * know how many fragments remain.
+ */
+ if (!DOINGSUJ(vp))
+ allerror = ffs_update(vp, 1);
/*
* Having written the new inode to disk, save its new configuration
@@ -445,7 +459,7 @@ ffs_truncate(vp, length, flags, cred, td)
if (lastiblock[level] < 0) {
DIP_SET(ip, i_ib[level], 0);
ffs_blkfree(ump, fs, ip->i_devvp, bn,
- fs->fs_bsize, ip->i_number);
+ fs->fs_bsize, ip->i_number, NULL);
blocksreleased += nblocks;
}
}
@@ -464,7 +478,8 @@ ffs_truncate(vp, length, flags, cred, td)
continue;
DIP_SET(ip, i_db[i], 0);
bsize = blksize(fs, ip, i);
- ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number);
+ ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number,
+ NULL);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
@@ -496,7 +511,7 @@ ffs_truncate(vp, length, flags, cred, td)
*/
bn += numfrags(fs, newspace);
ffs_blkfree(ump, fs, ip->i_devvp, bn,
- oldspace - newspace, ip->i_number);
+ oldspace - newspace, ip->i_number, NULL);
blocksreleased += btodb(oldspace - newspace);
}
}
@@ -528,7 +543,14 @@ done:
#ifdef QUOTA
(void) chkdq(ip, -blocksreleased, NOCRED, 0);
#endif
- return (allerror);
+ error = allerror;
+out:
+ if (cookie) {
+ allerror = softdep_complete_trunc(vp, cookie);
+ if (allerror != 0 && error == 0)
+ error = allerror;
+ }
+ return (error);
}
/*
@@ -638,7 +660,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
blocksreleased += blkcount;
}
ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize,
- ip->i_number);
+ ip->i_number, NULL);
blocksreleased += nblocks;
}
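
The ffs_truncate() changes above follow a cookie protocol for SU+J: the truncation must be journaled before any block pointers change, and every return is rewritten to funnel through the out: label so the cookie is always completed. Condensed from the hunks above:

    void *cookie = NULL;

    if (DOINGSUJ(vp))
        cookie = softdep_setup_trunc(vp, length, flags);
    /* ... shrink or grow the file, freeing blocks as needed ... */
    error = allerror;
out:
    if (cookie) {
        allerror = softdep_complete_trunc(vp, cookie);
        if (allerror != 0 && error == 0)
            error = allerror;
    }
    return (error);
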
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index b36cb58808bd..11362cfbc755 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -142,7 +142,7 @@ MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
- ufs_lbn_t, int), int);
+ ufs_lbn_t, int), int, int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
@@ -155,7 +155,7 @@ static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
- ufs_lbn_t, int), int);
+ ufs_lbn_t, int), int, int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
@@ -582,7 +582,8 @@ loop:
len = fragroundup(fs, blkoff(fs, xp->i_size));
if (len != 0 && len < fs->fs_bsize) {
ffs_blkfree(ump, copy_fs, vp,
- DIP(xp, i_db[loc]), len, xp->i_number);
+ DIP(xp, i_db[loc]), len, xp->i_number,
+ NULL);
blkno = DIP(xp, i_db[loc]);
DIP_SET(xp, i_db[loc], 0);
}
@@ -590,15 +591,15 @@ loop:
snaplistsize += 1;
if (xp->i_ump->um_fstype == UFS1)
error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
- BLK_NOCOPY);
+ BLK_NOCOPY, 1);
else
error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
- BLK_NOCOPY);
+ BLK_NOCOPY, 1);
if (blkno)
DIP_SET(xp, i_db[loc], blkno);
if (!error)
error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
- xp->i_mode);
+ xp->i_mode, NULL);
VOP_UNLOCK(xvp, 0);
vdrop(xvp);
if (error) {
@@ -612,6 +613,26 @@ loop:
}
MNT_IUNLOCK(mp);
/*
+ * Erase the journal file from the snapshot.
+ */
+ if (fs->fs_flags & FS_SUJ) {
+ error = softdep_journal_lookup(mp, &xvp);
+ if (error) {
+ free(copy_fs->fs_csp, M_UFSMNT);
+ bawrite(sbp);
+ sbp = NULL;
+ goto out1;
+ }
+ xp = VTOI(xvp);
+ if (xp->i_ump->um_fstype == UFS1)
+ error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
+ BLK_NOCOPY, 0);
+ else
+ error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
+ BLK_NOCOPY, 0);
+ vput(xvp);
+ }
+ /*
* Acquire a lock on the snapdata structure, creating it if necessary.
*/
sn = ffs_snapdata_acquire(devvp);
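
The expunge_ufs1()/expunge_ufs2() routines gain a clearmode flag so the caller, rather than the expunge type, decides whether di_mode is zeroed in the snapshot's copy of the inode (the UFS2 path also now honors i_effnlink, matching UFS1). The call patterns in this diff, summarized for the UFS2 case:

    /* Active files captured by a new snapshot: clear the mode. */
    expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 1);

    /*
     * The SU+J journal file: release its blocks but presumably keep
     * di_mode so the inode remains valid within the snapshot.
     */
    expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 0);

    /* Earlier snapshots, and this snapshot's own block map: */
    expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP, 0);
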
@@ -691,16 +712,16 @@ out1:
break;
if (xp->i_ump->um_fstype == UFS1)
error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
- BLK_SNAP);
+ BLK_SNAP, 0);
else
error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
- BLK_SNAP);
+ BLK_SNAP, 0);
if (error == 0 && xp->i_effnlink == 0) {
error = ffs_freefile(ump,
copy_fs,
vp,
xp->i_number,
- xp->i_mode);
+ xp->i_mode, NULL);
}
if (error) {
fs->fs_snapinum[snaploc] = 0;
@@ -719,9 +740,11 @@ out1:
* the list of allocated blocks in i_snapblklist.
*/
if (ip->i_ump->um_fstype == UFS1)
- error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
+ error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
+ BLK_SNAP, 0);
else
- error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
+ error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
+ BLK_SNAP, 0);
if (error) {
fs->fs_snapinum[snaploc] = 0;
free(snapblklist, M_UFSMNT);
@@ -954,13 +977,14 @@ cgaccount(cg, vp, nbp, passno)
* is reproduced once each for UFS1 and UFS2.
*/
static int
-expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
struct vnode *snapvp;
struct inode *cancelip;
struct fs *fs;
int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
struct fs *, ufs_lbn_t, int);
int expungetype;
+ int clearmode;
{
int i, error, indiroff;
ufs_lbn_t lbn, rlbn;
@@ -1005,7 +1029,7 @@ expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
*/
dip = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
- if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
+ if (clearmode || cancelip->i_effnlink == 0)
dip->di_mode = 0;
dip->di_size = 0;
dip->di_blocks = 0;
@@ -1220,7 +1244,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
- ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+ ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
}
return (0);
}
@@ -1234,13 +1258,14 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
* is reproduced once each for UFS1 and UFS2.
*/
static int
-expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
struct vnode *snapvp;
struct inode *cancelip;
struct fs *fs;
int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
struct fs *, ufs_lbn_t, int);
int expungetype;
+ int clearmode;
{
int i, error, indiroff;
ufs_lbn_t lbn, rlbn;
@@ -1285,7 +1310,7 @@ expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
*/
dip = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
- if (expungetype == BLK_NOCOPY)
+ if (clearmode || cancelip->i_effnlink == 0)
dip->di_mode = 0;
dip->di_size = 0;
dip->di_blocks = 0;
@@ -1500,7 +1525,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
- ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+ ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
}
return (0);
}
@@ -1657,6 +1682,13 @@ ffs_snapremove(vp)
ip->i_flags &= ~SF_SNAPSHOT;
DIP_SET(ip, i_flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ /*
+ * The dirtied indirects must be written out before
+ * softdep_setup_freeblocks() is called. Otherwise indir_trunc()
+ * may find indirect pointers using the magic BLK_* values.
+ */
+ if (DOINGSOFTDEP(vp))
+ ffs_syncvnode(vp, MNT_WAIT);
#ifdef QUOTA
/*
* Reenable disk quotas for ex-snapshot file.
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 4d652c114dd1..4a659f9de7ba 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -1,5 +1,7 @@
/*-
- * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
+ * Copyright 1998, 2000 Marshall Kirk McKusick.
+ * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
*
* The soft updates code is derived from the appendix of a University
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
@@ -23,17 +25,16 @@
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
- * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
*/
@@ -50,6 +51,7 @@ __FBSDID("$FreeBSD$");
#ifndef DEBUG
#define DEBUG
#endif
+#define SUJ_DEBUG
#include <sys/param.h>
#include <sys/kernel.h>
@@ -62,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
+#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
@@ -130,10 +133,12 @@ softdep_setup_inomapdep(bp, ip, newinum)
}
void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
struct buf *bp;
struct mount *mp;
ufs2_daddr_t newblkno;
+ int frags;
+ int oldfrags;
{
panic("softdep_setup_blkmapdep called");
@@ -227,7 +232,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
}
void
-softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
+softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
+ struct buf *bp;
struct inode *dp;
caddr_t base;
caddr_t oldloc;
@@ -403,31 +409,13 @@ softdep_get_depcounts(struct mount *mp,
* These definitions need to be adapted to the system to which
* this file is being ported.
*/
-/*
- * malloc types defined for the softdep system.
- */
-static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
-static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
-static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
-static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
-static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
-static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
-static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
-static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
-static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
-static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
-static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
-static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
-static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
-static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
-static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
#define D_PAGEDEP 0
#define D_INODEDEP 1
-#define D_NEWBLK 2
-#define D_BMSAFEMAP 3
+#define D_BMSAFEMAP 2
+#define D_NEWBLK 3
#define D_ALLOCDIRECT 4
#define D_INDIRDEP 5
#define D_ALLOCINDIR 6
@@ -438,7 +426,67 @@ static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
#define D_MKDIR 11
#define D_DIRREM 12
#define D_NEWDIRBLK 13
-#define D_LAST D_NEWDIRBLK
+#define D_FREEWORK 14
+#define D_FREEDEP 15
+#define D_JADDREF 16
+#define D_JREMREF 17
+#define D_JMVREF 18
+#define D_JNEWBLK 19
+#define D_JFREEBLK 20
+#define D_JFREEFRAG 21
+#define D_JSEG 22
+#define D_JSEGDEP 23
+#define D_SBDEP 24
+#define D_JTRUNC 25
+#define D_LAST D_JTRUNC
+
+unsigned long dep_current[D_LAST + 1];
+unsigned long dep_total[D_LAST + 1];
+
+
+SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
+SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
+ "total dependencies allocated");
+SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
+ "current dependencies allocated");
+
+#define SOFTDEP_TYPE(type, str, long) \
+ static MALLOC_DEFINE(M_ ## type, #str, long); \
+ SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \
+ &dep_total[D_ ## type], 0, ""); \
+ SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \
+ &dep_current[D_ ## type], 0, "");
+
+SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
+SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
+SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
+ "Block or frag allocated from cyl group map");
+SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
+SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
+SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
+SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
+SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
+SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
+SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
+SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
+SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
+SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
+SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
+SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
+SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
+SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
+SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
+SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
+SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
+SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
+SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
+SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
+SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
+SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
+SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
+
+static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
+static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
/*
* translate from workitem type to memory type
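
The SOFTDEP_TYPE() macro introduced above folds the old per-type MALLOC_DEFINE lines into one definition and pairs each dependency type with two read-only sysctl counters; for instance, SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies") expands to roughly:

    static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
    SYSCTL_LONG(_debug_softdep_total, OID_AUTO, pagedep, CTLFLAG_RD,
        &dep_total[D_PAGEDEP], 0, "");
    SYSCTL_LONG(_debug_softdep_current, OID_AUTO, pagedep, CTLFLAG_RD,
        &dep_current[D_PAGEDEP], 0, "");
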
@@ -447,8 +495,8 @@ static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
static struct malloc_type *memtype[] = {
M_PAGEDEP,
M_INODEDEP,
- M_NEWBLK,
M_BMSAFEMAP,
+ M_NEWBLK,
M_ALLOCDIRECT,
M_INDIRDEP,
M_ALLOCINDIR,
@@ -458,7 +506,19 @@ static struct malloc_type *memtype[] = {
M_DIRADD,
M_MKDIR,
M_DIRREM,
- M_NEWDIRBLK
+ M_NEWDIRBLK,
+ M_FREEWORK,
+ M_FREEDEP,
+ M_JADDREF,
+ M_JREMREF,
+ M_JMVREF,
+ M_JNEWBLK,
+ M_JFREEBLK,
+ M_JFREEFRAG,
+ M_JSEG,
+ M_JSEGDEP,
+ M_SBDEP,
+ M_JTRUNC
};
#define DtoM(type) (memtype[type])
@@ -467,17 +527,21 @@ static struct malloc_type *memtype[] = {
* Names of malloc types.
*/
#define TYPENAME(type) \
- ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
+ ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
* End system adaptation definitions.
*/
+#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino)
+#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino)
+
/*
* Forward declarations.
*/
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;
+struct bmsafemap_hashhead;
/*
* Internal function prototypes.
@@ -487,59 +551,172 @@ static void drain_output(struct vnode *);
static struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static void clear_remove(struct thread *);
static void clear_inodedeps(struct thread *);
+static void unlinked_inodedep(struct mount *, struct inodedep *);
+static void clear_unlinked_inodedep(struct inodedep *);
+static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
struct diraddhd *);
+static void free_pagedep(struct pagedep *);
+static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
static int flush_inodedep_deps(struct mount *, ino_t);
static int flush_deplist(struct allocdirectlst *, int, int *);
static int handle_written_filepage(struct pagedep *, struct buf *);
+static int handle_written_sbdep(struct sbdep *, struct buf *);
+static void initiate_write_sbdep(struct sbdep *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
+static int handle_written_indirdep(struct indirdep *, struct buf *,
+ struct buf**);
static int handle_written_inodeblock(struct inodedep *, struct buf *);
-static void handle_allocdirect_partdone(struct allocdirect *);
+static int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
+static void handle_written_jaddref(struct jaddref *);
+static void handle_written_jremref(struct jremref *);
+static void handle_written_jseg(struct jseg *, struct buf *);
+static void handle_written_jnewblk(struct jnewblk *);
+static void handle_written_jfreeblk(struct jfreeblk *);
+static void handle_written_jfreefrag(struct jfreefrag *);
+static void complete_jseg(struct jseg *);
+static void jseg_write(struct fs *, struct jblocks *, struct jseg *,
+ uint8_t *);
+static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
+static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
+static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
+static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
+static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
+static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
+static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
+static inline void inoref_write(struct inoref *, struct jseg *,
+ struct jrefrec *);
+static void handle_allocdirect_partdone(struct allocdirect *,
+ struct workhead *);
+static void cancel_newblk(struct newblk *, struct workhead *);
+static void indirdep_complete(struct indirdep *);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
+static void initiate_write_indirdep(struct indirdep*, struct buf *);
static void handle_written_mkdir(struct mkdir *, int);
+static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static void handle_workitem_freefile(struct freefile *);
static void handle_workitem_remove(struct dirrem *, struct vnode *);
static struct dirrem *newdirrem(struct buf *, struct inode *,
struct inode *, int, struct dirrem **);
-static void free_diradd(struct diradd *);
-static void free_allocindir(struct allocindir *, struct inodedep *);
+static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
+ struct freeblks *);
+static void free_indirdep(struct indirdep *);
+static void free_diradd(struct diradd *, struct workhead *);
+static void merge_diradd(struct inodedep *, struct diradd *);
+static void complete_diradd(struct diradd *);
+static struct diradd *diradd_lookup(struct pagedep *, int);
+static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
+ struct jremref *);
+static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
+ struct jremref *);
+static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
+ struct jremref *, struct jremref *);
+static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
+ struct jremref *);
+static void cancel_allocindir(struct allocindir *, struct inodedep *,
+ struct freeblks *);
+static void complete_mkdir(struct mkdir *);
static void free_newdirblk(struct newdirblk *);
-static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
- ufs2_daddr_t *);
-static void deallocate_dependencies(struct buf *, struct inodedep *);
-static void free_allocdirect(struct allocdirectlst *,
- struct allocdirect *, int);
+static void free_jremref(struct jremref *);
+static void free_jaddref(struct jaddref *);
+static void free_jsegdep(struct jsegdep *);
+static void free_jseg(struct jseg *);
+static void free_jnewblk(struct jnewblk *);
+static void free_jfreeblk(struct jfreeblk *);
+static void free_jfreefrag(struct jfreefrag *);
+static void free_freedep(struct freedep *);
+static void journal_jremref(struct dirrem *, struct jremref *,
+ struct inodedep *);
+static void cancel_jnewblk(struct jnewblk *, struct workhead *);
+static int cancel_jaddref(struct jaddref *, struct inodedep *,
+ struct workhead *);
+static void cancel_jfreefrag(struct jfreefrag *);
+static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
+static int deallocate_dependencies(struct buf *, struct inodedep *,
+ struct freeblks *);
+static void free_newblk(struct newblk *);
+static void cancel_allocdirect(struct allocdirectlst *,
+ struct allocdirect *, struct freeblks *, int);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
+static void freework_freeblock(struct freework *);
static void handle_workitem_freeblocks(struct freeblks *, int);
+static void handle_complete_freeblocks(struct freeblks *);
+static void handle_workitem_indirblk(struct freework *);
+static void handle_written_freework(struct freework *);
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static void setup_allocindir_phase2(struct buf *, struct inode *,
- struct allocindir *);
+ struct inodedep *, struct allocindir *, ufs_lbn_t);
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
- ufs2_daddr_t);
+ ufs2_daddr_t, ufs_lbn_t);
static void handle_workitem_freefrag(struct freefrag *);
-static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
+static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
+ ufs_lbn_t);
static void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
-static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
-static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
- struct newblk **);
-static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
+static struct freefrag *allocindir_merge(struct allocindir *,
+ struct allocindir *);
+static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
+ struct bmsafemap **);
+static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
+ int cg);
+static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
+ int, struct newblk **);
+static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
struct inodedep **);
static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
-static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
+static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
+ struct pagedep **);
static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
struct mount *mp, int, struct pagedep **);
static void pause_timer(void *);
static int request_cleanup(struct mount *, int);
static int process_worklist_item(struct mount *, int);
-static void add_to_worklist(struct worklist *);
+static void process_removes(struct vnode *);
+static void jwork_move(struct workhead *, struct workhead *);
+static void add_to_worklist(struct worklist *, int);
+static void remove_from_worklist(struct worklist *);
static void softdep_flush(void);
static int softdep_speedup(void);
+static void worklist_speedup(void);
+static int journal_mount(struct mount *, struct fs *, struct ucred *);
+static void journal_unmount(struct mount *);
+static int journal_space(struct ufsmount *, int);
+static void journal_suspend(struct ufsmount *);
+static void softdep_prelink(struct vnode *, struct vnode *);
+static void add_to_journal(struct worklist *);
+static void remove_from_journal(struct worklist *);
+static void softdep_process_journal(struct mount *, int);
+static struct jremref *newjremref(struct dirrem *, struct inode *,
+ struct inode *ip, off_t, nlink_t);
+static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
+ uint16_t);
+static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
+ uint16_t);
+static inline struct jsegdep *inoref_jseg(struct inoref *);
+static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
+static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
+ ufs2_daddr_t, int);
+static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
+ ufs2_daddr_t, long, ufs_lbn_t);
+static struct freework *newfreework(struct freeblks *, struct freework *,
+ ufs_lbn_t, ufs2_daddr_t, int, int);
+static void jwait(struct worklist *wk);
+static struct inodedep *inodedep_lookup_ip(struct inode *);
+static int bmsafemap_rollbacks(struct bmsafemap *);
+static struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
+static void handle_jwork(struct workhead *);
+static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
+ struct mkdir **);
+static struct jblocks *jblocks_create(void);
+static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
+static void jblocks_free(struct jblocks *, struct mount *, int);
+static void jblocks_destroy(struct jblocks *);
+static void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
/*
* Exported softdep operations.
@@ -572,40 +749,128 @@ MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
(item)->wk_state &= ~ONWORKLIST; \
LIST_REMOVE(item, wk_list); \
} while (0)
+#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
+#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE
+
#else /* DEBUG */
-static void worklist_insert(struct workhead *, struct worklist *);
-static void worklist_remove(struct worklist *);
+static void worklist_insert(struct workhead *, struct worklist *, int);
+static void worklist_remove(struct worklist *, int);
-#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
-#define WORKLIST_REMOVE(item) worklist_remove(item)
+#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
+#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
+#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
+#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
static void
-worklist_insert(head, item)
+worklist_insert(head, item, locked)
struct workhead *head;
struct worklist *item;
+ int locked;
{
- mtx_assert(&lk, MA_OWNED);
+ if (locked)
+ mtx_assert(&lk, MA_OWNED);
if (item->wk_state & ONWORKLIST)
- panic("worklist_insert: already on list");
+ panic("worklist_insert: %p %s(0x%X) already on list",
+ item, TYPENAME(item->wk_type), item->wk_state);
item->wk_state |= ONWORKLIST;
LIST_INSERT_HEAD(head, item, wk_list);
}
static void
-worklist_remove(item)
+worklist_remove(item, locked)
struct worklist *item;
+ int locked;
{
- mtx_assert(&lk, MA_OWNED);
+ if (locked)
+ mtx_assert(&lk, MA_OWNED);
if ((item->wk_state & ONWORKLIST) == 0)
- panic("worklist_remove: not on list");
+ panic("worklist_remove: %p %s(0x%X) not on list",
+ item, TYPENAME(item->wk_type), item->wk_state);
item->wk_state &= ~ONWORKLIST;
LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */
/*
+ * Merge two jsegdeps, keeping only the oldest one, as newer references
+ * cannot be discarded until after older ones.
+ */
+static inline struct jsegdep *
+jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
+{
+ struct jsegdep *swp;
+
+ if (two == NULL)
+ return (one);
+
+ if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
+ swp = one;
+ one = two;
+ two = swp;
+ }
+ WORKLIST_REMOVE(&two->jd_list);
+ free_jsegdep(two);
+
+ return (one);
+}
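
A minimal standalone sketch (not part of the patch) of the merge rule above; struct seg and merge_oldest are simplified stand-ins for the kernel types, keeping the jsegdep with the lower js_seq because journal segments are retired in sequence order:

#include <stdio.h>

struct seg { unsigned long seq; };	/* stand-in for jseg/js_seq */

static struct seg *
merge_oldest(struct seg *one, struct seg *two)
{
	struct seg *swp;

	if (two == NULL)
		return (one);
	if (one->seq > two->seq) {	/* keep the lower (older) sequence */
		swp = one;
		one = two;
		two = swp;
	}
	/* The newer reference ('two') would be freed here. */
	return (one);
}

int
main(void)
{
	struct seg a = { 5 }, b = { 9 };

	printf("kept seq %lu\n", merge_oldest(&a, &b)->seq);	/* 5 */
	return (0);
}
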
+
+/*
+ * If two freedeps are compatible, free one to reduce list size.
+ */
+static inline struct freedep *
+freedep_merge(struct freedep *one, struct freedep *two)
+{
+ if (two == NULL)
+ return (one);
+
+ if (one->fd_freework == two->fd_freework) {
+ WORKLIST_REMOVE(&two->fd_list);
+ free_freedep(two);
+ }
+ return (one);
+}
+
+/*
+ * Move journal work from one list to another. Duplicate freedeps and
+ * jsegdeps are coalesced to keep the lists as small as possible.
+ */
+static void
+jwork_move(dst, src)
+ struct workhead *dst;
+ struct workhead *src;
+{
+ struct freedep *freedep;
+ struct jsegdep *jsegdep;
+ struct worklist *wkn;
+ struct worklist *wk;
+
+ KASSERT(dst != src,
+ ("jwork_move: dst == src"));
+ freedep = NULL;
+ jsegdep = NULL;
+ LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
+ if (wk->wk_type == D_JSEGDEP)
+ jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
+ if (wk->wk_type == D_FREEDEP)
+ freedep = freedep_merge(WK_FREEDEP(wk), freedep);
+ }
+
+ mtx_assert(&lk, MA_OWNED);
+ while ((wk = LIST_FIRST(src)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ WORKLIST_INSERT(dst, wk);
+ if (wk->wk_type == D_JSEGDEP) {
+ jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
+ continue;
+ }
+ if (wk->wk_type == D_FREEDEP)
+ freedep = freedep_merge(WK_FREEDEP(wk), freedep);
+ }
+}
+
+/*
* Routines for tracking and managing workitems.
*/
static void workitem_free(struct worklist *, int);
@@ -623,13 +888,16 @@ workitem_free(item, type)
#ifdef DEBUG
if (item->wk_state & ONWORKLIST)
- panic("workitem_free: still on list");
+ panic("workitem_free: %s(0x%X) still on list",
+ TYPENAME(item->wk_type), item->wk_state);
if (item->wk_type != type)
- panic("workitem_free: type mismatch");
+ panic("workitem_free: type mismatch %s != %s",
+ TYPENAME(item->wk_type), TYPENAME(type));
#endif
ump = VFSTOUFS(item->wk_mp);
if (--ump->softdep_deps == 0 && ump->softdep_req)
wakeup(&ump->softdep_deps);
+ dep_current[type]--;
free(item, DtoM(type));
}
@@ -643,6 +911,8 @@ workitem_alloc(item, type, mp)
item->wk_mp = mp;
item->wk_state = 0;
ACQUIRE_LOCK(&lk);
+ dep_current[type]++;
+ dep_total[type]++;
VFSTOUFS(mp)->softdep_deps++;
VFSTOUFS(mp)->softdep_accdeps++;
FREE_LOCK(&lk);
@@ -678,24 +948,66 @@ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
-
-SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
-SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
-/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
+static int stat_jaddref; /* bufs redirtied as ino bitmap cannot write */
+static int stat_jnewblk; /* bufs redirtied as blk bitmap cannot write */
+static int stat_journal_min; /* Times hit journal min threshold */
+static int stat_journal_low; /* Times hit journal low threshold */
+static int stat_journal_wait; /* Times blocked in jwait(). */
+static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
+static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
+static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */
+static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */
+
+SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
+ &max_softdeps, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
+ &tickdelay, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
+ &maxindirdeps, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
+ &stat_worklist_push, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
+ &stat_blk_limit_push, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
+ &stat_ino_limit_push, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
+ &stat_blk_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
+ &stat_ino_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
+ &stat_sync_limit_hit, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
+ &stat_indir_blk_ptrs, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
+ &stat_inode_bitmap, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
+ &stat_direct_blk_ptrs, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
+ &stat_dir_entry, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
+ &stat_jaddref, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
+ &stat_jnewblk, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
+ &stat_journal_low, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
+ &stat_journal_min, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
+ &stat_journal_wait, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
+ &stat_jwait_filepage, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
+ &stat_jwait_freeblks, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
+ &stat_jwait_inode, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
+ &stat_jwait_newblk, 0, "");
SYSCTL_DECL(_vfs_ffs);
+LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
+static u_long bmsafemap_hash; /* size of hash table - 1 */
+
static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
&compute_summary_at_mount, 0, "Recompute summary at mount");
@@ -770,16 +1082,22 @@ softdep_flush(void)
}
}
-static int
-softdep_speedup(void)
+static void
+worklist_speedup(void)
{
-
mtx_assert(&lk, MA_OWNED);
if (req_pending == 0) {
req_pending = 1;
wakeup(&req_pending);
}
+}
+static int
+softdep_speedup(void)
+{
+
+ worklist_speedup();
+ bd_speedup();
return speedup_syncer();
}
@@ -791,15 +1109,17 @@ softdep_speedup(void)
* and does so in order from first to last.
*/
static void
-add_to_worklist(wk)
+add_to_worklist(wk, nodelay)
struct worklist *wk;
+ int nodelay;
{
struct ufsmount *ump;
mtx_assert(&lk, MA_OWNED);
ump = VFSTOUFS(wk->wk_mp);
if (wk->wk_state & ONWORKLIST)
- panic("add_to_worklist: already on list");
+ panic("add_to_worklist: %s(0x%X) already on list",
+ TYPENAME(wk->wk_type), wk->wk_state);
wk->wk_state |= ONWORKLIST;
if (LIST_EMPTY(&ump->softdep_workitem_pending))
LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
@@ -807,6 +1127,30 @@ add_to_worklist(wk)
LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
ump->softdep_worklist_tail = wk;
ump->softdep_on_worklist += 1;
+ if (nodelay)
+ worklist_speedup();
+}
+
+/*
+ * Remove the item to be processed. If we are removing the last
+ * item on the list, we need to recalculate the tail pointer.
+ */
+static void
+remove_from_worklist(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+ struct worklist *wkend;
+
+ ump = VFSTOUFS(wk->wk_mp);
+ WORKLIST_REMOVE(wk);
+ if (wk == ump->softdep_worklist_tail) {
+ LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
+ if (LIST_NEXT(wkend, wk_list) == NULL)
+ break;
+ ump->softdep_worklist_tail = wkend;
+ }
+ ump->softdep_on_worklist -= 1;
}
/*
@@ -838,8 +1182,9 @@ softdep_process_worklist(mp, full)
ACQUIRE_LOCK(&lk);
loopcount = 1;
starttime = time_second;
+ softdep_process_journal(mp, full ? MNT_WAIT : 0);
while (ump->softdep_on_worklist > 0) {
- if ((cnt = process_worklist_item(mp, 0)) == -1)
+ if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
break;
else
matchcnt += cnt;
@@ -871,16 +1216,61 @@ softdep_process_worklist(mp, full)
* second. Otherwise the other mountpoints may get
* excessively backlogged.
*/
- if (!full && starttime != time_second) {
- matchcnt = -1;
+ if (!full && starttime != time_second)
break;
- }
}
FREE_LOCK(&lk);
return (matchcnt);
}
/*
+ * Process all removes associated with a vnode if we are running out of
+ * journal space. Any other process that attempts to flush these will
+ * be unable to do so because we hold the vnodes locked.
+ */
+static void
+process_removes(vp)
+ struct vnode *vp;
+{
+ struct inodedep *inodedep;
+ struct dirrem *dirrem;
+ struct mount *mp;
+ ino_t inum;
+
+ mtx_assert(&lk, MA_OWNED);
+
+ mp = vp->v_mount;
+ inum = VTOI(vp)->i_number;
+ for (;;) {
+ if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
+ return;
+ LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
+ if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
+ (COMPLETE | ONWORKLIST))
+ break;
+ if (dirrem == NULL)
+ return;
+ /*
+ * If another thread is trying to lock this vnode it will
+ * fail but we must wait for it to do so before we can
+ * proceed.
+ */
+ if (dirrem->dm_state & INPROGRESS) {
+ dirrem->dm_state |= IOWAITING;
+ msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
+ continue;
+ }
+ remove_from_worklist(&dirrem->dm_list);
+ FREE_LOCK(&lk);
+ if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
+ panic("process_removes: suspended filesystem");
+ handle_workitem_remove(dirrem, vp);
+ vn_finished_secondary_write(mp);
+ ACQUIRE_LOCK(&lk);
+ }
+}
+
+/*
* Process one item on the worklist.
*/
static int
@@ -888,7 +1278,7 @@ process_worklist_item(mp, flags)
struct mount *mp;
int flags;
{
- struct worklist *wk, *wkend;
+ struct worklist *wk, *wklast;
struct ufsmount *ump;
struct vnode *vp;
int matchcnt = 0;
@@ -908,11 +1298,14 @@ process_worklist_item(mp, flags)
* inodes, we have to skip over any dirrem requests whose
* vnodes are resident and locked.
*/
- ump = VFSTOUFS(mp);
vp = NULL;
+ ump = VFSTOUFS(mp);
LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
- if (wk->wk_state & INPROGRESS)
+ if (wk->wk_state & INPROGRESS) {
+ wklast = wk;
continue;
+ }
+ wklast = wk; /* Record the last valid wk pointer. */
if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
break;
wk->wk_state |= INPROGRESS;
@@ -921,6 +1314,10 @@ process_worklist_item(mp, flags)
ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
ACQUIRE_LOCK(&lk);
+ if (wk->wk_state & IOWAITING) {
+ wk->wk_state &= ~IOWAITING;
+ wakeup(wk);
+ }
wk->wk_state &= ~INPROGRESS;
ump->softdep_on_worklist_inprogress--;
if (vp != NULL)
@@ -928,21 +1325,7 @@ process_worklist_item(mp, flags)
}
if (wk == 0)
return (-1);
- /*
- * Remove the item to be processed. If we are removing the last
- * item on the list, we need to recalculate the tail pointer.
- * As this happens rarely and usually when the list is short,
- * we just run down the list to find it rather than tracking it
- * in the above loop.
- */
- WORKLIST_REMOVE(wk);
- if (wk == ump->softdep_worklist_tail) {
- LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
- if (LIST_NEXT(wkend, wk_list) == NULL)
- break;
- ump->softdep_worklist_tail = wkend;
- }
- ump->softdep_on_worklist -= 1;
+ remove_from_worklist(wk);
FREE_LOCK(&lk);
if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
panic("process_worklist_item: suspended filesystem");
@@ -952,6 +1335,8 @@ process_worklist_item(mp, flags)
case D_DIRREM:
/* removal of a directory entry */
handle_workitem_remove(WK_DIRREM(wk), vp);
+ if (vp)
+ vput(vp);
break;
case D_FREEBLKS:
@@ -969,6 +1354,11 @@ process_worklist_item(mp, flags)
handle_workitem_freefile(WK_FREEFILE(wk));
break;
+ case D_FREEWORK:
+ /* Final block in an indirect was freed. */
+ handle_workitem_indirblk(WK_FREEWORK(wk));
+ break;
+
default:
panic("%s_process_worklist: Unknown type %s",
"softdep", TYPENAME(wk->wk_type));
@@ -982,19 +1372,22 @@ process_worklist_item(mp, flags)
/*
* Move dependencies from one buffer to another.
*/
-void
+int
softdep_move_dependencies(oldbp, newbp)
struct buf *oldbp;
struct buf *newbp;
{
struct worklist *wk, *wktail;
+ int dirty;
- if (!LIST_EMPTY(&newbp->b_dep))
- panic("softdep_move_dependencies: need merge code");
- wktail = 0;
+ dirty = 0;
+ wktail = NULL;
ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
LIST_REMOVE(wk, wk_list);
+ if (wk->wk_type == D_BMSAFEMAP &&
+ bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
+ dirty = 1;
if (wktail == 0)
LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
else
@@ -1002,6 +1395,8 @@ softdep_move_dependencies(oldbp, newbp)
wktail = wk;
}
FREE_LOCK(&lk);
+
+ return (dirty);
}
/*
@@ -1198,23 +1593,22 @@ pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
* This routine must be called with splbio interrupts blocked.
*/
static int
-pagedep_lookup(ip, lbn, flags, pagedeppp)
- struct inode *ip;
+pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
+ struct mount *mp;
+ ino_t ino;
ufs_lbn_t lbn;
int flags;
struct pagedep **pagedeppp;
{
struct pagedep *pagedep;
struct pagedep_hashhead *pagedephd;
- struct mount *mp;
int ret;
int i;
mtx_assert(&lk, MA_OWNED);
- mp = ITOV(ip)->v_mount;
- pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
+ pagedephd = PAGEDEP_HASH(mp, ino, lbn);
- ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
+ ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
if (*pagedeppp || (flags & DEPALLOC) == 0)
return (ret);
FREE_LOCK(&lk);
@@ -1222,12 +1616,12 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
ACQUIRE_LOCK(&lk);
- ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
+ ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
if (*pagedeppp) {
WORKITEM_FREE(pagedep, D_PAGEDEP);
return (ret);
}
- pagedep->pd_ino = ip->i_number;
+ pagedep->pd_ino = ino;
pagedep->pd_lbn = lbn;
LIST_INIT(&pagedep->pd_dirremhd);
LIST_INIT(&pagedep->pd_pendinghd);
@@ -1314,10 +1708,14 @@ inodedep_lookup(mp, inum, flags, inodedeppp)
inodedep->id_savedino1 = NULL;
inodedep->id_savedsize = -1;
inodedep->id_savedextsize = -1;
- inodedep->id_buf = NULL;
+ inodedep->id_savednlink = -1;
+ inodedep->id_bmsafemap = NULL;
+ inodedep->id_mkdiradd = NULL;
+ LIST_INIT(&inodedep->id_dirremhd);
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
+ TAILQ_INIT(&inodedep->id_inoreflst);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
TAILQ_INIT(&inodedep->id_extupdt);
@@ -1336,17 +1734,29 @@ u_long newblk_hash; /* size of hash table - 1 */
(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static int
-newblk_find(newblkhd, fs, newblkno, newblkpp)
+newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
struct newblk_hashhead *newblkhd;
- struct fs *fs;
+ struct mount *mp;
ufs2_daddr_t newblkno;
+ int flags;
struct newblk **newblkpp;
{
struct newblk *newblk;
- LIST_FOREACH(newblk, newblkhd, nb_hash)
- if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
- break;
+ LIST_FOREACH(newblk, newblkhd, nb_hash) {
+ if (newblkno != newblk->nb_newblkno)
+ continue;
+ if (mp != newblk->nb_list.wk_mp)
+ continue;
+ /*
+ * If we're creating a new dependency, don't match those that
+ * have already been converted to allocdirects. This is for
+ * a frag extend.
+ */
+ if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
+ continue;
+ break;
+ }
if (newblk) {
*newblkpp = newblk;
return (1);
@@ -1361,8 +1771,8 @@ newblk_find(newblkhd, fs, newblkno, newblkpp)
* Found or allocated entry is returned in newblkpp.
*/
static int
-newblk_lookup(fs, newblkno, flags, newblkpp)
- struct fs *fs;
+newblk_lookup(mp, newblkno, flags, newblkpp)
+ struct mount *mp;
ufs2_daddr_t newblkno;
int flags;
struct newblk **newblkpp;
@@ -1370,21 +1780,25 @@ newblk_lookup(fs, newblkno, flags, newblkpp)
struct newblk *newblk;
struct newblk_hashhead *newblkhd;
- newblkhd = NEWBLK_HASH(fs, newblkno);
- if (newblk_find(newblkhd, fs, newblkno, newblkpp))
+ newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
+ if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
return (1);
if ((flags & DEPALLOC) == 0)
return (0);
FREE_LOCK(&lk);
- newblk = malloc(sizeof(struct newblk),
- M_NEWBLK, M_SOFTDEP_FLAGS);
+ newblk = malloc(sizeof(union allblk), M_NEWBLK,
+ M_SOFTDEP_FLAGS | M_ZERO);
+ workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
ACQUIRE_LOCK(&lk);
- if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
- free(newblk, M_NEWBLK);
+ if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
+ WORKITEM_FREE(newblk, D_NEWBLK);
return (1);
}
- newblk->nb_state = 0;
- newblk->nb_fs = fs;
+ newblk->nb_freefrag = NULL;
+ LIST_INIT(&newblk->nb_indirdeps);
+ LIST_INIT(&newblk->nb_newdirblk);
+ LIST_INIT(&newblk->nb_jwork);
+ newblk->nb_state = ATTACHED;
newblk->nb_newblkno = newblkno;
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
*newblkpp = newblk;
@@ -1401,10 +1815,10 @@ softdep_initialize()
LIST_INIT(&mkdirlisthd);
max_softdeps = desiredvnodes * 4;
- pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
- &pagedep_hash);
+ pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
- newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
+ newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash);
+ bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
/* initialise bioops hack */
bioops.io_start = softdep_disk_io_initiation;
@@ -1428,6 +1842,7 @@ softdep_uninitialize()
hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
+ hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
}
/*
@@ -1457,9 +1872,16 @@ softdep_mount(devvp, mp, fs, cred)
MNT_IUNLOCK(mp);
ump = VFSTOUFS(mp);
LIST_INIT(&ump->softdep_workitem_pending);
+ LIST_INIT(&ump->softdep_journal_pending);
+ TAILQ_INIT(&ump->softdep_unlinked);
ump->softdep_worklist_tail = NULL;
ump->softdep_on_worklist = 0;
ump->softdep_deps = 0;
+ if ((fs->fs_flags & FS_SUJ) &&
+ (error = journal_mount(mp, fs, cred)) != 0) {
+ printf("Failed to start journal: %d\n", error);
+ return (error);
+ }
/*
* When doing soft updates, the counters in the
* superblock may have gotten out of sync. Recomputation
@@ -1493,6 +1915,2019 @@ softdep_mount(devvp, mp, fs, cred)
return (0);
}
+void
+softdep_unmount(mp)
+ struct mount *mp;
+{
+
+ if (mp->mnt_kern_flag & MNTK_SUJ)
+ journal_unmount(mp);
+}
+
+struct jblocks {
+ struct jseglst jb_segs; /* TAILQ of current segments. */
+ struct jseg *jb_writeseg; /* Next write to complete. */
+ struct jextent *jb_extent; /* Extent array. */
+ uint64_t jb_nextseq; /* Next sequence number. */
+ uint64_t jb_oldestseq; /* Oldest active sequence number. */
+ int jb_avail; /* Available extents. */
+ int jb_used; /* Last used extent. */
+ int jb_head; /* Allocator head. */
+ int jb_off; /* Allocator extent offset. */
+ int jb_blocks; /* Total disk blocks covered. */
+ int jb_free; /* Total disk blocks free. */
+ int jb_min; /* Minimum free space. */
+ int jb_low; /* Low on space. */
+ int jb_age; /* Insertion time of oldest rec. */
+ int jb_suspended; /* Did journal suspend writes? */
+};
+
+struct jextent {
+ ufs2_daddr_t je_daddr; /* Disk block address. */
+ int je_blocks; /* Disk block count. */
+};
+
+static struct jblocks *
+jblocks_create(void)
+{
+ struct jblocks *jblocks;
+
+ jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&jblocks->jb_segs);
+ jblocks->jb_avail = 10;
+ jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
+ M_JBLOCKS, M_WAITOK | M_ZERO);
+
+ return (jblocks);
+}
+
+static ufs2_daddr_t
+jblocks_alloc(jblocks, bytes, actual)
+ struct jblocks *jblocks;
+ int bytes;
+ int *actual;
+{
+ ufs2_daddr_t daddr;
+ struct jextent *jext;
+ int freecnt;
+ int blocks;
+
+ blocks = bytes / DEV_BSIZE;
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks - jblocks->jb_off;
+ if (freecnt == 0) {
+ jblocks->jb_off = 0;
+ if (++jblocks->jb_head > jblocks->jb_used)
+ jblocks->jb_head = 0;
+ jext = &jblocks->jb_extent[jblocks->jb_head];
+ freecnt = jext->je_blocks;
+ }
+ if (freecnt > blocks)
+ freecnt = blocks;
+ *actual = freecnt * DEV_BSIZE;
+ daddr = jext->je_daddr + jblocks->jb_off;
+ jblocks->jb_off += freecnt;
+ jblocks->jb_free -= freecnt;
+
+ return (daddr);
+}
+
+static void
+jblocks_free(jblocks, mp, bytes)
+ struct jblocks *jblocks;
+ struct mount *mp;
+ int bytes;
+{
+
+ jblocks->jb_free += bytes / DEV_BSIZE;
+ if (jblocks->jb_suspended)
+ worklist_speedup();
+ wakeup(jblocks);
+}
+
+static void
+jblocks_destroy(jblocks)
+ struct jblocks *jblocks;
+{
+
+ if (jblocks->jb_extent)
+ free(jblocks->jb_extent, M_JBLOCKS);
+ free(jblocks, M_JBLOCKS);
+}
+
+static void
+jblocks_add(jblocks, daddr, blocks)
+ struct jblocks *jblocks;
+ ufs2_daddr_t daddr;
+ int blocks;
+{
+ struct jextent *jext;
+
+ jblocks->jb_blocks += blocks;
+ jblocks->jb_free += blocks;
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ /* Adding the first block. */
+ if (jext->je_daddr == 0) {
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+ return;
+ }
+ /* Extending the last extent. */
+ if (jext->je_daddr + jext->je_blocks == daddr) {
+ jext->je_blocks += blocks;
+ return;
+ }
+ /* Adding a new extent. */
+ if (++jblocks->jb_used == jblocks->jb_avail) {
+ jblocks->jb_avail *= 2;
+ jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
+ M_JBLOCKS, M_WAITOK | M_ZERO);
+ memcpy(jext, jblocks->jb_extent,
+ sizeof(struct jextent) * jblocks->jb_used);
+ free(jblocks->jb_extent, M_JBLOCKS);
+ jblocks->jb_extent = jext;
+ }
+ jext = &jblocks->jb_extent[jblocks->jb_used];
+ jext->je_daddr = daddr;
+ jext->je_blocks = blocks;
+ return;
+}
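
A standalone sketch (not part of the patch) of the three cases jblocks_add() handles: the first extent, a contiguous extension of the last extent, and a new discontiguous extent. The types and names are simplified stand-ins, and the array-doubling growth path is omitted:

#include <stdio.h>

struct ext {			/* stand-in for struct jextent */
	long daddr;		/* disk block address */
	int blocks;		/* disk block count */
};

static int
ext_add(struct ext *tbl, int used, long daddr, int blocks)
{
	struct ext *jext = &tbl[used];

	if (jext->daddr == 0) {		/* adding the first block */
		jext->daddr = daddr;
		jext->blocks = blocks;
		return (used);
	}
	if (jext->daddr + jext->blocks == daddr) {	/* extend last extent */
		jext->blocks += blocks;
		return (used);
	}
	jext = &tbl[++used];		/* start a new extent */
	jext->daddr = daddr;
	jext->blocks = blocks;
	return (used);
}

int
main(void)
{
	struct ext tbl[4] = { { 0, 0 } };
	int used = 0;

	used = ext_add(tbl, used, 100, 8);	/* first extent */
	used = ext_add(tbl, used, 108, 8);	/* contiguous: coalesces */
	used = ext_add(tbl, used, 200, 8);	/* discontiguous: new extent */
	printf("%d extents; first covers %d blocks\n", used + 1,
	    tbl[0].blocks);		/* 2 extents; first covers 16 blocks */
	return (0);
}
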
+
+int
+softdep_journal_lookup(mp, vpp)
+ struct mount *mp;
+ struct vnode **vpp;
+{
+ struct componentname cnp;
+ struct vnode *dvp;
+ ino_t sujournal;
+ int error;
+
+ error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
+ if (error)
+ return (error);
+ bzero(&cnp, sizeof(cnp));
+ cnp.cn_nameiop = LOOKUP;
+ cnp.cn_flags = ISLASTCN;
+ cnp.cn_thread = curthread;
+ cnp.cn_cred = curthread->td_ucred;
+ cnp.cn_pnbuf = SUJ_FILE;
+ cnp.cn_nameptr = SUJ_FILE;
+ cnp.cn_namelen = strlen(SUJ_FILE);
+ error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
+ vput(dvp);
+ if (error != 0)
+ return (error);
+ error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
+ return (error);
+}
+
+/*
+ * Open and verify the journal file.
+ */
+static int
+journal_mount(mp, fs, cred)
+ struct mount *mp;
+ struct fs *fs;
+ struct ucred *cred;
+{
+ struct jblocks *jblocks;
+ struct vnode *vp;
+ struct inode *ip;
+ ufs2_daddr_t blkno;
+ int bcount;
+ int error;
+ int i;
+
+ mp->mnt_kern_flag |= MNTK_SUJ;
+ error = softdep_journal_lookup(mp, &vp);
+ if (error != 0) {
+ printf("Failed to find journal. Use tunefs to create one\n");
+ return (error);
+ }
+ ip = VTOI(vp);
+ if (ip->i_size < SUJ_MIN) {
+ error = ENOSPC;
+ goto out;
+ }
+ bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */
+ jblocks = jblocks_create();
+ for (i = 0; i < bcount; i++) {
+ error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
+ if (error)
+ break;
+ jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
+ }
+ if (error) {
+ jblocks_destroy(jblocks);
+ goto out;
+ }
+ jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
+ jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
+ /*
+ * Only validate the journal contents if the filesystem is clean,
+ * otherwise we write the logs but they'll never be used. If the
+ * filesystem was still dirty when we mounted it the journal is
+ * invalid and a new journal can only be valid if it starts from a
+ * clean mount.
+ */
+ if (fs->fs_clean) {
+ DIP_SET(ip, i_modrev, fs->fs_mtime);
+ ip->i_flags |= IN_MODIFIED;
+ ffs_update(vp, 1);
+ }
+ VFSTOUFS(mp)->softdep_jblocks = jblocks;
+out:
+ vput(vp);
+ return (error);
+}
+
+static void
+journal_unmount(mp)
+ struct mount *mp;
+{
+ struct ufsmount *ump;
+
+ ump = VFSTOUFS(mp);
+ if (ump->softdep_jblocks)
+ jblocks_destroy(ump->softdep_jblocks);
+ ump->softdep_jblocks = NULL;
+}
+
+/*
+ * Called when a journal record is ready to be written. Space is allocated
+ * and the journal entry is created when the journal is flushed to stable
+ * store.
+ */
+static void
+add_to_journal(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+
+ mtx_assert(&lk, MA_OWNED);
+ ump = VFSTOUFS(wk->wk_mp);
+ if (wk->wk_state & ONWORKLIST)
+ panic("add_to_journal: %s(0x%X) already on list",
+ TYPENAME(wk->wk_type), wk->wk_state);
+ wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
+ if (LIST_EMPTY(&ump->softdep_journal_pending)) {
+ ump->softdep_jblocks->jb_age = ticks;
+ LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
+ } else
+ LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
+ ump->softdep_journal_tail = wk;
+ ump->softdep_on_journal += 1;
+}
+
+/*
+ * Remove an arbitrary item from the journal worklist, maintaining the
+ * tail pointer. This happens when a new operation obviates the need to
+ * journal an old operation.
+ */
+static void
+remove_from_journal(wk)
+ struct worklist *wk;
+{
+ struct ufsmount *ump;
+
+ mtx_assert(&lk, MA_OWNED);
+ ump = VFSTOUFS(wk->wk_mp);
+#ifdef DEBUG /* XXX Expensive, temporary. */
+ {
+ struct worklist *wkn;
+
+ LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
+ if (wkn == wk)
+ break;
+ if (wkn == NULL)
+ panic("remove_from_journal: %p is not in journal", wk);
+ }
+#endif
+ /*
+ * We emulate a TAILQ to save space in most structures which do not
+ * require TAILQ semantics. Here we must update the tail position
+ * when removing the entry the tail points to.
+ */
+ if (ump->softdep_journal_tail == wk)
+ ump->softdep_journal_tail =
+ (struct worklist *)wk->wk_list.le_prev;
+
+ WORKLIST_REMOVE(wk);
+ ump->softdep_on_journal -= 1;
+}
+
+/*
+ * Check for journal space as well as dependency limits so the prelink
+ * code can throttle both journaled and non-journaled filesystems.
+ * Threshold is 0 for low and 1 for min.
+ */
+static int
+journal_space(ump, thresh)
+ struct ufsmount *ump;
+ int thresh;
+{
+ struct jblocks *jblocks;
+ int avail;
+
+ /*
+ * We use a tighter restriction here to prevent request_cleanup()
+ * in other threads from running into locks we currently hold.
+ */
+ if (num_inodedep > (max_softdeps / 10) * 9)
+ return (0);
+
+ jblocks = ump->softdep_jblocks;
+ if (jblocks == NULL)
+ return (1);
+ if (thresh)
+ thresh = jblocks->jb_min;
+ else
+ thresh = jblocks->jb_low;
+ avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
+ avail = jblocks->jb_free - avail;
+
+ return (avail > thresh);
+}
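
A worked example (not part of the patch) of the space check performed by journal_space() above; the 32-byte record size is the one implied by the 6553-record arithmetic quoted later in this file, and the sample numbers are illustrative:

#include <stdio.h>

#define DEV_BSIZE	512
#define JREC_SIZE	32	/* implied by the 6553-record figure below */

int
main(void)
{
	int jb_free = 2048;		/* free journal disk blocks (1MB) */
	int on_journal = 1000;		/* records queued but not yet written */
	int thresh = jb_free / 3;	/* jb_low: reserve 33% */
	int avail;

	/* Queued records will consume space when flushed; charge for them. */
	avail = jb_free - (on_journal * JREC_SIZE) / DEV_BSIZE;
	printf("avail %d, thresh %d: %s\n", avail, thresh,
	    avail > thresh ? "ok" : "throttle");	/* avail 1986: ok */
	return (0);
}
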
+
+static void
+journal_suspend(ump)
+ struct ufsmount *ump;
+{
+ struct jblocks *jblocks;
+ struct mount *mp;
+
+ mp = UFSTOVFS(ump);
+ jblocks = ump->softdep_jblocks;
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
+ stat_journal_min++;
+ mp->mnt_kern_flag |= MNTK_SUSPEND;
+ mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
+ }
+ jblocks->jb_suspended = 1;
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Called before any allocation function to be certain that there is
+ * sufficient space in the journal prior to creating any new records.
+ * Since in the case of block allocation we may have multiple locked
+ * buffers at the time of the actual allocation, we cannot block
+ * when the journal records are created. Doing so would create a deadlock
+ * if any of these buffers needed to be flushed to reclaim space. Instead
+ * we require a sufficiently large amount of available space such that
+ * each thread in the system could have passed this allocation check and
+ * still have sufficient free space. With 20% of a minimum journal size
+ * of 1MB we have 6553 records available.
+ */
+int
+softdep_prealloc(vp, waitok)
+ struct vnode *vp;
+ int waitok;
+{
+ struct ufsmount *ump;
+
+ if (DOINGSUJ(vp) == 0)
+ return (0);
+ ump = VFSTOUFS(vp->v_mount);
+ ACQUIRE_LOCK(&lk);
+ if (journal_space(ump, 0)) {
+ FREE_LOCK(&lk);
+ return (0);
+ }
+ stat_journal_low++;
+ FREE_LOCK(&lk);
+ if (waitok == MNT_NOWAIT)
+ return (ENOSPC);
+ /*
+ * Attempt to sync this vnode once to flush any journal
+ * work attached to it.
+ */
+ ffs_syncvnode(vp, waitok);
+ ACQUIRE_LOCK(&lk);
+ process_removes(vp);
+ if (journal_space(ump, 0) == 0) {
+ softdep_speedup();
+ if (journal_space(ump, 1) == 0)
+ journal_suspend(ump);
+ }
+ FREE_LOCK(&lk);
+
+ return (0);
+}
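
A standalone check (not part of the patch) of the 6553-record figure quoted in the comment above, using the 1MB minimum journal size the comment names:

#include <stdio.h>

int
main(void)
{
	long journal = 1024 * 1024;	/* minimum journal size (1MB) */
	long reserve = journal / 5;	/* 20% kept available */

	printf("%ld records\n", reserve / 32);	/* 32-byte records: 6553 */
	return (0);
}
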
+
+/*
+ * Before adjusting a link count on a vnode verify that we have sufficient
+ * journal space. If not, process operations that depend on the currently
+ * locked pair of vnodes to try to flush space, since the syncer, buf daemon,
+ * and softdep flush threads cannot acquire these locks to reclaim space.
+ */
+static void
+softdep_prelink(dvp, vp)
+ struct vnode *dvp;
+ struct vnode *vp;
+{
+ struct ufsmount *ump;
+
+ ump = VFSTOUFS(dvp->v_mount);
+ mtx_assert(&lk, MA_OWNED);
+ if (journal_space(ump, 0))
+ return;
+ stat_journal_low++;
+ FREE_LOCK(&lk);
+ if (vp)
+ ffs_syncvnode(vp, MNT_NOWAIT);
+ ffs_syncvnode(dvp, MNT_WAIT);
+ ACQUIRE_LOCK(&lk);
+ /* Process vp before dvp as it may create .. removes. */
+ if (vp)
+ process_removes(vp);
+ process_removes(dvp);
+ softdep_speedup();
+ process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
+ process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
+ if (journal_space(ump, 0) == 0) {
+ softdep_speedup();
+ if (journal_space(ump, 1) == 0)
+ journal_suspend(ump);
+ }
+}
+
+static void
+jseg_write(fs, jblocks, jseg, data)
+ struct fs *fs;
+ struct jblocks *jblocks;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jsegrec *rec;
+
+ rec = (struct jsegrec *)data;
+ rec->jsr_seq = jseg->js_seq;
+ rec->jsr_oldest = jblocks->jb_oldestseq;
+ rec->jsr_cnt = jseg->js_cnt;
+ rec->jsr_blocks = jseg->js_size / DEV_BSIZE;
+ rec->jsr_crc = 0;
+ rec->jsr_time = fs->fs_mtime;
+}
+
+static inline void
+inoref_write(inoref, jseg, rec)
+ struct inoref *inoref;
+ struct jseg *jseg;
+ struct jrefrec *rec;
+{
+
+ inoref->if_jsegdep->jd_seg = jseg;
+ rec->jr_ino = inoref->if_ino;
+ rec->jr_parent = inoref->if_parent;
+ rec->jr_nlink = inoref->if_nlink;
+ rec->jr_mode = inoref->if_mode;
+ rec->jr_diroff = inoref->if_diroff;
+}
+
+static void
+jaddref_write(jaddref, jseg, data)
+ struct jaddref *jaddref;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jrefrec *rec;
+
+ rec = (struct jrefrec *)data;
+ rec->jr_op = JOP_ADDREF;
+ inoref_write(&jaddref->ja_ref, jseg, rec);
+}
+
+static void
+jremref_write(jremref, jseg, data)
+ struct jremref *jremref;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jrefrec *rec;
+
+ rec = (struct jrefrec *)data;
+ rec->jr_op = JOP_REMREF;
+ inoref_write(&jremref->jr_ref, jseg, rec);
+}
+
+static void
+jmvref_write(jmvref, jseg, data)
+ struct jmvref *jmvref;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jmvrec *rec;
+
+ rec = (struct jmvrec *)data;
+ rec->jm_op = JOP_MVREF;
+ rec->jm_ino = jmvref->jm_ino;
+ rec->jm_parent = jmvref->jm_parent;
+ rec->jm_oldoff = jmvref->jm_oldoff;
+ rec->jm_newoff = jmvref->jm_newoff;
+}
+
+static void
+jnewblk_write(jnewblk, jseg, data)
+ struct jnewblk *jnewblk;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jblkrec *rec;
+
+ jnewblk->jn_jsegdep->jd_seg = jseg;
+ rec = (struct jblkrec *)data;
+ rec->jb_op = JOP_NEWBLK;
+ rec->jb_ino = jnewblk->jn_ino;
+ rec->jb_blkno = jnewblk->jn_blkno;
+ rec->jb_lbn = jnewblk->jn_lbn;
+ rec->jb_frags = jnewblk->jn_frags;
+ rec->jb_oldfrags = jnewblk->jn_oldfrags;
+}
+
+static void
+jfreeblk_write(jfreeblk, jseg, data)
+ struct jfreeblk *jfreeblk;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jblkrec *rec;
+
+ jfreeblk->jf_jsegdep->jd_seg = jseg;
+ rec = (struct jblkrec *)data;
+ rec->jb_op = JOP_FREEBLK;
+ rec->jb_ino = jfreeblk->jf_ino;
+ rec->jb_blkno = jfreeblk->jf_blkno;
+ rec->jb_lbn = jfreeblk->jf_lbn;
+ rec->jb_frags = jfreeblk->jf_frags;
+ rec->jb_oldfrags = 0;
+}
+
+static void
+jfreefrag_write(jfreefrag, jseg, data)
+ struct jfreefrag *jfreefrag;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jblkrec *rec;
+
+ jfreefrag->fr_jsegdep->jd_seg = jseg;
+ rec = (struct jblkrec *)data;
+ rec->jb_op = JOP_FREEBLK;
+ rec->jb_ino = jfreefrag->fr_ino;
+ rec->jb_blkno = jfreefrag->fr_blkno;
+ rec->jb_lbn = jfreefrag->fr_lbn;
+ rec->jb_frags = jfreefrag->fr_frags;
+ rec->jb_oldfrags = 0;
+}
+
+static void
+jtrunc_write(jtrunc, jseg, data)
+ struct jtrunc *jtrunc;
+ struct jseg *jseg;
+ uint8_t *data;
+{
+ struct jtrncrec *rec;
+
+ rec = (struct jtrncrec *)data;
+ rec->jt_op = JOP_TRUNC;
+ rec->jt_ino = jtrunc->jt_ino;
+ rec->jt_size = jtrunc->jt_size;
+ rec->jt_extsize = jtrunc->jt_extsize;
+}
+
+/*
+ * Flush some journal records to disk.
+ */
+static void
+softdep_process_journal(mp, flags)
+ struct mount *mp;
+ int flags;
+{
+ struct jblocks *jblocks;
+ struct ufsmount *ump;
+ struct worklist *wk;
+ struct jseg *jseg;
+ struct buf *bp;
+ uint8_t *data;
+ struct fs *fs;
+ int segwritten;
+ int jrecmin; /* Minimum records per block. */
+ int jrecmax; /* Maximum records per block. */
+ int size;
+ int cnt;
+ int off;
+
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
+ return;
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ jblocks = ump->softdep_jblocks;
+ /*
+ * We write anywhere between a disk block and an fs block. The upper
+ * bound is picked to prevent buffer cache fragmentation and limit
+ * processing time per I/O.
+ */
+ jrecmin = (DEV_BSIZE / JREC_SIZE) - 1; /* -1 for seg header */
+ jrecmax = (fs->fs_bsize / DEV_BSIZE) * jrecmin;
+ segwritten = 0;
+ while ((cnt = ump->softdep_on_journal) != 0) {
+ /*
+ * Create a new segment to hold as many as 'cnt' journal
+ * entries and add them to the segment. Notice cnt is
+ * off by one to account for the space required by the
+ * jsegrec. If we don't have a full block to log, skip it
+ * unless we haven't written anything.
+ */
+ cnt++;
+ if (cnt < jrecmax && segwritten)
+ break;
+ /*
+ * Verify some free journal space. softdep_prealloc() should
+ * guarantee that we don't run out, so this is indicative of
+ * a problem with the flow control. Try to recover
+ * gracefully in any event.
+ */
+ while (jblocks->jb_free == 0) {
+ if (flags != MNT_WAIT)
+ break;
+ printf("softdep: Out of journal space!\n");
+ softdep_speedup();
+ msleep(jblocks, &lk, PRIBIO, "jblocks", 1);
+ }
+ FREE_LOCK(&lk);
+ jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jseg->js_list, D_JSEG, mp);
+ LIST_INIT(&jseg->js_entries);
+ jseg->js_state = ATTACHED;
+ jseg->js_jblocks = jblocks;
+ bp = geteblk(fs->fs_bsize, 0);
+ ACQUIRE_LOCK(&lk);
+ /*
+ * If there was a race while we were allocating the block
+ * and jseg, the entry we care about was likely written.
+ * We bail out in both the WAIT and NOWAIT case and assume
+ * the caller will loop if the entry it cares about is
+ * not written.
+ */
+ if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
+ bp->b_flags |= B_INVAL | B_NOCACHE;
+ WORKITEM_FREE(jseg, D_JSEG);
+ FREE_LOCK(&lk);
+ brelse(bp);
+ ACQUIRE_LOCK(&lk);
+ break;
+ }
+ /*
+ * Calculate the disk block size required for the available
+ * records rounded to the min size.
+ */
+ cnt = ump->softdep_on_journal;
+ if (cnt < jrecmax)
+ size = howmany(cnt, jrecmin) * DEV_BSIZE;
+ else
+ size = fs->fs_bsize;
+ /*
+ * Allocate a disk block for this journal data and account
+ * for truncation of the requested size if enough contiguous
+ * space was not available.
+ */
+ bp->b_blkno = jblocks_alloc(jblocks, size, &size);
+ bp->b_lblkno = bp->b_blkno;
+ bp->b_offset = bp->b_blkno * DEV_BSIZE;
+ bp->b_bcount = size;
+ bp->b_bufobj = &ump->um_devvp->v_bufobj;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
+ /*
+ * Initialize our jseg with cnt records. Assign the next
+ * sequence number to it and link it in-order.
+ */
+ cnt = MIN(ump->softdep_on_journal,
+ (size / DEV_BSIZE) * jrecmin);
+ jseg->js_buf = bp;
+ jseg->js_cnt = cnt;
+ jseg->js_refs = cnt + 1; /* Self ref. */
+ jseg->js_size = size;
+ jseg->js_seq = jblocks->jb_nextseq++;
+ if (TAILQ_EMPTY(&jblocks->jb_segs))
+ jblocks->jb_oldestseq = jseg->js_seq;
+ TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
+ if (jblocks->jb_writeseg == NULL)
+ jblocks->jb_writeseg = jseg;
+ /*
+ * Start filling in records from the pending list.
+ */
+ data = bp->b_data;
+ off = 0;
+ while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
+ != NULL) {
+ /* Place a segment header on every device block. */
+ if ((off % DEV_BSIZE) == 0) {
+ jseg_write(fs, jblocks, jseg, data);
+ off += JREC_SIZE;
+ data = bp->b_data + off;
+ }
+ remove_from_journal(wk);
+ wk->wk_state |= IOSTARTED;
+ WORKLIST_INSERT(&jseg->js_entries, wk);
+ switch (wk->wk_type) {
+ case D_JADDREF:
+ jaddref_write(WK_JADDREF(wk), jseg, data);
+ break;
+ case D_JREMREF:
+ jremref_write(WK_JREMREF(wk), jseg, data);
+ break;
+ case D_JMVREF:
+ jmvref_write(WK_JMVREF(wk), jseg, data);
+ break;
+ case D_JNEWBLK:
+ jnewblk_write(WK_JNEWBLK(wk), jseg, data);
+ break;
+ case D_JFREEBLK:
+ jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
+ break;
+ case D_JFREEFRAG:
+ jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
+ break;
+ case D_JTRUNC:
+ jtrunc_write(WK_JTRUNC(wk), jseg, data);
+ break;
+ default:
+ panic("process_journal: Unknown type %s",
+ TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ if (--cnt == 0)
+ break;
+ off += JREC_SIZE;
+ data = bp->b_data + off;
+ }
+ /*
+ * Write this one buffer and continue.
+ */
+ WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
+ FREE_LOCK(&lk);
+ BO_LOCK(bp->b_bufobj);
+ bgetvp(ump->um_devvp, bp);
+ BO_UNLOCK(bp->b_bufobj);
+ if (flags == MNT_NOWAIT)
+ bawrite(bp);
+ else
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ }
+ /*
+ * If we've suspended the filesystem because we ran out of journal
+ * space, either try to sync it here to make some progress or
+ * unsuspend it if enough space has become available.
+ */
+ if (flags == 0 && jblocks && jblocks->jb_suspended) {
+ if (journal_space(ump, jblocks->jb_min)) {
+ FREE_LOCK(&lk);
+ jblocks->jb_suspended = 0;
+ mp->mnt_susp_owner = curthread;
+ vfs_write_resume(mp);
+ ACQUIRE_LOCK(&lk);
+ return;
+ }
+ FREE_LOCK(&lk);
+ VFS_SYNC(mp, MNT_NOWAIT);
+ ffs_sbupdate(ump, MNT_WAIT, 0);
+ ACQUIRE_LOCK(&lk);
+ }
+}
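
A standalone sketch (not part of the patch) of the per-write record bounds computed at the top of softdep_process_journal() above; the 16K fs block size is an assumed example:

#include <stdio.h>

#define DEV_BSIZE	512
#define JREC_SIZE	32

int
main(void)
{
	int fs_bsize = 16384;	/* assumed example fs block size */
	int jrecmin, jrecmax;

	jrecmin = (DEV_BSIZE / JREC_SIZE) - 1;		/* 15; -1 for header */
	jrecmax = (fs_bsize / DEV_BSIZE) * jrecmin;	/* 32 * 15 = 480 */
	printf("between %d and %d records per write\n", jrecmin, jrecmax);
	return (0);
}
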
+
+/*
+ * Complete a jseg, allowing all dependencies awaiting journal writes
+ * to proceed. Each journal dependency also attaches a jsegdep to dependent
+ * structures so that the journal segment can be freed to reclaim space.
+ */
+static void
+complete_jseg(jseg)
+ struct jseg *jseg;
+{
+ struct worklist *wk;
+ struct jmvref *jmvref;
+ int waiting;
+ int i;
+
+ i = 0;
+ while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ waiting = wk->wk_state & IOWAITING;
+ wk->wk_state &= ~(IOSTARTED | IOWAITING);
+ wk->wk_state |= COMPLETE;
+ KASSERT(i < jseg->js_cnt,
+ ("handle_written_jseg: overflow %d >= %d",
+ i, jseg->js_cnt));
+ switch (wk->wk_type) {
+ case D_JADDREF:
+ handle_written_jaddref(WK_JADDREF(wk));
+ break;
+ case D_JREMREF:
+ handle_written_jremref(WK_JREMREF(wk));
+ break;
+ case D_JMVREF:
+ /* No jsegdep here. */
+ free_jseg(jseg);
+ jmvref = WK_JMVREF(wk);
+ LIST_REMOVE(jmvref, jm_deps);
+ free_pagedep(jmvref->jm_pagedep);
+ WORKITEM_FREE(jmvref, D_JMVREF);
+ break;
+ case D_JNEWBLK:
+ handle_written_jnewblk(WK_JNEWBLK(wk));
+ break;
+ case D_JFREEBLK:
+ handle_written_jfreeblk(WK_JFREEBLK(wk));
+ break;
+ case D_JFREEFRAG:
+ handle_written_jfreefrag(WK_JFREEFRAG(wk));
+ break;
+ case D_JTRUNC:
+ WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
+ WORKITEM_FREE(wk, D_JTRUNC);
+ break;
+ default:
+ panic("handle_written_jseg: Unknown type %s",
+ TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ if (waiting)
+ wakeup(wk);
+ }
+ /* Release the self reference so the structure may be freed. */
+ free_jseg(jseg);
+}
+
+/*
+ * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg
+ * completions in order only.
+ */
+static void
+handle_written_jseg(jseg, bp)
+ struct jseg *jseg;
+ struct buf *bp;
+{
+ struct jblocks *jblocks;
+ struct jseg *jsegn;
+
+ if (jseg->js_refs == 0)
+ panic("handle_written_jseg: No self-reference on %p", jseg);
+ jseg->js_state |= DEPCOMPLETE;
+ /*
+ * We'll never need this buffer again, set flags so it will be
+ * discarded.
+ */
+ bp->b_flags |= B_INVAL | B_NOCACHE;
+ jblocks = jseg->js_jblocks;
+ /*
+ * Don't allow out of order completions. If this isn't the first
+ * block, wait for it to write before we're done.
+ */
+ if (jseg != jblocks->jb_writeseg)
+ return;
+ /* Iterate through available jsegs processing their entries. */
+ do {
+ jsegn = TAILQ_NEXT(jseg, js_next);
+ complete_jseg(jseg);
+ jseg = jsegn;
+ } while (jseg && jseg->js_state & DEPCOMPLETE);
+ jblocks->jb_writeseg = jseg;
+}
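
A standalone sketch (not part of the patch) of the in-order drain above: a segment that completes out of order is held until every earlier segment has completed, and then the loop drains all consecutively complete segments. The array and index are simplified stand-ins for the jseg list and jb_writeseg:

#include <stdio.h>

int
main(void)
{
	int complete[5] = { 1, 1, 0, 1, 0 };	/* per-segment DEPCOMPLETE */
	int writeseg = 0;			/* analogue of jb_writeseg */

	while (writeseg < 5 && complete[writeseg]) {
		printf("processing seg %d\n", writeseg);	/* 0, then 1 */
		writeseg++;
	}
	/* Seg 3 is written but must wait until seg 2 completes. */
	return (0);
}
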
+
+static inline struct jsegdep *
+inoref_jseg(inoref)
+ struct inoref *inoref;
+{
+ struct jsegdep *jsegdep;
+
+ jsegdep = inoref->if_jsegdep;
+ inoref->if_jsegdep = NULL;
+
+ return (jsegdep);
+}
+
+/*
+ * Called once a jremref has made it to stable store. The jremref is marked
+ * complete and we attempt to free it. Any pagedep writes sleeping while
+ * waiting for the jremref to complete will be awoken by free_jremref.
+ */
+static void
+handle_written_jremref(jremref)
+ struct jremref *jremref;
+{
+ struct inodedep *inodedep;
+ struct jsegdep *jsegdep;
+ struct dirrem *dirrem;
+
+ /* Grab the jsegdep. */
+ jsegdep = inoref_jseg(&jremref->jr_ref);
+ /*
+ * Remove us from the inoref list.
+ */
+ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
+ 0, &inodedep) == 0)
+ panic("handle_written_jremref: Lost inodedep");
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+ /*
+ * Complete the dirrem.
+ */
+ dirrem = jremref->jr_dirrem;
+ jremref->jr_dirrem = NULL;
+ LIST_REMOVE(jremref, jr_deps);
+ jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
+ WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
+ (dirrem->dm_state & COMPLETE) != 0)
+ add_to_worklist(&dirrem->dm_list, 0);
+ free_jremref(jremref);
+}
+
+/*
+ * Called once a jaddref has made it to stable store. The dependency is
+ * marked complete and any dependent structures are added to the inode
+ * bufwait list to be completed as soon as it is written. If a bitmap write
+ * depends on this entry we move the inode into the inodedephd of the
+ * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
+ */
+static void
+handle_written_jaddref(jaddref)
+ struct jaddref *jaddref;
+{
+ struct jsegdep *jsegdep;
+ struct inodedep *inodedep;
+ struct diradd *diradd;
+ struct mkdir *mkdir;
+
+ /* Grab the jsegdep. */
+ jsegdep = inoref_jseg(&jaddref->ja_ref);
+ mkdir = NULL;
+ diradd = NULL;
+ if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
+ 0, &inodedep) == 0)
+ panic("handle_written_jaddref: Lost inodedep.");
+ if (jaddref->ja_diradd == NULL)
+ panic("handle_written_jaddref: No dependency");
+ if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
+ diradd = jaddref->ja_diradd;
+ WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
+ } else if (jaddref->ja_state & MKDIR_PARENT) {
+ mkdir = jaddref->ja_mkdir;
+ WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
+ } else if (jaddref->ja_state & MKDIR_BODY)
+ mkdir = jaddref->ja_mkdir;
+ else
+ panic("handle_written_jaddref: Unknown dependency %p",
+ jaddref->ja_diradd);
+ jaddref->ja_diradd = NULL; /* also clears ja_mkdir */
+ /*
+ * Remove us from the inode list.
+ */
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
+ /*
+ * The mkdir may be waiting on the jaddref to clear before freeing.
+ */
+ if (mkdir) {
+ KASSERT(mkdir->md_list.wk_type == D_MKDIR,
+ ("handle_written_jaddref: Incorrect type for mkdir %s",
+ TYPENAME(mkdir->md_list.wk_type)));
+ mkdir->md_jaddref = NULL;
+ diradd = mkdir->md_diradd;
+ mkdir->md_state |= DEPCOMPLETE;
+ complete_mkdir(mkdir);
+ }
+ WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
+ if (jaddref->ja_state & NEWBLOCK) {
+ inodedep->id_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
+ inodedep, id_deps);
+ }
+ free_jaddref(jaddref);
+}
+
+/*
+ * Called once a jnewblk journal is written. The allocdirect or allocindir
+ * is placed in the bmsafemap to await notification of a written bitmap.
+ */
+static void
+handle_written_jnewblk(jnewblk)
+ struct jnewblk *jnewblk;
+{
+ struct bmsafemap *bmsafemap;
+ struct jsegdep *jsegdep;
+ struct newblk *newblk;
+
+ /* Grab the jsegdep. */
+ jsegdep = jnewblk->jn_jsegdep;
+ jnewblk->jn_jsegdep = NULL;
+ /*
+ * Add the written block to the bmsafemap so it can be notified when
+ * the bitmap is on disk.
+ */
+ newblk = jnewblk->jn_newblk;
+ jnewblk->jn_newblk = NULL;
+ if (newblk == NULL)
+ panic("handle_written_jnewblk: No dependency for the segdep.");
+
+ newblk->nb_jnewblk = NULL;
+ bmsafemap = newblk->nb_bmsafemap;
+ WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
+ newblk->nb_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ free_jnewblk(jnewblk);
+}
+
+/*
+ * Cancel a jfreefrag that won't be needed, probably due to colliding with
+ * an in-flight allocation that has not yet been committed. Divorce us
+ * from the freefrag and mark it DEPCOMPLETE so that it may be added
+ * to the worklist.
+ */
+static void
+cancel_jfreefrag(jfreefrag)
+ struct jfreefrag *jfreefrag;
+{
+ struct freefrag *freefrag;
+
+ if (jfreefrag->fr_jsegdep) {
+ free_jsegdep(jfreefrag->fr_jsegdep);
+ jfreefrag->fr_jsegdep = NULL;
+ }
+ freefrag = jfreefrag->fr_freefrag;
+ jfreefrag->fr_freefrag = NULL;
+ freefrag->ff_jfreefrag = NULL;
+ free_jfreefrag(jfreefrag);
+ freefrag->ff_state |= DEPCOMPLETE;
+}
+
+/*
+ * Free a jfreefrag when the parent freefrag is rendered obsolete.
+ */
+static void
+free_jfreefrag(jfreefrag)
+ struct jfreefrag *jfreefrag;
+{
+
+ if (jfreefrag->fr_state & IOSTARTED)
+ WORKLIST_REMOVE(&jfreefrag->fr_list);
+ else if (jfreefrag->fr_state & ONWORKLIST)
+ remove_from_journal(&jfreefrag->fr_list);
+ if (jfreefrag->fr_freefrag != NULL)
+ panic("free_jfreefrag: Still attached to a freefrag.");
+ WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
+}
+
+/*
+ * Called when the journal write for a jfreefrag completes. The parent
+ * freefrag is added to the worklist if this completes its dependencies.
+ */
+static void
+handle_written_jfreefrag(jfreefrag)
+ struct jfreefrag *jfreefrag;
+{
+ struct jsegdep *jsegdep;
+ struct freefrag *freefrag;
+
+ /* Grab the jsegdep. */
+ jsegdep = jfreefrag->fr_jsegdep;
+ jfreefrag->fr_jsegdep = NULL;
+ freefrag = jfreefrag->fr_freefrag;
+ if (freefrag == NULL)
+ panic("handle_written_jfreefrag: No freefrag.");
+ freefrag->ff_state |= DEPCOMPLETE;
+ freefrag->ff_jfreefrag = NULL;
+ WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
+ if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(&freefrag->ff_list, 0);
+ jfreefrag->fr_freefrag = NULL;
+ free_jfreefrag(jfreefrag);
+}
+
+/*
+ * Called when the journal write for a jfreeblk completes. The jfreeblk
+ * is removed from the freeblks list of pending journal writes and the
+ * jsegdep is moved to the freeblks jwork to be completed when all blocks
+ * have been reclaimed.
+ */
+static void
+handle_written_jfreeblk(jfreeblk)
+ struct jfreeblk *jfreeblk;
+{
+ struct freeblks *freeblks;
+ struct jsegdep *jsegdep;
+
+ /* Grab the jsegdep. */
+ jsegdep = jfreeblk->jf_jsegdep;
+ jfreeblk->jf_jsegdep = NULL;
+ freeblks = jfreeblk->jf_freeblks;
+ LIST_REMOVE(jfreeblk, jf_deps);
+ WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
+ /*
+ * If the freeblks is all journaled, we can add it to the worklist.
+ */
+ if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
+ (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ /* Remove from the b_dep that is waiting on this write. */
+ if (freeblks->fb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&freeblks->fb_list);
+ add_to_worklist(&freeblks->fb_list, 1);
+ }
+
+ free_jfreeblk(jfreeblk);
+}
+
+static struct jsegdep *
+newjsegdep(struct worklist *wk)
+{
+ struct jsegdep *jsegdep;
+
+ jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
+ jsegdep->jd_seg = NULL;
+
+ return (jsegdep);
+}
+
+static struct jmvref *
+newjmvref(dp, ino, oldoff, newoff)
+ struct inode *dp;
+ ino_t ino;
+ off_t oldoff;
+ off_t newoff;
+{
+ struct jmvref *jmvref;
+
+ jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
+ jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
+ jmvref->jm_parent = dp->i_number;
+ jmvref->jm_ino = ino;
+ jmvref->jm_oldoff = oldoff;
+ jmvref->jm_newoff = newoff;
+
+ return (jmvref);
+}
+
+/*
+ * Allocate a new jremref that tracks the removal of ip from dp with the
+ * directory entry offset of diroff. Mark the entry as ATTACHED and
+ * DEPCOMPLETE as we have all the information required for the journal write
+ * and the directory entry has already been removed from the buffer. The caller
+ * is responsible for linking the jremref into the pagedep and adding it
+ * to the journal to write. The MKDIR_PARENT flag is set if we're doing
+ * a DOTDOT addition so handle_workitem_remove() can properly assign
+ * the jsegdep when we're done.
+ */
+static struct jremref *
+newjremref(dirrem, dp, ip, diroff, nlink)
+ struct dirrem *dirrem;
+ struct inode *dp;
+ struct inode *ip;
+ off_t diroff;
+ nlink_t nlink;
+{
+ struct jremref *jremref;
+
+ jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
+ jremref->jr_state = ATTACHED;
+ newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
+ nlink, ip->i_mode);
+ jremref->jr_dirrem = dirrem;
+
+ return (jremref);
+}
+
+static inline void
+newinoref(inoref, ino, parent, diroff, nlink, mode)
+ struct inoref *inoref;
+ ino_t ino;
+ ino_t parent;
+ off_t diroff;
+ nlink_t nlink;
+ uint16_t mode;
+{
+
+ inoref->if_jsegdep = newjsegdep(&inoref->if_list);
+ inoref->if_diroff = diroff;
+ inoref->if_ino = ino;
+ inoref->if_parent = parent;
+ inoref->if_nlink = nlink;
+ inoref->if_mode = mode;
+}
+
+/*
+ * Allocate a new jaddref to track the addition of ino to dp at diroff. The
+ * directory offset may not be known until later. The caller is
+ * responsible for adding the entry to the journal when this information
+ * is available. nlink should be the link count prior to the addition and
+ * mode is only required to have the correct FMT.
+ */
+static struct jaddref *
+newjaddref(dp, ino, diroff, nlink, mode)
+ struct inode *dp;
+ ino_t ino;
+ off_t diroff;
+ int16_t nlink;
+ uint16_t mode;
+{
+ struct jaddref *jaddref;
+
+ jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
+ jaddref->ja_state = ATTACHED;
+ jaddref->ja_mkdir = NULL;
+ newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
+
+ return (jaddref);
+}
+
+/*
+ * Create a new free dependency for a freework. The caller is responsible
+ * for adjusting the reference count when it has the lock held. The freedep
+ * will track an outstanding bitmap write that will ultimately clear the
+ * freework to continue.
+ */
+static struct freedep *
+newfreedep(struct freework *freework)
+{
+ struct freedep *freedep;
+
+ freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
+ workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
+ freedep->fd_freework = freework;
+
+ return (freedep);
+}
+
+/*
+ * Free a freedep structure once the buffer it is linked to is written. If
+ * this is the last reference to the freework schedule it for completion.
+ */
+static void
+free_freedep(freedep)
+ struct freedep *freedep;
+{
+
+ if (--freedep->fd_freework->fw_ref == 0)
+ add_to_worklist(&freedep->fd_freework->fw_list, 1);
+ WORKITEM_FREE(freedep, D_FREEDEP);
+}
+
+/*
+ * Allocate a new freework structure that may be a level in an indirect
+ * when parent is not NULL or a top level block when it is NULL. The top
+ * level freework structures are allocated without lk held and before the
+ * freeblks is visible outside of softdep_setup_freeblocks().
+ */
+static struct freework *
+newfreework(freeblks, parent, lbn, nb, frags, journal)
+ struct freeblks *freeblks;
+ struct freework *parent;
+ ufs_lbn_t lbn;
+ ufs2_daddr_t nb;
+ int frags;
+ int journal;
+{
+ struct freework *freework;
+
+ freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
+ workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
+ freework->fw_freeblks = freeblks;
+ freework->fw_parent = parent;
+ freework->fw_lbn = lbn;
+ freework->fw_blkno = nb;
+ freework->fw_frags = frags;
+ freework->fw_ref = 0;
+ freework->fw_off = 0;
+ LIST_INIT(&freework->fw_jwork);
+
+ if (parent == NULL) {
+ WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
+ &freework->fw_list);
+ freeblks->fb_ref++;
+ }
+ if (journal)
+ newjfreeblk(freeblks, lbn, nb, frags);
+
+ return (freework);
+}
+
+/*
+ * Allocate a new jfreeblk to journal the release of a top level block
+ * pointer when truncating a file. The caller must add this to the
+ * worklist when lk is held.
+ */
+static struct jfreeblk *
+newjfreeblk(freeblks, lbn, blkno, frags)
+ struct freeblks *freeblks;
+ ufs_lbn_t lbn;
+ ufs2_daddr_t blkno;
+ int frags;
+{
+ struct jfreeblk *jfreeblk;
+
+ jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
+ jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
+ jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
+ jfreeblk->jf_ino = freeblks->fb_previousinum;
+ jfreeblk->jf_lbn = lbn;
+ jfreeblk->jf_blkno = blkno;
+ jfreeblk->jf_frags = frags;
+ jfreeblk->jf_freeblks = freeblks;
+ LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
+
+ return (jfreeblk);
+}
+
+static void move_newblock_dep(struct jaddref *, struct inodedep *);
+/*
+ * If we're canceling a new bitmap we have to search for another ref
+ * to move into the bmsafemap dep. This might be better expressed
+ * with another structure.
+ */
+static void
+move_newblock_dep(jaddref, inodedep)
+ struct jaddref *jaddref;
+ struct inodedep *inodedep;
+{
+ struct inoref *inoref;
+ struct jaddref *jaddrefn;
+
+ jaddrefn = NULL;
+ for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
+ inoref = TAILQ_NEXT(inoref, if_deps)) {
+ if ((jaddref->ja_state & NEWBLOCK) &&
+ inoref->if_list.wk_type == D_JADDREF) {
+ jaddrefn = (struct jaddref *)inoref;
+ break;
+ }
+ }
+ if (jaddrefn == NULL)
+ return;
+ jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
+ jaddrefn->ja_state |= jaddref->ja_state &
+ (ATTACHED | UNDONE | NEWBLOCK);
+ jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
+ jaddref->ja_state |= ATTACHED;
+ LIST_REMOVE(jaddref, ja_bmdeps);
+ LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
+ ja_bmdeps);
+}
+
+/*
+ * Cancel a jaddref either before it has been written or while it is being
+ * written. This happens when a link is removed before the add reaches
+ * the disk. The jaddref dependency is kept linked into the bmsafemap
+ * and inode to prevent the link count or bitmap from reaching the disk
+ * until handle_workitem_remove() re-adjusts the counts and bitmaps as
+ * required.
+ *
+ * Returns 1 if the canceled addref requires journaling of the remove and
+ * 0 otherwise.
+ */
+static int
+cancel_jaddref(jaddref, inodedep, wkhd)
+ struct jaddref *jaddref;
+ struct inodedep *inodedep;
+ struct workhead *wkhd;
+{
+ struct inoref *inoref;
+ struct jsegdep *jsegdep;
+ int needsj;
+
+ KASSERT((jaddref->ja_state & COMPLETE) == 0,
+ ("cancel_jaddref: Canceling complete jaddref"));
+ if (jaddref->ja_state & (IOSTARTED | COMPLETE))
+ needsj = 1;
+ else
+ needsj = 0;
+ if (inodedep == NULL)
+ if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
+ 0, &inodedep) == 0)
+ panic("cancel_jaddref: Lost inodedep");
+ /*
+ * We must adjust the nlink of any reference operation that follows
+ * us so that it is consistent with the in-memory reference. This
+ * ensures that inode nlink rollbacks always have the correct link.
+ */
+ if (needsj == 0)
+ for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
+ inoref = TAILQ_NEXT(inoref, if_deps))
+ inoref->if_nlink--;
+ jsegdep = inoref_jseg(&jaddref->ja_ref);
+ if (jaddref->ja_state & NEWBLOCK)
+ move_newblock_dep(jaddref, inodedep);
+ if (jaddref->ja_state & IOWAITING) {
+ jaddref->ja_state &= ~IOWAITING;
+ wakeup(&jaddref->ja_list);
+ }
+ jaddref->ja_mkdir = NULL;
+ if (jaddref->ja_state & IOSTARTED) {
+ jaddref->ja_state &= ~IOSTARTED;
+ WORKLIST_REMOVE(&jaddref->ja_list);
+ WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
+ } else {
+ free_jsegdep(jsegdep);
+ remove_from_journal(&jaddref->ja_list);
+ }
+ /*
+ * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
+ * can arrange for them to be freed with the bitmap. Otherwise we
+ * no longer need this addref attached to the inoreflst and it
+ * will incorrectly adjust nlink if we leave it.
+ */
+ if ((jaddref->ja_state & NEWBLOCK) == 0) {
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ jaddref->ja_state |= COMPLETE;
+ free_jaddref(jaddref);
+ return (needsj);
+ }
+ jaddref->ja_state |= GOINGAWAY;
+ /*
+ * Leave the head of the list for jsegdeps for fast merging.
+ */
+ if (LIST_FIRST(wkhd) != NULL) {
+ jaddref->ja_state |= ONWORKLIST;
+ LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
+ } else
+ WORKLIST_INSERT(wkhd, &jaddref->ja_list);
+
+ return (needsj);
+}
+
+/*
+ * Attempt to free a jaddref structure when some work completes. This
+ * should only succeed once the entry is written and all dependencies have
+ * been notified.
+ */
+static void
+free_jaddref(jaddref)
+ struct jaddref *jaddref;
+{
+
+ if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
+ return;
+ if (jaddref->ja_ref.if_jsegdep)
+ panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
+ jaddref, jaddref->ja_state);
+ if (jaddref->ja_state & NEWBLOCK)
+ LIST_REMOVE(jaddref, ja_bmdeps);
+ if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
+ panic("free_jaddref: Bad state %p(0x%X)",
+ jaddref, jaddref->ja_state);
+ if (jaddref->ja_mkdir != NULL)
+ panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
+ WORKITEM_FREE(jaddref, D_JADDREF);
+}
+
+/*
+ * Free a jremref structure once it has been written or discarded.
+ */
+static void
+free_jremref(jremref)
+ struct jremref *jremref;
+{
+
+ if (jremref->jr_ref.if_jsegdep)
+ free_jsegdep(jremref->jr_ref.if_jsegdep);
+ if (jremref->jr_state & IOSTARTED)
+ panic("free_jremref: IO still pending");
+ WORKITEM_FREE(jremref, D_JREMREF);
+}
+
+/*
+ * Free a jnewblk structure.
+ */
+static void
+free_jnewblk(jnewblk)
+ struct jnewblk *jnewblk;
+{
+
+ if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
+ return;
+ LIST_REMOVE(jnewblk, jn_deps);
+ if (jnewblk->jn_newblk != NULL)
+ panic("free_jnewblk: Dependency still attached.");
+ WORKITEM_FREE(jnewblk, D_JNEWBLK);
+}
+
+/*
+ * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk
+ * is kept linked into the bmsafemap until the free completes, thus
+ * preventing the modified state from ever reaching disk. The free
+ * routine must pass this structure via ffs_blkfree() to
+ * softdep_setup_freeblks() so there is no race in releasing the space.
+ */
+static void
+cancel_jnewblk(jnewblk, wkhd)
+ struct jnewblk *jnewblk;
+ struct workhead *wkhd;
+{
+ struct jsegdep *jsegdep;
+
+ jsegdep = jnewblk->jn_jsegdep;
+ jnewblk->jn_jsegdep = NULL;
+ free_jsegdep(jsegdep);
+ jnewblk->jn_newblk = NULL;
+ jnewblk->jn_state |= GOINGAWAY;
+ if (jnewblk->jn_state & IOSTARTED) {
+ jnewblk->jn_state &= ~IOSTARTED;
+ WORKLIST_REMOVE(&jnewblk->jn_list);
+ } else
+ remove_from_journal(&jnewblk->jn_list);
+ /*
+ * Leave the head of the list for jsegdeps for fast merging.
+ */
+ if (LIST_FIRST(wkhd) != NULL) {
+ jnewblk->jn_state |= ONWORKLIST;
+ LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
+ } else
+ WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
+ if (jnewblk->jn_state & IOWAITING) {
+ jnewblk->jn_state &= ~IOWAITING;
+ wakeup(&jnewblk->jn_list);
+ }
+}
+
+static void
+free_jfreeblk(jfreeblk)
+ struct jfreeblk *jfreeblk;
+{
+
+ WORKITEM_FREE(jfreeblk, D_JFREEBLK);
+}
+
+/*
+ * Release one reference to a jseg and free it if the count reaches 0. This
+ * should eventually reclaim journal space as well.
+ */
+static void
+free_jseg(jseg)
+ struct jseg *jseg;
+{
+ struct jblocks *jblocks;
+
+ KASSERT(jseg->js_refs > 0,
+ ("free_jseg: Invalid refcnt %d", jseg->js_refs));
+ if (--jseg->js_refs != 0)
+ return;
+ /*
+ * Free only those jsegs that have no segments allocated before them,
+ * to preserve the journal space ordering.
+ */
+ jblocks = jseg->js_jblocks;
+ while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
+ jblocks->jb_oldestseq = jseg->js_seq;
+ if (jseg->js_refs != 0)
+ break;
+ TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
+ jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
+ KASSERT(LIST_EMPTY(&jseg->js_entries),
+ ("free_jseg: Freed jseg has valid entries."));
+ WORKITEM_FREE(jseg, D_JSEG);
+ }
+}
+
+/*
+ * Release a jsegdep and decrement the jseg count.
+ */
+static void
+free_jsegdep(jsegdep)
+ struct jsegdep *jsegdep;
+{
+
+ if (jsegdep->jd_seg)
+ free_jseg(jsegdep->jd_seg);
+ WORKITEM_FREE(jsegdep, D_JSEGDEP);
+}
+
+/*
+ * Wait for a journal item to make it to disk. Initiate journal processing
+ * if required.
+ */
+static void
+jwait(wk)
+ struct worklist *wk;
+{
+
+ stat_journal_wait++;
+ /*
+ * If IO has not started, we process the journal. We can't mark the
+ * worklist item as IOWAITING because we drop the lock while
+ * processing the journal and the worklist entry may be freed after
+ * this point. The caller may call back in and re-issue the request.
+ */
+ if ((wk->wk_state & IOSTARTED) == 0) {
+ softdep_process_journal(wk->wk_mp, MNT_WAIT);
+ return;
+ }
+ wk->wk_state |= IOWAITING;
+ msleep(wk, &lk, PRIBIO, "jwait", 0);
+}
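
Since jwait() may return after merely initiating journal processing, and since it drops lk while sleeping, callers re-test their wait condition in a loop. softdep_setup_trunc() below uses exactly this idiom:

	while (jsegdep->jd_seg == NULL) {
		stat_jwait_freeblks++;
		jwait(&jtrunc->jt_list);
	}
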
+
+/*
+ * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
+ * appropriate. This is a convenience function to reduce duplicate code
+ * for the setup and revert functions below.
+ */
+static struct inodedep *
+inodedep_lookup_ip(ip)
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+
+ KASSERT(ip->i_nlink >= ip->i_effnlink,
+ ("inodedep_lookup_ip: bad delta"));
+ (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
+ DEPALLOC, &inodedep);
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+
+ return (inodedep);
+}
+
+/*
+ * Create a journal entry that describes a truncate that we're about to
+ * perform. The inode allocations and frees between here and the completion
+ * of the operation are done asynchronously and without journaling. At
+ * the end of the operation the vnode is sync'd and the journal space
+ * is released. Recovery will discover the partially completed truncate
+ * and complete it.
+ */
+void *
+softdep_setup_trunc(vp, length, flags)
+ struct vnode *vp;
+ off_t length;
+ int flags;
+{
+ struct jsegdep *jsegdep;
+ struct jtrunc *jtrunc;
+ struct ufsmount *ump;
+ struct inode *ip;
+
+ softdep_prealloc(vp, MNT_WAIT);
+ ip = VTOI(vp);
+ ump = VFSTOUFS(vp->v_mount);
+ jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
+ jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
+ jtrunc->jt_ino = ip->i_number;
+ jtrunc->jt_extsize = 0;
+ jtrunc->jt_size = length;
+ if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
+ jtrunc->jt_extsize = ip->i_din2->di_extsize;
+ if ((flags & IO_NORMAL) == 0)
+ jtrunc->jt_size = DIP(ip, i_size);
+ ACQUIRE_LOCK(&lk);
+ add_to_journal(&jtrunc->jt_list);
+ while (jsegdep->jd_seg == NULL) {
+ stat_jwait_freeblks++;
+ jwait(&jtrunc->jt_list);
+ }
+ FREE_LOCK(&lk);
+
+ return (jsegdep);
+}
+
+/*
+ * After synchronous truncation is complete, we fsync the vnode and
+ * release the jsegdep so the journal space can be freed.
+ */
+int
+softdep_complete_trunc(vp, cookie)
+ struct vnode *vp;
+ void *cookie;
+{
+ int error;
+
+ error = ffs_syncvnode(vp, MNT_WAIT);
+ ACQUIRE_LOCK(&lk);
+ free_jsegdep((struct jsegdep *)cookie);
+ FREE_LOCK(&lk);
+
+ return (error);
+}
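
Taken together, these two routines bracket a synchronous truncate. A minimal caller sketch assembled from the comments and signatures above; the middle step stands in for whatever unjournaled block frees the caller performs:

	void *cookie;
	int error;

	cookie = softdep_setup_trunc(vp, length, flags);
	/* ... free the file's blocks; these frees are not journaled ... */
	error = softdep_complete_trunc(vp, cookie);
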
+
+/*
+ * Called prior to creating a new inode and linking it to a directory. The
+ * jaddref structure must already be allocated by softdep_setup_inomapdep
+ * and it is discovered here so we can initialize the mode and update
+ * nlinkdelta.
+ */
+void
+softdep_setup_create(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ KASSERT(ip->i_nlink == 1,
+ ("softdep_setup_create: Invalid link count."));
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_create: No addref structure present."));
+ jaddref->ja_mode = ip->i_mode;
+ }
+ softdep_prelink(dvp, NULL);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Create a jaddref structure to track the addition of a DOTDOT link when
+ * we are reparenting an inode as part of a rename. This jaddref will be
+ * found by softdep_setup_directory_change. Adjusts nlinkdelta for
+ * non-journaling softdep.
+ */
+void
+softdep_setup_dotdot_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+ struct vnode *vp;
+
+ dvp = ITOV(dp);
+ vp = ITOV(ip);
+ jaddref = NULL;
+ /*
+ * We don't set MKDIR_PARENT as this is not tied to a mkdir and
+ * is used as a normal link would be.
+ */
+ if (DOINGSUJ(dvp))
+ jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
+ dp->i_effnlink - 1, dp->i_mode);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(dp);
+ if (jaddref)
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Create a jaddref structure to track a new link to an inode. The directory
+ * offset is not known until softdep_setup_directory_add or
+ * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling
+ * softdep.
+ */
+void
+softdep_setup_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ jaddref = NULL;
+ if (DOINGSUJ(dvp))
+ jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
+ ip->i_mode);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (jaddref)
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to create the jaddref structures to track . and .. references as
+ * well as to look up and further initialize the incomplete jaddref created
+ * by softdep_setup_inomapdep when the inode was allocated. Adjusts
+ * nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_setup_mkdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *dotdotaddref;
+ struct jaddref *dotaddref;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ dotaddref = dotdotaddref = NULL;
+ if (DOINGSUJ(dvp)) {
+ dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
+ ip->i_mode);
+ dotaddref->ja_state |= MKDIR_BODY;
+ dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
+ dp->i_effnlink - 1, dp->i_mode);
+ dotdotaddref->ja_state |= MKDIR_PARENT;
+ }
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL,
+ ("softdep_setup_mkdir: No addref structure present."));
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_mkdir: bad parent %d",
+ jaddref->ja_parent));
+ jaddref->ja_mode = ip->i_mode;
+ TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
+ if_deps);
+ }
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp))
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
+ &dotdotaddref->ja_ref, if_deps);
+ softdep_prelink(ITOV(dp), NULL);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to track nlinkdelta of the inode and parent directories prior to
+ * unlinking a directory.
+ */
+void
+softdep_setup_rmdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to track nlinkdelta of the inode and parent directories prior to
+ * unlink.
+ */
+void
+softdep_setup_unlink(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ softdep_prelink(dvp, ITOV(ip));
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed non-directory
+ * creation. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_create(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_create: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed dotdot link
+ * creation. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_dotdot_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_dotdot_link: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed link
+ * addition. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_link(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_link: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to release the journal structures created by a failed mkdir
+ * attempt. Adjusts nlinkdelta for non-journaling softdep.
+ */
+void
+softdep_revert_mkdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct vnode *dvp;
+
+ dvp = ITOV(dp);
+
+ ACQUIRE_LOCK(&lk);
+ inodedep = inodedep_lookup_ip(dp);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_mkdir: dotdot addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ inodedep = inodedep_lookup_ip(ip);
+ if (DOINGSUJ(dvp)) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == dp->i_number,
+ ("softdep_revert_mkdir: addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref->ja_parent == ip->i_number,
+ ("softdep_revert_mkdir: dot addref parent mismatch"));
+ cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called to correct nlinkdelta after a failed rmdir.
+ */
+void
+softdep_revert_rmdir(dp, ip)
+ struct inode *dp;
+ struct inode *ip;
+{
+
+ ACQUIRE_LOCK(&lk);
+ (void) inodedep_lookup_ip(ip);
+ (void) inodedep_lookup_ip(dp);
+ FREE_LOCK(&lk);
+}
+
/*
* Protecting the freemaps (or bitmaps).
*
@@ -1536,6 +3971,22 @@ softdep_setup_inomapdep(bp, ip, newinum)
{
struct inodedep *inodedep;
struct bmsafemap *bmsafemap;
+ struct jaddref *jaddref;
+ struct mount *mp;
+ struct fs *fs;
+
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_ump->um_fs;
+ jaddref = NULL;
+
+ /*
+ * Allocate the journal reference add structure so that the bitmap
+ * can be dependent on it.
+ */
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = newjaddref(ip, newinum, 0, 0, 0);
+ jaddref->ja_state |= NEWBLOCK;
+ }
/*
* Create a dependency for the newly allocated inode.
@@ -1544,14 +3995,20 @@ softdep_setup_inomapdep(bp, ip, newinum)
* the cylinder group map from which it was allocated.
*/
ACQUIRE_LOCK(&lk);
- if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
- &inodedep)))
- panic("softdep_setup_inomapdep: dependency for new inode "
- "already exists");
- inodedep->id_buf = bp;
+ if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
+ panic("softdep_setup_inomapdep: dependency %p for new"
+ "inode already exists", inodedep);
+ bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
+ if (jaddref) {
+ LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ } else {
+ inodedep->id_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
+ }
+ inodedep->id_bmsafemap = bmsafemap;
inodedep->id_state &= ~DEPCOMPLETE;
- bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
- LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
FREE_LOCK(&lk);
}
@@ -1560,29 +4017,98 @@ softdep_setup_inomapdep(bp, ip, newinum)
* allocate block or fragment.
*/
void
-softdep_setup_blkmapdep(bp, mp, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
struct buf *bp; /* buffer for cylgroup block with block map */
struct mount *mp; /* filesystem doing allocation */
ufs2_daddr_t newblkno; /* number of newly allocated block */
+ int frags; /* Number of fragments. */
+ int oldfrags; /* Previous number of fragments for extend. */
{
struct newblk *newblk;
struct bmsafemap *bmsafemap;
+ struct jnewblk *jnewblk;
struct fs *fs;
fs = VFSTOUFS(mp)->um_fs;
+ jnewblk = NULL;
/*
* Create a dependency for the newly allocated block.
* Add it to the dependency list for the buffer holding
* the cylinder group map from which it was allocated.
*/
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
+ workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
+ jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
+ jnewblk->jn_state = ATTACHED;
+ jnewblk->jn_blkno = newblkno;
+ jnewblk->jn_frags = frags;
+ jnewblk->jn_oldfrags = oldfrags;
+#ifdef SUJ_DEBUG
+ {
+ struct cg *cgp;
+ uint8_t *blksfree;
+ long bno;
+ int i;
+
+ cgp = (struct cg *)bp->b_data;
+ blksfree = cg_blksfree(cgp);
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (isset(blksfree, bno + i))
+ panic("softdep_setup_blkmapdep: "
+ "free fragment %d from %d-%d "
+ "state 0x%X dep %p", i,
+ jnewblk->jn_oldfrags,
+ jnewblk->jn_frags,
+ jnewblk->jn_state,
+ jnewblk->jn_newblk);
+ }
+ }
+#endif
+ }
ACQUIRE_LOCK(&lk);
- if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
+ if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
panic("softdep_setup_blkmapdep: found block");
- newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
- LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
+ dtog(fs, newblkno));
+ if (jnewblk) {
+ jnewblk->jn_newblk = newblk;
+ LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
+ } else {
+ newblk->nb_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
+ }
+ newblk->nb_bmsafemap = bmsafemap;
+ newblk->nb_jnewblk = jnewblk;
FREE_LOCK(&lk);
}
+#define BMSAFEMAP_HASH(fs, cg) \
+ (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
+
+static int
+bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
+ struct bmsafemap_hashhead *bmsafemaphd;
+ struct mount *mp;
+ int cg;
+ struct bmsafemap **bmsafemapp;
+{
+ struct bmsafemap *bmsafemap;
+
+ LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
+ if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
+ break;
+ if (bmsafemap) {
+ *bmsafemapp = bmsafemap;
+ return (1);
+ }
+ *bmsafemapp = NULL;
+
+ return (0);
+}
+
/*
* Find the bmsafemap associated with a cylinder group buffer.
* If none exists, create one. The buffer must be locked when
@@ -1590,27 +4116,43 @@ softdep_setup_blkmapdep(bp, mp, newblkno)
* splbio interrupts blocked.
*/
static struct bmsafemap *
-bmsafemap_lookup(mp, bp)
+bmsafemap_lookup(mp, bp, cg)
struct mount *mp;
struct buf *bp;
+ int cg;
{
- struct bmsafemap *bmsafemap;
+ struct bmsafemap_hashhead *bmsafemaphd;
+ struct bmsafemap *bmsafemap, *collision;
struct worklist *wk;
+ struct fs *fs;
mtx_assert(&lk, MA_OWNED);
- LIST_FOREACH(wk, &bp->b_dep, wk_list)
- if (wk->wk_type == D_BMSAFEMAP)
- return (WK_BMSAFEMAP(wk));
+ if (bp)
+ LIST_FOREACH(wk, &bp->b_dep, wk_list)
+ if (wk->wk_type == D_BMSAFEMAP)
+ return (WK_BMSAFEMAP(wk));
+ fs = VFSTOUFS(mp)->um_fs;
+ bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
+ if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
+ return (bmsafemap);
FREE_LOCK(&lk);
bmsafemap = malloc(sizeof(struct bmsafemap),
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
bmsafemap->sm_buf = bp;
- LIST_INIT(&bmsafemap->sm_allocdirecthd);
- LIST_INIT(&bmsafemap->sm_allocindirhd);
LIST_INIT(&bmsafemap->sm_inodedephd);
+ LIST_INIT(&bmsafemap->sm_inodedepwr);
LIST_INIT(&bmsafemap->sm_newblkhd);
+ LIST_INIT(&bmsafemap->sm_newblkwr);
+ LIST_INIT(&bmsafemap->sm_jaddrefhd);
+ LIST_INIT(&bmsafemap->sm_jnewblkhd);
ACQUIRE_LOCK(&lk);
+ if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
+ WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ return (collision);
+ }
+ bmsafemap->sm_cg = cg;
+ LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
return (bmsafemap);
}
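
Note the unlock/allocate/relock pattern above: lk cannot be held across malloc(), so after the lock is reacquired the lookup is repeated, and a loser of the race frees its copy in favor of the collision. A generic sketch of the idiom with hypothetical names (find, M_TYPE, D_TYPE):

	if (find(head, key, &item) == 1)
		return (item);
	FREE_LOCK(&lk);
	item = malloc(sizeof(*item), M_TYPE, M_SOFTDEP_FLAGS);
	/* ... initialize item while unlocked ... */
	ACQUIRE_LOCK(&lk);
	if (find(head, key, &collision) == 1) {
		WORKITEM_FREE(item, D_TYPE);	/* lost the race */
		return (collision);
	}
	LIST_INSERT_HEAD(head, item, hash_entry);
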
@@ -1645,9 +4187,9 @@ bmsafemap_lookup(mp, bp)
* unreferenced fragments.
*/
void
-softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
+softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip; /* inode to which block is being added */
- ufs_lbn_t lbn; /* block pointer within inode */
+ ufs_lbn_t off; /* block pointer within inode */
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
long newsize; /* size of new block */
@@ -1656,34 +4198,33 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
- struct bmsafemap *bmsafemap;
+ struct freefrag *freefrag;
struct inodedep *inodedep;
struct pagedep *pagedep;
+ struct jnewblk *jnewblk;
struct newblk *newblk;
struct mount *mp;
+ ufs_lbn_t lbn;
+ lbn = bp->b_lblkno;
mp = UFSTOVFS(ip->i_ump);
- adp = malloc(sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
- adp->ad_lbn = lbn;
- adp->ad_newblkno = newblkno;
- adp->ad_oldblkno = oldblkno;
- adp->ad_newsize = newsize;
- adp->ad_oldsize = oldsize;
- adp->ad_state = ATTACHED;
- LIST_INIT(&adp->ad_newdirblk);
- if (newblkno == oldblkno)
- adp->ad_freefrag = NULL;
+ if (oldblkno && oldblkno != newblkno)
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
else
- adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
+ freefrag = NULL;
ACQUIRE_LOCK(&lk);
- if (lbn >= NDADDR) {
+ if (off >= NDADDR) {
+ if (lbn > 0)
+ panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
+ lbn, off);
/* allocating an indirect block */
if (oldblkno != 0)
panic("softdep_setup_allocdirect: non-zero indir");
} else {
+ if (off != lbn)
+ panic("softdep_setup_allocdirect: lbn %jd != off %jd",
+ lbn, off);
/*
* Allocating a direct block.
*
@@ -1692,26 +4233,39 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
- pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
+ pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
+ &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
}
- if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
+ if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocdirect: lost block");
- if (newblk->nb_state == DEPCOMPLETE) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- adp->ad_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("softdep_setup_allocdirect: newblk already initialized"));
+ /*
+ * Convert the newblk to an allocdirect.
+ */
+ newblk->nb_list.wk_type = D_ALLOCDIRECT;
+ adp = (struct allocdirect *)newblk;
+ newblk->nb_freefrag = freefrag;
+ adp->ad_offset = off;
+ adp->ad_oldblkno = oldblkno;
+ adp->ad_newsize = newsize;
+ adp->ad_oldsize = oldsize;
+ /*
+ * Finish initializing the journal.
+ */
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
- WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
+
+ WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
@@ -1726,24 +4280,25 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
*/
adphead = &inodedep->id_newinoupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
- if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
+ if (oldadp == NULL || oldadp->ad_offset <= off) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
- if (oldadp != NULL && oldadp->ad_lbn == lbn)
+ if (oldadp != NULL && oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
- if (oldadp->ad_lbn >= lbn)
+ if (oldadp->ad_offset >= off)
break;
}
if (oldadp == NULL)
panic("softdep_setup_allocdirect: lost entry");
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
- if (oldadp->ad_lbn == lbn)
+ if (oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
+
FREE_LOCK(&lk);
}
@@ -1761,10 +4316,11 @@ allocdirect_merge(adphead, newadp, oldadp)
struct freefrag *freefrag;
struct newdirblk *newdirblk;
+ freefrag = NULL;
mtx_assert(&lk, MA_OWNED);
if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
newadp->ad_oldsize != oldadp->ad_newsize ||
- newadp->ad_lbn >= NDADDR)
+ newadp->ad_offset >= NDADDR)
panic("%s %jd != new %jd || old size %ld != new %ld",
"allocdirect_merge: old blkno",
(intmax_t)newadp->ad_oldblkno,
@@ -1779,7 +4335,7 @@ allocdirect_merge(adphead, newadp, oldadp)
* This action is done by swapping the freefrag dependencies.
* The new dependency gains the old one's freefrag, and the
* old one gets the new one and then immediately puts it on
- * the worklist when it is freed by free_allocdirect. It is
+ * the worklist when it is freed by free_newblk. It is
* not possible to do this swap when the old dependency had a
* non-zero size but no previous fragment to free. This condition
* arises when the new block is an extension of the old block.
@@ -1788,8 +4344,8 @@ allocdirect_merge(adphead, newadp, oldadp)
* the old dependency, so cannot legitimately be freed until the
* conditions for the new dependency are fulfilled.
*/
+ freefrag = newadp->ad_freefrag;
if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
- freefrag = newadp->ad_freefrag;
newadp->ad_freefrag = oldadp->ad_freefrag;
oldadp->ad_freefrag = freefrag;
}
@@ -1804,32 +4360,118 @@ allocdirect_merge(adphead, newadp, oldadp)
panic("allocdirect_merge: extra newdirblk");
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
}
- free_allocdirect(adphead, oldadp, 0);
+ TAILQ_REMOVE(adphead, oldadp, ad_next);
+ /*
+ * We need to move any journal dependencies over to the freefrag
+ * that releases this block, if one exists. Otherwise we are
+ * extending an existing block, and we'll wait until that is
+ * complete to release the journal space and extend the
+ * new journal to cover this old space as well.
+ */
+ if (freefrag == NULL) {
+ struct jnewblk *jnewblk;
+ struct jnewblk *njnewblk;
+
+ if (oldadp->ad_newblkno != newadp->ad_newblkno)
+ panic("allocdirect_merge: %jd != %jd",
+ oldadp->ad_newblkno, newadp->ad_newblkno);
+ jnewblk = oldadp->ad_block.nb_jnewblk;
+ cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
+ /*
+ * We have an unwritten jnewblk; we need to merge the
+ * frag bits with our own. The newer adp's journal cannot
+ * be written prior to the old one, so there is no need to
+ * check for it here.
+ */
+ if (jnewblk) {
+ njnewblk = newadp->ad_block.nb_jnewblk;
+ if (njnewblk == NULL)
+ panic("allocdirect_merge: No jnewblk");
+ if (jnewblk->jn_state & UNDONE) {
+ njnewblk->jn_state |= UNDONE | NEWBLOCK;
+ njnewblk->jn_state &= ~ATTACHED;
+ jnewblk->jn_state &= ~UNDONE;
+ }
+ njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
+ WORKLIST_REMOVE(&jnewblk->jn_list);
+ jnewblk->jn_state |= ATTACHED | COMPLETE;
+ free_jnewblk(jnewblk);
+ }
+ } else {
+ /*
+ * We can skip journaling for this freefrag and just complete
+ * any pending journal work for the allocdirect that is being
+ * removed after the freefrag completes.
+ */
+ if (freefrag->ff_jfreefrag)
+ cancel_jfreefrag(freefrag->ff_jfreefrag);
+ cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
+ }
+ free_newblk(&oldadp->ad_block);
}
-
+
/*
- * Allocate a new freefrag structure if needed.
+ * Allocate a jfreefrag structure to journal a single block free.
+ */
+static struct jfreefrag *
+newjfreefrag(freefrag, ip, blkno, size, lbn)
+ struct freefrag *freefrag;
+ struct inode *ip;
+ ufs2_daddr_t blkno;
+ long size;
+ ufs_lbn_t lbn;
+{
+ struct jfreefrag *jfreefrag;
+ struct fs *fs;
+
+ fs = ip->i_fs;
+ jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
+ M_SOFTDEP_FLAGS);
+ workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
+ jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
+ jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
+ jfreefrag->fr_ino = ip->i_number;
+ jfreefrag->fr_lbn = lbn;
+ jfreefrag->fr_blkno = blkno;
+ jfreefrag->fr_frags = numfrags(fs, size);
+ jfreefrag->fr_freefrag = freefrag;
+
+ return (jfreefrag);
+}
+
+/*
+ * Allocate a new freefrag structure.
*/
static struct freefrag *
-newfreefrag(ip, blkno, size)
+newfreefrag(ip, blkno, size, lbn)
struct inode *ip;
ufs2_daddr_t blkno;
long size;
+ ufs_lbn_t lbn;
{
struct freefrag *freefrag;
struct fs *fs;
- if (blkno == 0)
- return (NULL);
fs = ip->i_fs;
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
freefrag = malloc(sizeof(struct freefrag),
- M_FREEFRAG, M_SOFTDEP_FLAGS);
+ M_FREEFRAG, M_SOFTDEP_FLAGS);
workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
+ freefrag->ff_state = ATTACHED;
+ LIST_INIT(&freefrag->ff_jwork);
freefrag->ff_inum = ip->i_number;
freefrag->ff_blkno = blkno;
freefrag->ff_fragsize = size;
+
+ if (fs->fs_flags & FS_SUJ) {
+ freefrag->ff_jfreefrag =
+ newjfreefrag(freefrag, ip, blkno, size, lbn);
+ } else {
+ freefrag->ff_state |= DEPCOMPLETE;
+ freefrag->ff_jfreefrag = NULL;
+ }
+
return (freefrag);
}
@@ -1842,9 +4484,17 @@ handle_workitem_freefrag(freefrag)
struct freefrag *freefrag;
{
struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
+ struct workhead wkhd;
+ /*
+ * It would be illegal to add new completion items to the
+ * freefrag after it was scheduled to be done, so it must be
+ * safe to modify the list head here.
+ */
+ LIST_INIT(&wkhd);
+ LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
- freefrag->ff_fragsize, freefrag->ff_inum);
+ freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
ACQUIRE_LOCK(&lk);
WORKITEM_FREE(freefrag, D_FREEFRAG);
FREE_LOCK(&lk);
@@ -1856,9 +4506,9 @@ handle_workitem_freefrag(freefrag)
* See the description of softdep_setup_allocdirect above for details.
*/
void
-softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
+softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip;
- ufs_lbn_t lbn;
+ ufs_lbn_t off;
ufs2_daddr_t newblkno;
ufs2_daddr_t oldblkno;
long newsize;
@@ -1867,50 +4517,55 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
- struct bmsafemap *bmsafemap;
+ struct freefrag *freefrag;
struct inodedep *inodedep;
+ struct jnewblk *jnewblk;
struct newblk *newblk;
struct mount *mp;
+ ufs_lbn_t lbn;
+
+ if (off >= NXADDR)
+ panic("softdep_setup_allocext: lbn %lld > NXADDR",
+ (long long)off);
+ lbn = bp->b_lblkno;
mp = UFSTOVFS(ip->i_ump);
- adp = malloc(sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
- adp->ad_lbn = lbn;
- adp->ad_newblkno = newblkno;
- adp->ad_oldblkno = oldblkno;
- adp->ad_newsize = newsize;
- adp->ad_oldsize = oldsize;
- adp->ad_state = ATTACHED | EXTDATA;
- LIST_INIT(&adp->ad_newdirblk);
- if (newblkno == oldblkno)
- adp->ad_freefrag = NULL;
+ if (oldblkno && oldblkno != newblkno)
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
else
- adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
+ freefrag = NULL;
ACQUIRE_LOCK(&lk);
- if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
+ if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocext: lost block");
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("softdep_setup_allocext: newblk already initialized"));
+ /*
+ * Convert the newblk to an allocdirect.
+ */
+ newblk->nb_list.wk_type = D_ALLOCDIRECT;
+ adp = (struct allocdirect *)newblk;
+ newblk->nb_freefrag = freefrag;
+ adp->ad_offset = off;
+ adp->ad_oldblkno = oldblkno;
+ adp->ad_newsize = newsize;
+ adp->ad_oldsize = oldsize;
+ adp->ad_state |= EXTDATA;
+ /*
+ * Finish initializing the journal.
+ */
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
- if (newblk->nb_state == DEPCOMPLETE) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- adp->ad_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
-
- WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
- if (lbn >= NXADDR)
- panic("softdep_setup_allocext: lbn %lld > NXADDR",
- (long long)lbn);
+ WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
@@ -1925,23 +4580,23 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
*/
adphead = &inodedep->id_newextupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
- if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
+ if (oldadp == NULL || oldadp->ad_offset <= off) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
- if (oldadp != NULL && oldadp->ad_lbn == lbn)
+ if (oldadp != NULL && oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
- if (oldadp->ad_lbn >= lbn)
+ if (oldadp->ad_offset >= off)
break;
}
if (oldadp == NULL)
panic("softdep_setup_allocext: lost entry");
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
- if (oldadp->ad_lbn == lbn)
+ if (oldadp->ad_offset == off)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
}
@@ -1975,22 +4630,39 @@ softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
* Allocate a new allocindir structure.
*/
static struct allocindir *
-newallocindir(ip, ptrno, newblkno, oldblkno)
+newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
struct inode *ip; /* inode for file being extended */
int ptrno; /* offset of pointer in indirect block */
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
+ ufs_lbn_t lbn;
{
+ struct newblk *newblk;
struct allocindir *aip;
+ struct freefrag *freefrag;
+ struct jnewblk *jnewblk;
- aip = malloc(sizeof(struct allocindir),
- M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
- aip->ai_state = ATTACHED;
+ if (oldblkno)
+ freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
+ else
+ freefrag = NULL;
+ ACQUIRE_LOCK(&lk);
+ if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
+ panic("new_allocindir: lost block");
+ KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
+ ("newallocindir: newblk already initialized"));
+ newblk->nb_list.wk_type = D_ALLOCINDIR;
+ newblk->nb_freefrag = freefrag;
+ aip = (struct allocindir *)newblk;
aip->ai_offset = ptrno;
- aip->ai_newblkno = newblkno;
aip->ai_oldblkno = oldblkno;
- aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
+ if ((jnewblk = newblk->nb_jnewblk) != NULL) {
+ jnewblk->jn_ino = ip->i_number;
+ jnewblk->jn_lbn = lbn;
+ add_to_journal(&jnewblk->jn_list);
+ }
+ if (freefrag && freefrag->ff_jfreefrag != NULL)
+ add_to_journal(&freefrag->ff_jfreefrag->fr_list);
return (aip);
}
@@ -2008,22 +4680,28 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
struct buf *nbp; /* buffer holding allocated page */
{
+ struct inodedep *inodedep;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct mount *mp;
+ if (lbn != nbp->b_lblkno)
+ panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
+ lbn, bp->b_lblkno);
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
- aip = newallocindir(ip, ptrno, newblkno, oldblkno);
- ACQUIRE_LOCK(&lk);
+ mp = UFSTOVFS(ip->i_ump);
+ aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
+ (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
/*
* If we are allocating a directory page, then we must
* allocate an associated pagedep to track additions and
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
- pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
+ pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
- WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
- setup_allocindir_phase2(bp, ip, aip);
+ WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
+ setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
FREE_LOCK(&lk);
}
@@ -2039,38 +4717,68 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
int ptrno; /* offset of pointer in indirect block */
ufs2_daddr_t newblkno; /* disk block number being added */
{
+ struct inodedep *inodedep;
struct allocindir *aip;
+ ufs_lbn_t lbn;
+ lbn = nbp->b_lblkno;
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
- aip = newallocindir(ip, ptrno, newblkno, 0);
- ACQUIRE_LOCK(&lk);
- WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
- setup_allocindir_phase2(bp, ip, aip);
+ aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
+ inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
+ WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
+ setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
FREE_LOCK(&lk);
}
+static void
+indirdep_complete(indirdep)
+ struct indirdep *indirdep;
+{
+ struct allocindir *aip;
+
+ LIST_REMOVE(indirdep, ir_next);
+ indirdep->ir_state &= ~ONDEPLIST;
+
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
+ LIST_REMOVE(aip, ai_next);
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * If this indirdep is not attached to a buf it was simply waiting
+ * on completion to clear completehd. free_indirdep() asserts
+ * that nothing is dangling.
+ */
+ if ((indirdep->ir_state & ONWORKLIST) == 0)
+ free_indirdep(indirdep);
+}
+
/*
* Called to finish the allocation of the "aip" allocated
* by one of the two routines above.
*/
static void
-setup_allocindir_phase2(bp, ip, aip)
+setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
struct buf *bp; /* in-memory copy of the indirect block */
struct inode *ip; /* inode for file being extended */
+ struct inodedep *inodedep; /* Inodedep for ip */
struct allocindir *aip; /* allocindir allocated by the above routines */
+ ufs_lbn_t lbn; /* Logical block number for this block. */
{
struct worklist *wk;
+ struct fs *fs;
+ struct newblk *newblk;
struct indirdep *indirdep, *newindirdep;
- struct bmsafemap *bmsafemap;
struct allocindir *oldaip;
struct freefrag *freefrag;
- struct newblk *newblk;
+ struct mount *mp;
ufs2_daddr_t blkno;
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_fs;
mtx_assert(&lk, MA_OWNED);
if (bp->b_lblkno >= 0)
panic("setup_allocindir_phase2: not indir blk");
- for (indirdep = NULL, newindirdep = NULL; ; ) {
+ for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
if (wk->wk_type != D_INDIRDEP)
continue;
@@ -2079,49 +4787,41 @@ setup_allocindir_phase2(bp, ip, aip)
}
if (indirdep == NULL && newindirdep) {
indirdep = newindirdep;
- WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
newindirdep = NULL;
+ WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
+ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
+ &newblk)) {
+ indirdep->ir_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&newblk->nb_indirdeps,
+ indirdep, ir_next);
+ } else
+ indirdep->ir_state |= DEPCOMPLETE;
}
if (indirdep) {
- if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
- &newblk) == 0)
- panic("setup_allocindir: lost block");
- if (newblk->nb_state == DEPCOMPLETE) {
- aip->ai_state |= DEPCOMPLETE;
- aip->ai_buf = NULL;
- } else {
- bmsafemap = newblk->nb_bmsafemap;
- aip->ai_buf = bmsafemap->sm_buf;
- LIST_REMOVE(newblk, nb_deps);
- LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
- aip, ai_deps);
- }
- LIST_REMOVE(newblk, nb_hash);
- free(newblk, M_NEWBLK);
aip->ai_indirdep = indirdep;
/*
* Check to see if there is an existing dependency
* for this block. If there is, merge the old
- * dependency into the new one.
+ * dependency into the new one. This happens
+ * as a result of reallocblks only.
*/
if (aip->ai_oldblkno == 0)
oldaip = NULL;
else
- LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
+ LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
+ ai_next)
if (oldaip->ai_offset == aip->ai_offset)
break;
- freefrag = NULL;
- if (oldaip != NULL) {
- if (oldaip->ai_newblkno != aip->ai_oldblkno)
- panic("setup_allocindir_phase2: blkno");
- aip->ai_oldblkno = oldaip->ai_oldblkno;
- freefrag = aip->ai_freefrag;
- aip->ai_freefrag = oldaip->ai_freefrag;
- oldaip->ai_freefrag = NULL;
- free_allocindir(oldaip, NULL);
- }
+ if (oldaip != NULL)
+ freefrag = allocindir_merge(aip, oldaip);
LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
+ KASSERT(aip->ai_offset >= 0 &&
+ aip->ai_offset < NINDIR(ip->i_ump->um_fs),
+ ("setup_allocindir_phase2: Bad offset %d",
+ aip->ai_offset));
+ KASSERT(indirdep->ir_savebp != NULL,
+ ("setup_allocindir_phase2 NULL ir_savebp"));
if (ip->i_ump->um_fstype == UFS1)
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
[aip->ai_offset] = aip->ai_oldblkno;
@@ -2148,13 +4848,16 @@ setup_allocindir_phase2(bp, ip, aip)
}
newindirdep = malloc(sizeof(struct indirdep),
M_INDIRDEP, M_SOFTDEP_FLAGS);
- workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
- UFSTOVFS(ip->i_ump));
+ workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
newindirdep->ir_state = ATTACHED;
if (ip->i_ump->um_fstype == UFS1)
newindirdep->ir_state |= UFS1FMT;
+ newindirdep->ir_saveddata = NULL;
LIST_INIT(&newindirdep->ir_deplisthd);
LIST_INIT(&newindirdep->ir_donehd);
+ LIST_INIT(&newindirdep->ir_writehd);
+ LIST_INIT(&newindirdep->ir_completehd);
+ LIST_INIT(&newindirdep->ir_jwork);
if (bp->b_blkno == bp->b_lblkno) {
ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
NULL, NULL);
@@ -2169,6 +4872,51 @@ setup_allocindir_phase2(bp, ip, aip)
}
/*
+ * Merge two allocindirs which refer to the same block. Move newblock
+ * dependencies and setup the freefrags appropriately.
+ */
+static struct freefrag *
+allocindir_merge(aip, oldaip)
+ struct allocindir *aip;
+ struct allocindir *oldaip;
+{
+ struct newdirblk *newdirblk;
+ struct freefrag *freefrag;
+ struct worklist *wk;
+
+ if (oldaip->ai_newblkno != aip->ai_oldblkno)
+ panic("allocindir_merge: blkno");
+ aip->ai_oldblkno = oldaip->ai_oldblkno;
+ freefrag = aip->ai_freefrag;
+ aip->ai_freefrag = oldaip->ai_freefrag;
+ oldaip->ai_freefrag = NULL;
+ KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
+ /*
+ * If we are tracking a new directory-block allocation,
+ * move it from the old allocindir to the new allocindir.
+ */
+ if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
+ newdirblk = WK_NEWDIRBLK(wk);
+ WORKLIST_REMOVE(&newdirblk->db_list);
+ if (!LIST_EMPTY(&oldaip->ai_newdirblk))
+ panic("allocindir_merge: extra newdirblk");
+ WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
+ }
+ /*
+ * We can skip journaling for this freefrag and just complete
+ * any pending journal work for the allocindir that is being
+ * removed after the freefrag completes.
+ */
+ if (freefrag->ff_jfreefrag)
+ cancel_jfreefrag(freefrag->ff_jfreefrag);
+ LIST_REMOVE(oldaip, ai_next);
+ cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
+ free_newblk(&oldaip->ai_block);
+
+ return (freefrag);
+}
+
+/*
* Block de-allocation dependencies.
*
* When blocks are de-allocated, the on-disk pointers must be nullified before
@@ -2203,9 +4951,12 @@ softdep_setup_freeblocks(ip, length, flags)
off_t length; /* The new length for the file */
int flags; /* IO_EXT and/or IO_NORMAL */
{
+ struct ufs1_dinode *dp1;
+ struct ufs2_dinode *dp2;
struct freeblks *freeblks;
struct inodedep *inodedep;
struct allocdirect *adp;
+ struct jfreeblk *jfreeblk;
struct bufobj *bo;
struct vnode *vp;
struct buf *bp;
@@ -2213,6 +4964,13 @@ softdep_setup_freeblocks(ip, length, flags)
ufs2_daddr_t extblocks, datablocks;
struct mount *mp;
int i, delay, error;
+ ufs2_daddr_t blkno;
+ ufs_lbn_t tmpval;
+ ufs_lbn_t lbn;
+ long oldextsize;
+ long oldsize;
+ int frags;
+ int needj;
fs = ip->i_fs;
mp = UFSTOVFS(ip->i_ump);
@@ -2221,32 +4979,53 @@ softdep_setup_freeblocks(ip, length, flags)
freeblks = malloc(sizeof(struct freeblks),
M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
+ LIST_INIT(&freeblks->fb_jfreeblkhd);
+ LIST_INIT(&freeblks->fb_jwork);
freeblks->fb_state = ATTACHED;
freeblks->fb_uid = ip->i_uid;
freeblks->fb_previousinum = ip->i_number;
freeblks->fb_devvp = ip->i_devvp;
+ freeblks->fb_chkcnt = 0;
ACQUIRE_LOCK(&lk);
+ /*
+ * If we're truncating a removed file that will never be written,
+ * we don't need to journal the block frees. The canceled journals
+ * for the allocations will suffice.
+ */
+ inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+ if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
+ (fs->fs_flags & FS_SUJ) == 0)
+ needj = 0;
+ else
+ needj = 1;
num_freeblkdep++;
FREE_LOCK(&lk);
extblocks = 0;
if (fs->fs_magic == FS_UFS2_MAGIC)
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
datablocks = DIP(ip, i_blocks) - extblocks;
- if ((flags & IO_NORMAL) == 0) {
- freeblks->fb_oldsize = 0;
- freeblks->fb_chkcnt = 0;
- } else {
- freeblks->fb_oldsize = ip->i_size;
+ if ((flags & IO_NORMAL) != 0) {
+ oldsize = ip->i_size;
ip->i_size = 0;
DIP_SET(ip, i_size, 0);
freeblks->fb_chkcnt = datablocks;
for (i = 0; i < NDADDR; i++) {
- freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
+ blkno = DIP(ip, i_db[i]);
DIP_SET(ip, i_db[i], 0);
+ if (blkno == 0)
+ continue;
+ frags = sblksize(fs, oldsize, i);
+ frags = numfrags(fs, frags);
+ newfreework(freeblks, NULL, i, blkno, frags, needj);
}
- for (i = 0; i < NIADDR; i++) {
- freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
+ for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
+ i++, tmpval *= NINDIR(fs)) {
+ blkno = DIP(ip, i_ib[i]);
DIP_SET(ip, i_ib[i], 0);
+ if (blkno)
+ newfreework(freeblks, NULL, -lbn - i, blkno,
+ fs->fs_frag, needj);
+ lbn += tmpval;
}
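+ /*
+ * Note: each indirect root freed above is recorded under a negative
+ * logical block number, -(lbn + i), where lbn is the first file
+ * block mapped by indirect level i; e.g. with NINDIR(fs) == 4096
+ * the single indirect is entered as -12 and the double indirect
+ * as -4109.
+ */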
/*
* If the file was removed, then the space being freed was
@@ -2259,17 +5038,23 @@ softdep_setup_freeblocks(ip, length, flags)
UFS_UNLOCK(ip->i_ump);
}
}
- if ((flags & IO_EXT) == 0) {
- freeblks->fb_oldextsize = 0;
- } else {
- freeblks->fb_oldextsize = ip->i_din2->di_extsize;
+ if ((flags & IO_EXT) != 0) {
+ oldextsize = ip->i_din2->di_extsize;
ip->i_din2->di_extsize = 0;
freeblks->fb_chkcnt += extblocks;
for (i = 0; i < NXADDR; i++) {
- freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
+ blkno = ip->i_din2->di_extb[i];
ip->i_din2->di_extb[i] = 0;
+ if (blkno == 0)
+ continue;
+ frags = sblksize(fs, oldextsize, i);
+ frags = numfrags(fs, frags);
+ newfreework(freeblks, NULL, -1 - i, blkno, frags,
+ needj);
}
}
+ if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
+ needj = 0;
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
/*
 * Push the zero'ed inode to its disk buffer so that we are free
@@ -2282,12 +5067,17 @@ softdep_setup_freeblocks(ip, length, flags)
brelse(bp);
softdep_error("softdep_setup_freeblocks", error);
}
- if (ip->i_ump->um_fstype == UFS1)
- *((struct ufs1_dinode *)bp->b_data +
- ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
- else
- *((struct ufs2_dinode *)bp->b_data +
- ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
+ if (ip->i_ump->um_fstype == UFS1) {
+ dp1 = ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number));
+ ip->i_din1->di_freelink = dp1->di_freelink;
+ *dp1 = *ip->i_din1;
+ } else {
+ dp2 = ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number));
+ ip->i_din2->di_freelink = dp2->di_freelink;
+ *dp2 = *ip->i_din2;
+ }
/*
* Find and eliminate any inode dependencies.
*/
@@ -2304,7 +5094,9 @@ softdep_setup_freeblocks(ip, length, flags)
*/
delay = (inodedep->id_state & DEPCOMPLETE);
if (delay)
- WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
+ WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
+ else if (needj)
+ freeblks->fb_state |= DEPCOMPLETE | COMPLETE;
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
@@ -2318,14 +5110,19 @@ softdep_setup_freeblocks(ip, length, flags)
merge_inode_lists(&inodedep->id_newinoupdt,
&inodedep->id_inoupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
- free_allocdirect(&inodedep->id_inoupdt, adp, delay);
+ cancel_allocdirect(&inodedep->id_inoupdt, adp,
+ freeblks, delay);
}
if (flags & IO_EXT) {
merge_inode_lists(&inodedep->id_newextupdt,
&inodedep->id_extupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
- free_allocdirect(&inodedep->id_extupdt, adp, delay);
+ cancel_allocdirect(&inodedep->id_extupdt, adp,
+ freeblks, delay);
}
+ LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
+ add_to_journal(&jfreeblk->jf_list);
+
FREE_LOCK(&lk);
bdwrite(bp);
/*
@@ -2349,9 +5146,9 @@ restart:
BO_UNLOCK(bo);
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
- deallocate_dependencies(bp, inodedep);
+ if (deallocate_dependencies(bp, inodedep, freeblks))
+ bp->b_flags |= B_INVAL | B_NOCACHE;
FREE_LOCK(&lk);
- bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
BO_LOCK(bo);
goto restart;
@@ -2361,7 +5158,7 @@ restart:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
(void) free_inodedep(inodedep);
- if(delay) {
+ if (delay) {
freeblks->fb_state |= DEPCOMPLETE;
/*
* If the inode with zeroed block pointers is now on disk
@@ -2371,16 +5168,16 @@ restart:
* the request here than in the !delay case.
*/
if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
- add_to_worklist(&freeblks->fb_list);
+ add_to_worklist(&freeblks->fb_list, 1);
}
FREE_LOCK(&lk);
/*
- * If the inode has never been written to disk (delay == 0),
- * then we can process the freeblks now that we have deleted
- * the dependencies.
+ * If the inode has never been written to disk (delay == 0) and
+ * we're not waiting on any journal writes, then we can process the
+ * freeblks now that we have deleted the dependencies.
*/
- if (!delay)
+ if (!delay && !needj)
handle_workitem_freeblocks(freeblks, 0);
}
@@ -2389,19 +5186,23 @@ restart:
* be reallocated to a new vnode. The buffer must be locked, thus,
* no I/O completion operations can occur while we are manipulating
* its associated dependencies. The mutex is held so that other I/O's
- * associated with related dependencies do not occur.
+ * associated with related dependencies do not occur. Returns 1 if
+ * all dependencies were cleared, 0 otherwise.
*/
-static void
-deallocate_dependencies(bp, inodedep)
+static int
+deallocate_dependencies(bp, inodedep, freeblks)
struct buf *bp;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
{
struct worklist *wk;
struct indirdep *indirdep;
+ struct newdirblk *newdirblk;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct jremref *jremref;
+ struct jmvref *jmvref;
struct dirrem *dirrem;
- struct diradd *dap;
int i;
mtx_assert(&lk, MA_OWNED);
@@ -2410,47 +5211,24 @@ deallocate_dependencies(bp, inodedep)
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
- /*
- * None of the indirect pointers will ever be visible,
- * so they can simply be tossed. GOINGAWAY ensures
- * that allocated pointers will be saved in the buffer
- * cache until they are freed. Note that they will
- * only be able to be found by their physical address
- * since the inode mapping the logical address will
- * be gone. The save buffer used for the safe copy
- * was allocated in setup_allocindir_phase2 using
- * the physical address so it could be used for this
- * purpose. Hence we swap the safe copy with the real
- * copy, allowing the safe copy to be freed and holding
- * on to the real copy for later use in indir_trunc.
- */
- if (indirdep->ir_state & GOINGAWAY)
- panic("deallocate_dependencies: already gone");
- indirdep->ir_state |= GOINGAWAY;
- VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
- while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
- free_allocindir(aip, inodedep);
if (bp->b_lblkno >= 0 ||
bp->b_blkno != indirdep->ir_savebp->b_lblkno)
panic("deallocate_dependencies: not indir");
- bcopy(bp->b_data, indirdep->ir_savebp->b_data,
- bp->b_bcount);
- WORKLIST_REMOVE(wk);
- WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
+ cancel_indirdep(indirdep, bp, inodedep, freeblks);
continue;
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
/*
- * None of the directory additions will ever be
- * visible, so they can simply be tossed.
+ * There should be no directory add dependencies present
+ * as the directory could not be truncated until all
+ * children were removed.
*/
+ KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
+ ("deallocate_dependencies: pendinghd != NULL"));
for (i = 0; i < DAHASHSZ; i++)
- while ((dap =
- LIST_FIRST(&pagedep->pd_diraddhd[i])))
- free_diradd(dap);
- while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
- free_diradd(dap);
+ KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
+ ("deallocate_dependencies: diraddhd != NULL"));
/*
* Copy any directory remove dependencies to the list
* to be processed after the zero'ed inode is written.
@@ -2458,28 +5236,40 @@ deallocate_dependencies(bp, inodedep)
* can be dumped directly onto the work list.
*/
LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
+ /*
+ * If there are any dirrems we wait for
+ * the journal write to complete and
+ * then restart the buf scan as the lock
+ * has been dropped.
+ */
+ while ((jremref =
+ LIST_FIRST(&dirrem->dm_jremrefhd))
+ != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jremref->jr_list);
+ return (0);
+ }
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
if (inodedep == NULL ||
(inodedep->id_state & ALLCOMPLETE) ==
- ALLCOMPLETE)
- add_to_worklist(&dirrem->dm_list);
- else
+ ALLCOMPLETE) {
+ dirrem->dm_state |= COMPLETE;
+ add_to_worklist(&dirrem->dm_list, 0);
+ } else
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
if ((pagedep->pd_state & NEWBLOCK) != 0) {
- LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
- if (wk->wk_type == D_NEWDIRBLK &&
- WK_NEWDIRBLK(wk)->db_pagedep ==
- pagedep)
- break;
- if (wk != NULL) {
- WORKLIST_REMOVE(wk);
- free_newdirblk(WK_NEWDIRBLK(wk));
- } else
- panic("deallocate_dependencies: "
- "lost pagedep");
+ newdirblk = pagedep->pd_newdirblk;
+ WORKLIST_REMOVE(&newdirblk->db_list);
+ free_newdirblk(newdirblk);
+ }
+ while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
+ != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jmvref->jm_list);
+ return (0);
}
WORKLIST_REMOVE(&pagedep->pd_list);
LIST_REMOVE(pagedep, pd_hash);
@@ -2487,7 +5277,8 @@ deallocate_dependencies(bp, inodedep)
continue;
case D_ALLOCINDIR:
- free_allocindir(WK_ALLOCINDIR(wk), inodedep);
+ aip = WK_ALLOCINDIR(wk);
+ cancel_allocindir(aip, inodedep, freeblks);
continue;
case D_ALLOCDIRECT:
@@ -2502,46 +5293,155 @@ deallocate_dependencies(bp, inodedep)
/* NOTREACHED */
}
}
+
+ return (1);
}
/*
- * Free an allocdirect. Generate a new freefrag work request if appropriate.
- * This routine must be called with splbio interrupts blocked.
+ * An allocdirect is being canceled due to a truncate. We must make sure
+ * the journal entry is released in concert with the blkfree that releases
+ * the storage. Completed journal entries must not be released until the
+ * space is no longer pointed to by the inode or in the bitmap.
*/
static void
-free_allocdirect(adphead, adp, delay)
+cancel_allocdirect(adphead, adp, freeblks, delay)
struct allocdirectlst *adphead;
struct allocdirect *adp;
+ struct freeblks *freeblks;
int delay;
{
+ struct freework *freework;
+ struct newblk *newblk;
+ struct worklist *wk;
+ ufs_lbn_t lbn;
+
+ TAILQ_REMOVE(adphead, adp, ad_next);
+ newblk = (struct newblk *)adp;
+ /*
+	 * If the journal hasn't been written, the jnewblk must be passed
+	 * to the call to ffs_blkfree() that reclaims the space.  We accomplish
+ * this by linking the journal dependency into the freework to be
+ * freed when freework_freeblock() is called. If the journal has
+ * been written we can simply reclaim the journal space when the
+ * freeblks work is complete.
+ */
+ if (newblk->nb_jnewblk == NULL) {
+ cancel_newblk(newblk, &freeblks->fb_jwork);
+ goto found;
+ }
+ lbn = newblk->nb_jnewblk->jn_lbn;
+ /*
+ * Find the correct freework structure so it releases the canceled
+ * journal when the bitmap is cleared. This preserves rollback
+ * until the allocation is reverted.
+ */
+ LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
+ freework = WK_FREEWORK(wk);
+ if (freework->fw_lbn != lbn)
+ continue;
+ cancel_newblk(newblk, &freework->fw_jwork);
+ goto found;
+ }
+ panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
+found:
+ if (delay)
+ WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
+ &newblk->nb_list);
+ else
+ free_newblk(newblk);
+ return;
+}
+
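+/*
+ * Cancel a newblk whose allocation is being reverted by a truncate.
+ * Unwritten journal dependencies are moved to wkhd so they are freed
+ * in concert with the bitmap write, and the newblk is stripped from
+ * its dependency and work lists.
+ */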
+static void
+cancel_newblk(newblk, wkhd)
+ struct newblk *newblk;
+ struct workhead *wkhd;
+{
+ struct indirdep *indirdep;
+ struct allocindir *aip;
+
+ while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
+ indirdep->ir_state &= ~ONDEPLIST;
+ LIST_REMOVE(indirdep, ir_next);
+ /*
+ * If an indirdep is not on the buf worklist we need to
+ * free it here as deallocate_dependencies() will never
+ * find it. These pointers were never visible on disk and
+ * can be discarded immediately.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
+ LIST_REMOVE(aip, ai_next);
+ cancel_newblk(&aip->ai_block, wkhd);
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * If this indirdep is not attached to a buf it was simply
+ * waiting on completion to clear completehd. free_indirdep()
+ * asserts that nothing is dangling.
+ */
+ if ((indirdep->ir_state & ONWORKLIST) == 0)
+ free_indirdep(indirdep);
+ }
+ if (newblk->nb_state & ONDEPLIST) {
+ newblk->nb_state &= ~ONDEPLIST;
+ LIST_REMOVE(newblk, nb_deps);
+ }
+ if (newblk->nb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&newblk->nb_list);
+ /*
+ * If the journal entry hasn't been written we hold onto the dep
+ * until it is safe to free along with the other journal work.
+ */
+ if (newblk->nb_jnewblk != NULL) {
+ cancel_jnewblk(newblk->nb_jnewblk, wkhd);
+ newblk->nb_jnewblk = NULL;
+ }
+ if (!LIST_EMPTY(&newblk->nb_jwork))
+ jwork_move(wkhd, &newblk->nb_jwork);
+}
+
+/*
+ * Free a newblk. Generate a new freefrag work request if appropriate.
+ * This must be called after the inode pointer and any direct block pointers
+ * are valid or fully removed via truncate or frag extension.
+ */
+static void
+free_newblk(newblk)
+ struct newblk *newblk;
+{
+ struct indirdep *indirdep;
struct newdirblk *newdirblk;
+ struct freefrag *freefrag;
struct worklist *wk;
mtx_assert(&lk, MA_OWNED);
- if ((adp->ad_state & DEPCOMPLETE) == 0)
- LIST_REMOVE(adp, ad_deps);
- TAILQ_REMOVE(adphead, adp, ad_next);
- if ((adp->ad_state & COMPLETE) == 0)
- WORKLIST_REMOVE(&adp->ad_list);
- if (adp->ad_freefrag != NULL) {
- if (delay)
- WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
- &adp->ad_freefrag->ff_list);
- else
- add_to_worklist(&adp->ad_freefrag->ff_list);
+ if (newblk->nb_state & ONDEPLIST)
+ LIST_REMOVE(newblk, nb_deps);
+ if (newblk->nb_state & ONWORKLIST)
+ WORKLIST_REMOVE(&newblk->nb_list);
+ LIST_REMOVE(newblk, nb_hash);
+ if ((freefrag = newblk->nb_freefrag) != NULL) {
+ freefrag->ff_state |= COMPLETE;
+ if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(&freefrag->ff_list, 0);
}
- if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
+ if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
- if (!LIST_EMPTY(&adp->ad_newdirblk))
- panic("free_allocdirect: extra newdirblk");
- if (delay)
- WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
- &newdirblk->db_list);
- else
- free_newdirblk(newdirblk);
- }
- WORKITEM_FREE(adp, D_ALLOCDIRECT);
+ if (!LIST_EMPTY(&newblk->nb_newdirblk))
+ panic("free_newblk: extra newdirblk");
+ free_newdirblk(newdirblk);
+ }
+ while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
+ indirdep->ir_state |= DEPCOMPLETE;
+ indirdep_complete(indirdep);
+ }
+ KASSERT(newblk->nb_jnewblk == NULL,
+ ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
+ handle_jwork(&newblk->nb_jwork);
+ newblk->nb_list.wk_type = D_NEWBLK;
+ WORKITEM_FREE(newblk, D_NEWBLK);
}
/*
@@ -2554,6 +5454,7 @@ free_newdirblk(newdirblk)
{
struct pagedep *pagedep;
struct diradd *dap;
+ struct worklist *wk;
int i;
mtx_assert(&lk, MA_OWNED);
@@ -2571,17 +5472,25 @@ free_newdirblk(newdirblk)
pagedep->pd_state &= ~NEWBLOCK;
if ((pagedep->pd_state & ONWORKLIST) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
- free_diradd(dap);
+ free_diradd(dap, NULL);
/*
* If no dependencies remain, the pagedep will be freed.
*/
for (i = 0; i < DAHASHSZ; i++)
if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
break;
- if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
+ if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
+ LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
+ KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
+ ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
+ /* Should only ever be one item in the list. */
+ while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
+ }
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}
@@ -2608,6 +5517,7 @@ softdep_freefile(pvp, ino, mode)
freefile->fx_mode = mode;
freefile->fx_oldinum = ino;
freefile->fx_devvp = ip->i_devvp;
+ LIST_INIT(&freefile->fx_jwork);
if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
UFS_LOCK(ip->i_ump);
ip->i_fs->fs_pendinginodes += 1;
@@ -2618,11 +5528,34 @@ softdep_freefile(pvp, ino, mode)
* If the inodedep does not exist, then the zero'ed inode has
* been written to disk. If the allocated inode has never been
* written to disk, then the on-disk inode is zero'ed. In either
- * case we can free the file immediately.
+ * case we can free the file immediately. If the journal was
+	 * canceled before being written, the inode will never make it to
+	 * disk and we must send the canceled journal entries to
+	 * ffs_freefile() to be cleared in conjunction with the bitmap.
+	 * Any blocks waiting on the inode to write can be safely freed
+	 * here as it will never be written.
*/
ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
- check_inode_unwritten(inodedep)) {
+ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+ /*
+ * Remove this inode from the unlinked list and set
+ * GOINGAWAY as appropriate to indicate that this inode
+ * will never be written.
+ */
+ if (inodedep && inodedep->id_state & UNLINKED) {
+ /*
+ * Save the journal work to be freed with the bitmap
+ * before we clear UNLINKED. Otherwise it can be lost
+ * if the inode block is written.
+ */
+ handle_bufwait(inodedep, &freefile->fx_jwork);
+ clear_unlinked_inodedep(inodedep);
+ /* Re-acquire inodedep as we've dropped lk. */
+ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
+ if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
+ inodedep->id_state |= GOINGAWAY;
+ }
+ if (inodedep == NULL || check_inode_unwritten(inodedep)) {
FREE_LOCK(&lk);
handle_workitem_freefile(freefile);
return;
@@ -2654,7 +5587,8 @@ check_inode_unwritten(inodedep)
{
mtx_assert(&lk, MA_OWNED);
- if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
+
+ if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
!LIST_EMPTY(&inodedep->id_pendinghd) ||
!LIST_EMPTY(&inodedep->id_bufwait) ||
!LIST_EMPTY(&inodedep->id_inowait) ||
@@ -2662,9 +5596,9 @@ check_inode_unwritten(inodedep)
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
+ inodedep->id_mkdiradd != NULL ||
inodedep->id_nlinkdelta != 0)
return (0);
-
/*
* Another process might be in initiate_write_inodeblock_ufs[12]
* trying to allocate memory without holding "Softdep Lock".
@@ -2673,9 +5607,11 @@ check_inode_unwritten(inodedep)
inodedep->id_savedino1 == NULL)
return (0);
+ if (inodedep->id_state & ONDEPLIST)
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_state &= ~ONDEPLIST;
inodedep->id_state |= ALLCOMPLETE;
- LIST_REMOVE(inodedep, id_deps);
- inodedep->id_buf = NULL;
+ inodedep->id_bmsafemap = NULL;
if (inodedep->id_state & ONWORKLIST)
WORKLIST_REMOVE(&inodedep->id_list);
if (inodedep->id_savedino1 != NULL) {
@@ -2696,17 +5632,23 @@ free_inodedep(inodedep)
{
mtx_assert(&lk, MA_OWNED);
- if ((inodedep->id_state & ONWORKLIST) != 0 ||
+ if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
+ !LIST_EMPTY(&inodedep->id_dirremhd) ||
!LIST_EMPTY(&inodedep->id_pendinghd) ||
!LIST_EMPTY(&inodedep->id_bufwait) ||
!LIST_EMPTY(&inodedep->id_inowait) ||
+ !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
- inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
+ inodedep->id_mkdiradd != NULL ||
+ inodedep->id_nlinkdelta != 0 ||
+ inodedep->id_savedino1 != NULL)
return (0);
+ if (inodedep->id_state & ONDEPLIST)
+ LIST_REMOVE(inodedep, id_deps);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
@@ -2714,6 +5656,126 @@ free_inodedep(inodedep)
}
/*
+ * Free the block referenced by a freework structure. The parent freeblks
+ * structure is released and completed when the final cg bitmap reaches
+ * the disk. This routine may be freeing a jnewblk which never made it to
+ * disk in which case we do not have to wait as the operation is undone
+ * in memory immediately.
+ */
+static void
+freework_freeblock(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct ufsmount *ump;
+ struct workhead wkhd;
+ struct fs *fs;
+ int complete;
+ int pending;
+ int bsize;
+ int needj;
+
+ freeblks = freework->fw_freeblks;
+ ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+ fs = ump->um_fs;
+ needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
+ complete = 0;
+ LIST_INIT(&wkhd);
+ /*
+ * If we are canceling an existing jnewblk pass it to the free
+ * routine, otherwise pass the freeblk which will ultimately
+ * release the freeblks. If we're not journaling, we can just
+ * free the freeblks immediately.
+ */
+ if (!LIST_EMPTY(&freework->fw_jwork)) {
+ LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
+ complete = 1;
+ } else if (needj)
+ WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
+ bsize = lfragtosize(fs, freework->fw_frags);
+ pending = btodb(bsize);
+ ACQUIRE_LOCK(&lk);
+ freeblks->fb_chkcnt -= pending;
+ FREE_LOCK(&lk);
+ /*
+ * extattr blocks don't show up in pending blocks. XXX why?
+ */
+ if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks -= pending;
+ UFS_UNLOCK(ump);
+ }
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
+ bsize, freeblks->fb_previousinum, &wkhd);
+ if (complete == 0 && needj)
+ return;
+ /*
+ * The jnewblk will be discarded and the bits in the map never
+ * made it to disk. We can immediately free the freeblk.
+ */
+ ACQUIRE_LOCK(&lk);
+ handle_written_freework(freework);
+ FREE_LOCK(&lk);
+}
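The LIST_SWAP() at the top of freework_freeblock() hands the entire fw_jwork
list to the local head in O(1) so the canceled journal records travel with the
ffs_blkfree() call. A minimal userland sketch of that handoff using the same
queue(3) macros (the item type here is invented for illustration):

    #include <stdio.h>
    #include <sys/queue.h>  /* queue(3); LIST_SWAP is a FreeBSD extension */

    struct item {
        LIST_ENTRY(item) links;
        int id;
    };
    LIST_HEAD(itemhead, item);

    int
    main(void)
    {
        struct itemhead jwork, wkhd;
        struct item a = { .id = 1 };
        struct item *ip;

        LIST_INIT(&jwork);
        LIST_INIT(&wkhd);
        LIST_INSERT_HEAD(&jwork, &a, links);
        /* Take over the whole pending list in constant time. */
        LIST_SWAP(&wkhd, &jwork, item, links);
        LIST_FOREACH(ip, &wkhd, links)
            printf("moved item %d\n", ip->id);
        printf("source now empty: %d\n", LIST_EMPTY(&jwork));
        return (0);
    }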
+
+/*
+ * Start, continue, or finish the process of freeing an indirect block tree.
+ * The free operation may be paused at any point with fw_off containing the
+ * offset to restart from. This enables us to implement some flow control
+ * for large truncates which may fan out and generate a huge number of
+ * dependencies.
+ */
+static void
+handle_workitem_indirblk(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct ufsmount *ump;
+ struct fs *fs;
+
+ freeblks = freework->fw_freeblks;
+ ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+ fs = ump->um_fs;
+ if (freework->fw_off == NINDIR(fs))
+ freework_freeblock(freework);
+ else
+ indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
+ freework->fw_lbn);
+}
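fw_off is what lets handle_workitem_indirblk() tell a finished indirect
(offset at NINDIR(fs)) from one whose scan should resume. A toy resumable
scan under the same convention (NINDIR fixed at a small constant, names
invented for the sketch):

    #include <stdio.h>

    #define NINDIR  8   /* pointers per indirect block; toy value */

    struct freework { int fw_off; };

    /* Free up to 'budget' entries, remembering where to resume. */
    static int
    indir_scan(struct freework *fw, int budget)
    {
        while (fw->fw_off < NINDIR && budget-- > 0)
            printf("freeing entry %d\n", fw->fw_off++);
        return (fw->fw_off == NINDIR); /* done: free the indirect itself */
    }

    int
    main(void)
    {
        struct freework fw = { 0 };

        while (!indir_scan(&fw, 3))
            printf("paused at offset %d\n", fw.fw_off);
        return (0);
    }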
+
+/*
+ * Called when a freework structure attached to a cg buf is written. The
+ * ref on either the parent or the freeblks structure is released and
+ * either may be added to the worklist if it is the final ref.
+ */
+static void
+handle_written_freework(freework)
+ struct freework *freework;
+{
+ struct freeblks *freeblks;
+ struct freework *parent;
+
+ freeblks = freework->fw_freeblks;
+ parent = freework->fw_parent;
+ if (parent) {
+ if (--parent->fw_ref != 0)
+ parent = NULL;
+ freeblks = NULL;
+ } else if (--freeblks->fb_ref != 0)
+ freeblks = NULL;
+ WORKITEM_FREE(freework, D_FREEWORK);
+ /*
+ * Don't delay these block frees or it takes an intolerable amount
+ * of time to process truncates and free their journal entries.
+ */
+ if (freeblks)
+ add_to_worklist(&freeblks->fb_list, 1);
+ if (parent)
+ add_to_worklist(&parent->fw_list, 1);
+}
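The reference handoff above is small enough to model in isolation: every
completed child drops one reference on its parent (or on the freeblks when
there is no parent), and only the dropper of the last reference re-queues
the owner. A sketch with hypothetical types:

    #include <stdio.h>

    struct owner {
        int ref;
        const char *name;
    };

    /* Drop one reference; the last dropper queues the owner. */
    static void
    drop_ref(struct owner *o)
    {
        if (--o->ref == 0)
            printf("%s: final ref, adding to worklist\n", o->name);
    }

    int
    main(void)
    {
        struct owner parent = { 3, "parent freework" };
        int i;

        for (i = 0; i < 3; i++) /* three children written */
            drop_ref(&parent);
        return (0);
    }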
+
+/*
* This workitem routine performs the block de-allocation.
* The workitem is added to the pending list after the updated
* inode block has been written to disk. As mentioned above,
@@ -2726,99 +5788,79 @@ handle_workitem_freeblocks(freeblks, flags)
struct freeblks *freeblks;
int flags;
{
+ struct freework *freework;
+ struct worklist *wk;
+
+ KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
+ ("handle_workitem_freeblocks: Journal entries not written."));
+ if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
+ handle_complete_freeblocks(freeblks);
+ return;
+ }
+ freeblks->fb_ref++;
+ while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
+ KASSERT(wk->wk_type == D_FREEWORK,
+ ("handle_workitem_freeblocks: Unknown type %s",
+ TYPENAME(wk->wk_type)));
+ WORKLIST_REMOVE_UNLOCKED(wk);
+ freework = WK_FREEWORK(wk);
+ if (freework->fw_lbn <= -NDADDR)
+ handle_workitem_indirblk(freework);
+ else
+ freework_freeblock(freework);
+ }
+ ACQUIRE_LOCK(&lk);
+ if (--freeblks->fb_ref != 0)
+ freeblks = NULL;
+ FREE_LOCK(&lk);
+ if (freeblks)
+ handle_complete_freeblocks(freeblks);
+}
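The dispatch above keys off the sign conventions UFS uses for logical block
numbers: non-negative lbns are data, lbns at or below -NDADDR name indirect
blocks, and the small negative range in between carries the extended-attribute
blocks that the accounting test in freework_freeblock() excludes from the
pending count. A toy classifier under those assumptions:

    #include <stdio.h>

    #define NDADDR  12  /* direct pointers per inode (UFS value) */

    static const char *
    classify(long lbn)
    {
        if (lbn >= 0)
            return ("data block");
        if (lbn <= -NDADDR)
            return ("indirect block");
        return ("extended-attribute range");
    }

    int
    main(void)
    {
        long lbns[] = { 0, 11, -1, -NDADDR, -150 };
        size_t i;

        for (i = 0; i < sizeof(lbns) / sizeof(lbns[0]); i++)
            printf("%ld: %s\n", lbns[i], classify(lbns[i]));
        return (0);
    }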
+
+/*
+ * Once all of the freework workitems are complete we can retire the
+ * freeblocks dependency and any journal work awaiting completion. This
+ * cannot be called until all other dependencies are stable on disk.
+ */
+static void
+handle_complete_freeblocks(freeblks)
+ struct freeblks *freeblks;
+{
struct inode *ip;
struct vnode *vp;
struct fs *fs;
struct ufsmount *ump;
- int i, nblocks, level, bsize;
- ufs2_daddr_t bn, blocksreleased = 0;
- int error, allerror = 0;
- ufs_lbn_t baselbns[NIADDR], tmpval;
- int fs_pendingblocks;
+ int flags;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
fs = ump->um_fs;
- fs_pendingblocks = 0;
- tmpval = 1;
- baselbns[0] = NDADDR;
- for (i = 1; i < NIADDR; i++) {
- tmpval *= NINDIR(fs);
- baselbns[i] = baselbns[i - 1] + tmpval;
- }
- nblocks = btodb(fs->fs_bsize);
- blocksreleased = 0;
- /*
- * Release all extended attribute blocks or frags.
- */
- if (freeblks->fb_oldextsize > 0) {
- for (i = (NXADDR - 1); i >= 0; i--) {
- if ((bn = freeblks->fb_eblks[i]) == 0)
- continue;
- bsize = sblksize(fs, freeblks->fb_oldextsize, i);
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
- freeblks->fb_previousinum);
- blocksreleased += btodb(bsize);
- }
- }
- /*
- * Release all data blocks or frags.
- */
- if (freeblks->fb_oldsize > 0) {
- /*
- * Indirect blocks first.
- */
- for (level = (NIADDR - 1); level >= 0; level--) {
- if ((bn = freeblks->fb_iblks[level]) == 0)
- continue;
- if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
- level, baselbns[level], &blocksreleased)) != 0)
- allerror = error;
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
- fs->fs_bsize, freeblks->fb_previousinum);
- fs_pendingblocks += nblocks;
- blocksreleased += nblocks;
- }
- /*
- * All direct blocks or frags.
- */
- for (i = (NDADDR - 1); i >= 0; i--) {
- if ((bn = freeblks->fb_dblks[i]) == 0)
- continue;
- bsize = sblksize(fs, freeblks->fb_oldsize, i);
- ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
- freeblks->fb_previousinum);
- fs_pendingblocks += btodb(bsize);
- blocksreleased += btodb(bsize);
- }
- }
- UFS_LOCK(ump);
- fs->fs_pendingblocks -= fs_pendingblocks;
- UFS_UNLOCK(ump);
+ flags = LK_NOWAIT;
+
/*
* If we still have not finished background cleanup, then check
* to see if the block count needs to be adjusted.
*/
- if (freeblks->fb_chkcnt != blocksreleased &&
- (fs->fs_flags & FS_UNCLEAN) != 0 &&
+ if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
- (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)
- == 0) {
+ (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
ip = VTOI(vp);
- DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
- freeblks->fb_chkcnt - blocksreleased);
+ DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
ip->i_flag |= IN_CHANGE;
vput(vp);
}
#ifdef INVARIANTS
- if (freeblks->fb_chkcnt != blocksreleased &&
+ if (freeblks->fb_chkcnt != 0 &&
((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
printf("handle_workitem_freeblocks: block count\n");
- if (allerror)
- softdep_error("handle_workitem_freeblks", allerror);
#endif /* INVARIANTS */
ACQUIRE_LOCK(&lk);
+ /*
+ * All of the freeblock deps must be complete prior to this call
+ * so it's now safe to complete earlier outstanding journal entries.
+ */
+ handle_jwork(&freeblks->fb_jwork);
WORKITEM_FREE(freeblks, D_FREEBLKS);
num_freeblkdep--;
FREE_LOCK(&lk);
@@ -2830,29 +5872,42 @@ handle_workitem_freeblocks(freeblks, flags)
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*/
-static int
-indir_trunc(freeblks, dbn, level, lbn, countp)
- struct freeblks *freeblks;
+static void
+indir_trunc(freework, dbn, lbn)
+ struct freework *freework;
ufs2_daddr_t dbn;
- int level;
ufs_lbn_t lbn;
- ufs2_daddr_t *countp;
{
+ struct freework *nfreework;
+ struct workhead wkhd;
+ struct jnewblk *jnewblk;
+ struct freeblks *freeblks;
struct buf *bp;
struct fs *fs;
+ struct worklist *wkn;
struct worklist *wk;
struct indirdep *indirdep;
struct ufsmount *ump;
ufs1_daddr_t *bap1 = 0;
- ufs2_daddr_t nb, *bap2 = 0;
+ ufs2_daddr_t nb, nnb, *bap2 = 0;
ufs_lbn_t lbnadd;
int i, nblocks, ufs1fmt;
- int error, allerror = 0;
int fs_pendingblocks;
+ int freedeps;
+ int needj;
+ int level;
+ int cnt;
+ LIST_INIT(&wkhd);
+ level = lbn_level(lbn);
+ if (level == -1)
+ panic("indir_trunc: Invalid lbn %jd\n", lbn);
+ freeblks = freework->fw_freeblks;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
fs = ump->um_fs;
fs_pendingblocks = 0;
+ freedeps = 0;
+ needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
lbnadd = 1;
for (i = level; i > 0; i--)
lbnadd *= NINDIR(fs);
@@ -2877,13 +5932,14 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
ACQUIRE_LOCK(&lk);
if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
if (wk->wk_type != D_INDIRDEP ||
- (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
- (indirdep->ir_state & GOINGAWAY) == 0)
- panic("indir_trunc: lost indirdep");
- WORKLIST_REMOVE(wk);
- WORKITEM_FREE(indirdep, D_INDIRDEP);
+ (wk->wk_state & GOINGAWAY) == 0)
+ panic("indir_trunc: lost indirdep %p", wk);
+ indirdep = WK_INDIRDEP(wk);
+ LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
+ free_indirdep(indirdep);
if (!LIST_EMPTY(&bp->b_dep))
- panic("indir_trunc: dangling dep");
+ panic("indir_trunc: dangling dep %p",
+ LIST_FIRST(&bp->b_dep));
ump->um_numindirdeps -= 1;
FREE_LOCK(&lk);
} else {
@@ -2892,11 +5948,10 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
brelse(bp);
#endif
FREE_LOCK(&lk);
- error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
- NOCRED, &bp);
- if (error) {
+ if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
+ NOCRED, &bp) != 0) {
brelse(bp);
- return (error);
+ return;
}
}
/*
@@ -2909,57 +5964,264 @@ indir_trunc(freeblks, dbn, level, lbn, countp)
ufs1fmt = 0;
bap2 = (ufs2_daddr_t *)bp->b_data;
}
- nblocks = btodb(fs->fs_bsize);
- for (i = NINDIR(fs) - 1; i >= 0; i--) {
- if (ufs1fmt)
- nb = bap1[i];
+ /*
+ * Reclaim indirect blocks which never made it to disk.
+ */
+ cnt = 0;
+ LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
+ struct workhead freewk;
+ if (wk->wk_type != D_JNEWBLK)
+ continue;
+ WORKLIST_REMOVE_UNLOCKED(wk);
+ LIST_INIT(&freewk);
+ WORKLIST_INSERT_UNLOCKED(&freewk, wk);
+ jnewblk = WK_JNEWBLK(wk);
+ if (jnewblk->jn_lbn > 0)
+ i = (jnewblk->jn_lbn - -lbn) / lbnadd;
else
+ i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd;
+ KASSERT(i >= 0 && i < NINDIR(fs),
+ ("indir_trunc: Index out of range %d parent %jd lbn %jd",
+ i, lbn, jnewblk->jn_lbn));
+ /* Clear the pointer so it isn't found below. */
+ if (ufs1fmt) {
+ nb = bap1[i];
+ bap1[i] = 0;
+ } else {
nb = bap2[i];
+ bap2[i] = 0;
+ }
+ KASSERT(nb == jnewblk->jn_blkno,
+ ("indir_trunc: Block mismatch %jd != %jd",
+ nb, jnewblk->jn_blkno));
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno,
+ fs->fs_bsize, freeblks->fb_previousinum, &freewk);
+ cnt++;
+ }
+ ACQUIRE_LOCK(&lk);
+ if (needj)
+ freework->fw_ref += NINDIR(fs) + 1;
+ /* Any remaining journal work can be completed with freeblks. */
+ jwork_move(&freeblks->fb_jwork, &wkhd);
+ FREE_LOCK(&lk);
+ nblocks = btodb(fs->fs_bsize);
+ if (ufs1fmt)
+ nb = bap1[0];
+ else
+ nb = bap2[0];
+ nfreework = freework;
+ /*
+ * Reclaim on disk blocks.
+ */
+ for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
+ if (i != NINDIR(fs) - 1) {
+ if (ufs1fmt)
+ nnb = bap1[i+1];
+ else
+ nnb = bap2[i+1];
+ } else
+ nnb = 0;
if (nb == 0)
continue;
+ cnt++;
if (level != 0) {
- if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
- level - 1, lbn + (i * lbnadd), countp)) != 0)
- allerror = error;
+ ufs_lbn_t nlbn;
+
+ nlbn = (lbn + 1) - (i * lbnadd);
+ if (needj != 0) {
+ nfreework = newfreework(freeblks, freework,
+ nlbn, nb, fs->fs_frag, 0);
+ freedeps++;
+ }
+ indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
+ } else {
+ struct freedep *freedep;
+
+ /*
+ * Attempt to aggregate freedep dependencies for
+ * all blocks being released to the same CG.
+ */
+ LIST_INIT(&wkhd);
+ if (needj != 0 &&
+ (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
+ freedep = newfreedep(freework);
+ WORKLIST_INSERT_UNLOCKED(&wkhd,
+ &freedep->fd_list);
+ freedeps++;
+ }
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
+ fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
}
- ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
- freeblks->fb_previousinum);
+ }
+ if (level == 0)
+ fs_pendingblocks = (nblocks * cnt);
+ /*
+ * If we're not journaling we can free the indirect now. Otherwise
+ * setup the ref counts and offset so this indirect can be completed
+ * when its children are free.
+ */
+ if (needj == 0) {
fs_pendingblocks += nblocks;
- *countp += nblocks;
+ dbn = dbtofsb(fs, dbn);
+ ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
+ freeblks->fb_previousinum, NULL);
+ ACQUIRE_LOCK(&lk);
+ freeblks->fb_chkcnt -= fs_pendingblocks;
+ if (freework->fw_blkno == dbn)
+ handle_written_freework(freework);
+ FREE_LOCK(&lk);
+ freework = NULL;
+ } else {
+ ACQUIRE_LOCK(&lk);
+ freework->fw_off = i;
+ freework->fw_ref += freedeps;
+ freework->fw_ref -= NINDIR(fs) + 1;
+ if (freework->fw_ref != 0)
+ freework = NULL;
+ freeblks->fb_chkcnt -= fs_pendingblocks;
+ FREE_LOCK(&lk);
+ }
+ if (fs_pendingblocks) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks -= fs_pendingblocks;
+ UFS_UNLOCK(ump);
}
- UFS_LOCK(ump);
- fs->fs_pendingblocks -= fs_pendingblocks;
- UFS_UNLOCK(ump);
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
- return (allerror);
+ if (freework)
+ handle_workitem_indirblk(freework);
+ return;
}
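The level/lbnadd arithmetic that indir_trunc() relies on is the fan-out of
the indirect tree: each pointer in a level-L indirect block spans
NINDIR(fs)^L data blocks, which is exactly what the small loop computing
lbnadd produces. A standalone check of that relation (NINDIR fixed here;
the kernel derives it from the superblock):

    #include <stdio.h>

    #define NINDIR  2048    /* 16K blocks / 8-byte UFS2 pointers */

    int
    main(void)
    {
        long lbnadd;
        int level, i;

        for (level = 0; level < 3; level++) {
            /* Same loop shape as indir_trunc(). */
            lbnadd = 1;
            for (i = level; i > 0; i--)
                lbnadd *= NINDIR;
            printf("level %d: each pointer spans %ld data blocks\n",
                level, lbnadd);
        }
        return (0);
    }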
/*
- * Free an allocindir.
- * This routine must be called with splbio interrupts blocked.
+ * Cancel an allocindir when it is removed via truncation.
*/
static void
-free_allocindir(aip, inodedep)
+cancel_allocindir(aip, inodedep, freeblks)
struct allocindir *aip;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
{
- struct freefrag *freefrag;
+ struct newblk *newblk;
- mtx_assert(&lk, MA_OWNED);
- if ((aip->ai_state & DEPCOMPLETE) == 0)
- LIST_REMOVE(aip, ai_deps);
- if (aip->ai_state & ONWORKLIST)
- WORKLIST_REMOVE(&aip->ai_list);
+ /*
+	 * If the journal hasn't been written, the jnewblk must be passed
+	 * to the call to ffs_blkfree() that reclaims the space.  We accomplish
+ * this by linking the journal dependency into the indirdep to be
+ * freed when indir_trunc() is called. If the journal has already
+ * been written we can simply reclaim the journal space when the
+ * freeblks work is complete.
+ */
LIST_REMOVE(aip, ai_next);
- if ((freefrag = aip->ai_freefrag) != NULL) {
+ newblk = (struct newblk *)aip;
+ if (newblk->nb_jnewblk == NULL)
+ cancel_newblk(newblk, &freeblks->fb_jwork);
+ else
+ cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
+ if (inodedep && inodedep->id_state & DEPCOMPLETE)
+ WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
+ else
+ free_newblk(newblk);
+}
+
+/*
+ * Create the mkdir dependencies for . and .. in a new directory. Link them
+ * in to a newdirblk so any subsequent additions are tracked properly. The
+ * caller is responsible for adding the mkdir1 dependency to the journal
+ * and updating id_mkdiradd. This function returns with lk held.
+ */
+static struct mkdir *
+setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
+ struct diradd *dap;
+ ino_t newinum;
+ ino_t dinum;
+ struct buf *newdirbp;
+ struct mkdir **mkdirp;
+{
+ struct newblk *newblk;
+ struct pagedep *pagedep;
+ struct inodedep *inodedep;
+ struct newdirblk *newdirblk = 0;
+ struct mkdir *mkdir1, *mkdir2;
+ struct worklist *wk;
+ struct jaddref *jaddref;
+ struct mount *mp;
+
+ mp = dap->da_list.wk_mp;
+ newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
+ M_SOFTDEP_FLAGS);
+ workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
+ LIST_INIT(&newdirblk->db_mkdir);
+ mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
+ workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
+ mkdir1->md_state = ATTACHED | MKDIR_BODY;
+ mkdir1->md_diradd = dap;
+ mkdir1->md_jaddref = NULL;
+ mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
+ workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
+ mkdir2->md_state = ATTACHED | MKDIR_PARENT;
+ mkdir2->md_diradd = dap;
+ mkdir2->md_jaddref = NULL;
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
+ mkdir1->md_state |= DEPCOMPLETE;
+ mkdir2->md_state |= DEPCOMPLETE;
+ }
+ /*
+ * Dependency on "." and ".." being written to disk.
+ */
+ mkdir1->md_buf = newdirbp;
+ ACQUIRE_LOCK(&lk);
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
+ /*
+ * We must link the pagedep, allocdirect, and newdirblk for
+ * the initial file page so the pointer to the new directory
+ * is not written until the directory contents are live and
+ * any subsequent additions are not marked live until the
+ * block is reachable via the inode.
+ */
+ if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
+ panic("setup_newdir: lost pagedep");
+ LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
+ if (wk->wk_type == D_ALLOCDIRECT)
+ break;
+ if (wk == NULL)
+ panic("setup_newdir: lost allocdirect");
+ newblk = WK_NEWBLK(wk);
+ pagedep->pd_state |= NEWBLOCK;
+ pagedep->pd_newdirblk = newdirblk;
+ newdirblk->db_pagedep = pagedep;
+ WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
+ WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
+ /*
+ * Look up the inodedep for the parent directory so that we
+ * can link mkdir2 into the pending dotdot jaddref or
+ * the inode write if there is none. If the inode is
+	 * ALLCOMPLETE and no jaddref is present, all dependencies have
+ * been satisfied and mkdir2 can be freed.
+ */
+ inodedep_lookup(mp, dinum, 0, &inodedep);
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
if (inodedep == NULL)
- add_to_worklist(&freefrag->ff_list);
- else
- WORKLIST_INSERT(&inodedep->id_bufwait,
- &freefrag->ff_list);
+ panic("setup_newdir: Lost parent.");
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
+ (jaddref->ja_state & MKDIR_PARENT),
+ ("setup_newdir: bad dotdot jaddref %p", jaddref));
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
+ mkdir2->md_jaddref = jaddref;
+ jaddref->ja_mkdir = mkdir2;
+ } else if (inodedep == NULL ||
+ (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ dap->da_state &= ~MKDIR_PARENT;
+ WORKITEM_FREE(mkdir2, D_MKDIR);
+ } else {
+ LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
+ WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
- WORKITEM_FREE(aip, D_ALLOCINDIR);
+ *mkdirp = mkdir2;
+
+ return (mkdir1);
}
/*
@@ -2998,12 +6260,14 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
ufs_lbn_t lbn; /* block in directory containing new entry */
struct fs *fs;
struct diradd *dap;
- struct allocdirect *adp;
+ struct newblk *newblk;
struct pagedep *pagedep;
struct inodedep *inodedep;
struct newdirblk *newdirblk = 0;
struct mkdir *mkdir1, *mkdir2;
+ struct jaddref *jaddref;
struct mount *mp;
+ int isindir;
/*
* Whiteouts have no dependencies.
@@ -3013,6 +6277,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
bdwrite(newdirbp);
return (0);
}
+ jaddref = NULL;
+ mkdir1 = mkdir2 = NULL;
mp = UFSTOVFS(dp->i_ump);
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
@@ -3023,111 +6289,123 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
dap->da_offset = offset;
dap->da_newinum = newinum;
dap->da_state = ATTACHED;
- if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
+ LIST_INIT(&dap->da_jwork);
+ isindir = bp->b_lblkno >= NDADDR;
+ if (isnewblk &&
+ (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
newdirblk = malloc(sizeof(struct newdirblk),
M_NEWDIRBLK, M_SOFTDEP_FLAGS);
workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
+ LIST_INIT(&newdirblk->db_mkdir);
}
+ /*
+	 * If we're creating a new directory, set up the dependencies and set
+ * the dap state to wait for them. Otherwise it's COMPLETE and
+ * we can move on.
+ */
if (newdirbp == NULL) {
dap->da_state |= DEPCOMPLETE;
ACQUIRE_LOCK(&lk);
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
- mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR,
- M_SOFTDEP_FLAGS);
- workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
- mkdir1->md_state = MKDIR_BODY;
- mkdir1->md_diradd = dap;
- mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR,
- M_SOFTDEP_FLAGS);
- workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
- mkdir2->md_state = MKDIR_PARENT;
- mkdir2->md_diradd = dap;
- /*
- * Dependency on "." and ".." being written to disk.
- */
- mkdir1->md_buf = newdirbp;
- ACQUIRE_LOCK(&lk);
- LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
- WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
- FREE_LOCK(&lk);
- bdwrite(newdirbp);
- /*
- * Dependency on link count increase for parent directory
- */
- ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
- || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
- dap->da_state &= ~MKDIR_PARENT;
- WORKITEM_FREE(mkdir2, D_MKDIR);
- } else {
- LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
- WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
- }
+ mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
+ &mkdir2);
}
/*
* Link into parent directory pagedep to await its being written.
*/
- if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
+ if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
+#ifdef DEBUG
+ if (diradd_lookup(pagedep, offset) != NULL)
+ panic("softdep_setup_directory_add: %p already at off %d\n",
+ diradd_lookup(pagedep, offset), offset);
+#endif
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
+ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
/*
- * Link into its inodedep. Put it on the id_bufwait list if the inode
- * is not yet written. If it is written, do the post-inode write
- * processing to put it on the id_pendinghd list.
+ * If we're journaling, link the diradd into the jaddref so it
+ * may be completed after the journal entry is written. Otherwise,
+ * link the diradd into its inodedep. If the inode is not yet
+	 * written, place it on the bufwait list, otherwise do the post-inode
+ * write processing to put it on the id_pendinghd list.
*/
- (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
- if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_directory_add: bad jaddref %p", jaddref));
+ jaddref->ja_diroff = diroffset;
+ jaddref->ja_diradd = dap;
+ add_to_journal(&jaddref->ja_list);
+ } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
- if (isnewblk) {
+ /*
+ * Add the journal entries for . and .. links now that the primary
+ * link is written.
+ */
+ if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
+ inoreflst, if_deps);
+ KASSERT(jaddref != NULL &&
+ jaddref->ja_ino == jaddref->ja_parent &&
+ (jaddref->ja_state & MKDIR_BODY),
+ ("softdep_setup_directory_add: bad dot jaddref %p",
+ jaddref));
+ mkdir1->md_jaddref = jaddref;
+ jaddref->ja_mkdir = mkdir1;
/*
- * Directories growing into indirect blocks are rare
- * enough and the frequency of new block allocation
- * in those cases even more rare, that we choose not
- * to bother tracking them. Rather we simply force the
- * new directory entry to disk.
+ * It is important that the dotdot journal entry
+ * is added prior to the dot entry since dot writes
+ * both the dot and dotdot links. These both must
+ * be added after the primary link for the journal
+ * to remain consistent.
*/
- if (lbn >= NDADDR) {
- FREE_LOCK(&lk);
- /*
- * We only have a new allocation when at the
- * beginning of a new block, not when we are
- * expanding into an existing block.
- */
- if (blkoff(fs, diroffset) == 0)
- return (1);
- return (0);
- }
+ add_to_journal(&mkdir2->md_jaddref->ja_list);
+ add_to_journal(&jaddref->ja_list);
+ }
+ /*
+	 * If we are adding a new directory, remember this diradd so that if
+ * we rename it we can keep the dot and dotdot dependencies. If
+ * we are adding a new name for an inode that has a mkdiradd we
+ * must be in rename and we have to move the dot and dotdot
+ * dependencies to this new name. The old name is being orphaned
+ * soon.
+ */
+ if (mkdir1 != NULL) {
+ if (inodedep->id_mkdiradd != NULL)
+ panic("softdep_setup_directory_add: Existing mkdir");
+ inodedep->id_mkdiradd = dap;
+ } else if (inodedep->id_mkdiradd)
+ merge_diradd(inodedep, dap);
+ if (newdirblk) {
/*
- * We only have a new allocation when at the beginning
- * of a new fragment, not when we are expanding into an
- * existing fragment. Also, there is nothing to do if we
- * are already tracking this block.
+ * There is nothing to do if we are already tracking
+ * this block.
*/
- if (fragoff(fs, diroffset) != 0) {
- FREE_LOCK(&lk);
- return (0);
- }
if ((pagedep->pd_state & NEWBLOCK) != 0) {
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
FREE_LOCK(&lk);
return (0);
}
- /*
- * Find our associated allocdirect and have it track us.
- */
- if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
- panic("softdep_setup_directory_add: lost inodedep");
- adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
- if (adp == NULL || adp->ad_lbn != lbn)
+ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
+ == 0)
panic("softdep_setup_directory_add: lost entry");
+ WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
pagedep->pd_state |= NEWBLOCK;
+ pagedep->pd_newdirblk = newdirblk;
newdirblk->db_pagedep = pagedep;
- WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
+ FREE_LOCK(&lk);
+ /*
+		 * If we extended into an indirect, signal direnter to sync.
+ */
+ if (isindir)
+ return (1);
+ return (0);
}
FREE_LOCK(&lk);
return (0);
@@ -3141,7 +6419,8 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
* occur while the move is in progress.
*/
void
-softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
+softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
+ struct buf *bp; /* Buffer holding directory block. */
struct inode *dp; /* inode for directory */
caddr_t base; /* address of dp->i_offset */
caddr_t oldloc; /* address of old directory location */
@@ -3150,40 +6429,204 @@ softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
{
int offset, oldoffset, newoffset;
struct pagedep *pagedep;
+ struct jmvref *jmvref;
struct diradd *dap;
+ struct direct *de;
+ struct mount *mp;
ufs_lbn_t lbn;
+ int flags;
- ACQUIRE_LOCK(&lk);
+ mp = UFSTOVFS(dp->i_ump);
+ de = (struct direct *)oldloc;
+ jmvref = NULL;
+ flags = 0;
+ /*
+ * Moves are always journaled as it would be too complex to
+ * determine if any affected adds or removes are present in the
+ * journal.
+ */
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ flags = DEPALLOC;
+ jmvref = newjmvref(dp, de->d_ino,
+ dp->i_offset + (oldloc - base),
+ dp->i_offset + (newloc - base));
+ }
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
- if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
- goto done;
oldoffset = offset + (oldloc - base);
newoffset = offset + (newloc - base);
-
- LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
- if (dap->da_offset != oldoffset)
- continue;
+ ACQUIRE_LOCK(&lk);
+ if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
+ if (pagedep)
+ WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
+ goto done;
+ }
+ dap = diradd_lookup(pagedep, oldoffset);
+ if (dap) {
dap->da_offset = newoffset;
- if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
- break;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
- dap, da_pdlist);
- break;
+ newoffset = DIRADDHASH(newoffset);
+ oldoffset = DIRADDHASH(oldoffset);
+ if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
+ newoffset != oldoffset) {
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
+ dap, da_pdlist);
+ }
}
- if (dap == NULL) {
+done:
+ if (jmvref) {
+ jmvref->jm_pagedep = pagedep;
+ LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
+ add_to_journal(&jmvref->jm_list);
+ }
+ bcopy(oldloc, newloc, entrysize);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Move the mkdir dependencies and journal work from one diradd to another
+ * when renaming a directory. The new name must depend on the mkdir deps
+ * completing as the old name did. Directories can only have one valid link
+ * at a time so one must be canonical.
+ */
+static void
+merge_diradd(inodedep, newdap)
+ struct inodedep *inodedep;
+ struct diradd *newdap;
+{
+ struct diradd *olddap;
+ struct mkdir *mkdir, *nextmd;
+ short state;
- LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
- if (dap->da_offset == oldoffset) {
- dap->da_offset = newoffset;
+ olddap = inodedep->id_mkdiradd;
+ inodedep->id_mkdiradd = newdap;
+ if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
+ newdap->da_state &= ~DEPCOMPLETE;
+ for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
+ nextmd = LIST_NEXT(mkdir, md_mkdirs);
+ if (mkdir->md_diradd != olddap)
+ continue;
+ mkdir->md_diradd = newdap;
+ state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
+ newdap->da_state |= state;
+ olddap->da_state &= ~state;
+ if ((olddap->da_state &
+ (MKDIR_PARENT | MKDIR_BODY)) == 0)
break;
+ }
+ if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
+ panic("merge_diradd: unfound ref");
+ }
+ /*
+ * Any mkdir related journal items are not safe to be freed until
+ * the new name is stable.
+ */
+ jwork_move(&newdap->da_jwork, &olddap->da_jwork);
+ olddap->da_state |= DEPCOMPLETE;
+ complete_diradd(olddap);
+}
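The loop above moves each mkdir's MKDIR_PARENT or MKDIR_BODY bit from the
old diradd to the new one until the old name holds neither. Reduced to the
mask operations alone (the flag values below are invented, not the kernel's):

    #include <assert.h>
    #include <stdio.h>

    #define MKDIR_PARENT    0x01    /* toy values, not the kernel's */
    #define MKDIR_BODY      0x02

    int
    main(void)
    {
        short oldstate = MKDIR_PARENT | MKDIR_BODY;
        short newstate = 0;
        short mdstate[] = { MKDIR_BODY, MKDIR_PARENT }; /* the two mkdirs */
        short state;
        size_t i;

        for (i = 0; i < 2; i++) {
            /* Transfer this mkdir's bit from the old name to the new. */
            state = mdstate[i] & (MKDIR_PARENT | MKDIR_BODY);
            newstate |= state;
            oldstate &= ~state;
        }
        assert((oldstate & (MKDIR_PARENT | MKDIR_BODY)) == 0);
        printf("new diradd now waits on 0x%x\n", newstate);
        return (0);
    }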
+
+/*
+ * Move the diradd to the pending list when all diradd dependencies are
+ * complete.
+ */
+static void
+complete_diradd(dap)
+ struct diradd *dap;
+{
+ struct pagedep *pagedep;
+
+ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ if (dap->da_state & DIRCHG)
+ pagedep = dap->da_previous->dm_pagedep;
+ else
+ pagedep = dap->da_pagedep;
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
+ }
+}
+
+/*
+ * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal
+ * add entries and conditionally journal the remove.
+ */
+static void
+cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
+ struct diradd *dap;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct inoref *inoref;
+ struct mkdir *mkdir;
+
+ /*
+ * If no remove references were allocated we're on a non-journaled
+ * filesystem and can skip the cancel step.
+ */
+ if (jremref == NULL) {
+ free_diradd(dap, NULL);
+ return;
+ }
+ /*
+	 * Cancel the primary name and free it if it does not require
+ * journaling.
+ */
+ if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
+ 0, &inodedep) != 0) {
+		/* Abort the addref that references this diradd.  */
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if (inoref->if_list.wk_type != D_JADDREF)
+ continue;
+ jaddref = (struct jaddref *)inoref;
+ if (jaddref->ja_diradd != dap)
+ continue;
+ if (cancel_jaddref(jaddref, inodedep,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(jremref);
+ jremref = NULL;
}
+ break;
}
}
-done:
- bcopy(oldloc, newloc, entrysize);
- FREE_LOCK(&lk);
+ /*
+ * Cancel subordinate names and free them if they do not require
+ * journaling.
+ */
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
+ LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
+ if (mkdir->md_diradd != dap)
+ continue;
+ if ((jaddref = mkdir->md_jaddref) == NULL)
+ continue;
+ mkdir->md_jaddref = NULL;
+ if (mkdir->md_state & MKDIR_PARENT) {
+ if (cancel_jaddref(jaddref, NULL,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(dotdotremref);
+ dotdotremref = NULL;
+ }
+ } else {
+ if (cancel_jaddref(jaddref, inodedep,
+ &dirrem->dm_jwork) == 0) {
+ free_jremref(dotremref);
+ dotremref = NULL;
+ }
+ }
+ }
+ }
+
+ if (jremref)
+ journal_jremref(dirrem, jremref, inodedep);
+ if (dotremref)
+ journal_jremref(dirrem, dotremref, inodedep);
+ if (dotdotremref)
+ journal_jremref(dirrem, dotdotremref, NULL);
+ jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
+ free_diradd(dap, &dirrem->dm_jwork);
}
/*
@@ -3191,8 +6634,9 @@ done:
* with splbio interrupts blocked.
*/
static void
-free_diradd(dap)
+free_diradd(dap, wkhd)
struct diradd *dap;
+ struct workhead *wkhd;
{
struct dirrem *dirrem;
struct pagedep *pagedep;
@@ -3200,32 +6644,48 @@ free_diradd(dap)
struct mkdir *mkdir, *nextmd;
mtx_assert(&lk, MA_OWNED);
- WORKLIST_REMOVE(&dap->da_list);
LIST_REMOVE(dap, da_pdlist);
+ if (dap->da_state & ONWORKLIST)
+ WORKLIST_REMOVE(&dap->da_list);
if ((dap->da_state & DIRCHG) == 0) {
pagedep = dap->da_pagedep;
} else {
dirrem = dap->da_previous;
pagedep = dirrem->dm_pagedep;
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ dirrem->dm_state |= COMPLETE;
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
0, &inodedep) != 0)
- (void) free_inodedep(inodedep);
+ if (inodedep->id_mkdiradd == dap)
+ inodedep->id_mkdiradd = NULL;
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
nextmd = LIST_NEXT(mkdir, md_mkdirs);
if (mkdir->md_diradd != dap)
continue;
- dap->da_state &= ~mkdir->md_state;
- WORKLIST_REMOVE(&mkdir->md_list);
+ dap->da_state &=
+ ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
LIST_REMOVE(mkdir, md_mkdirs);
+ if (mkdir->md_state & ONWORKLIST)
+ WORKLIST_REMOVE(&mkdir->md_list);
+ if (mkdir->md_jaddref != NULL)
+ panic("free_diradd: Unexpected jaddref");
WORKITEM_FREE(mkdir, D_MKDIR);
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
+ break;
}
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
panic("free_diradd: unfound ref");
}
+ if (inodedep)
+ free_inodedep(inodedep);
+ /*
+ * Free any journal segments waiting for the directory write.
+ */
+ handle_jwork(&dap->da_jwork);
WORKITEM_FREE(dap, D_DIRADD);
}
@@ -3254,11 +6714,24 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
int isrmdir; /* indicates if doing RMDIR */
{
struct dirrem *dirrem, *prevdirrem;
+ struct inodedep *inodedep;
+ int direct;
/*
- * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
+ * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want
+	 * newdirrem() to set up the full directory remove, which requires
+ * isrmdir > 1.
*/
- dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
+ dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem);
+ /*
+ * Add the dirrem to the inodedep's pending remove list for quick
+ * discovery later.
+ */
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0)
+ panic("softdep_setup_remove: Lost inodedep.");
+ dirrem->dm_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
/*
* If the COMPLETE flag is clear, then there were no active
@@ -3280,9 +6753,146 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
prevdirrem, dm_next);
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
+ direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
FREE_LOCK(&lk);
- handle_workitem_remove(dirrem, NULL);
+ if (direct)
+ handle_workitem_remove(dirrem, NULL);
+ }
+}
+
+/*
+ * Check for an entry matching 'offset' on both the pd_diraddhd list and the
+ * pd_pendinghd list of a pagedep.
+ */
+static struct diradd *
+diradd_lookup(pagedep, offset)
+ struct pagedep *pagedep;
+ int offset;
+{
+ struct diradd *dap;
+
+ LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
+ if (dap->da_offset == offset)
+ return (dap);
+ LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
+ if (dap->da_offset == offset)
+ return (dap);
+ return (NULL);
+}
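The two-list probe (hash chains first, then the pending list) can be
exercised outside the kernel with the same queue(3) machinery; DAHASHSZ and
the hash function below are stand-ins for the kernel definitions:

    #include <stdio.h>
    #include <sys/queue.h>

    #define DAHASHSZ        6               /* stand-in for the kernel constant */
    #define DIRADDHASH(off) (((off) >> 2) % DAHASHSZ)

    struct diradd {
        LIST_ENTRY(diradd) da_pdlist;
        int da_offset;
    };
    LIST_HEAD(dalist, diradd);

    /* Probe the per-offset hash chain first, then the pending list. */
    static struct diradd *
    lookup(struct dalist *hash, struct dalist *pending, int offset)
    {
        struct diradd *dap;

        LIST_FOREACH(dap, &hash[DIRADDHASH(offset)], da_pdlist)
            if (dap->da_offset == offset)
                return (dap);
        LIST_FOREACH(dap, pending, da_pdlist)
            if (dap->da_offset == offset)
                return (dap);
        return (NULL);
    }

    int
    main(void)
    {
        struct dalist hash[DAHASHSZ], pending;
        struct diradd d = { .da_offset = 512 };
        int i;

        for (i = 0; i < DAHASHSZ; i++)
            LIST_INIT(&hash[i]);
        LIST_INIT(&pending);
        LIST_INSERT_HEAD(&hash[DIRADDHASH(512)], &d, da_pdlist);
        printf("found at 512: %d\n", lookup(hash, &pending, 512) != NULL);
        return (0);
    }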
+
+/*
+ * Search for a .. diradd dependency in a directory that is being removed.
+ * If the directory was renamed to a new parent we have a diradd rather
+ * than a mkdir for the .. entry. We need to cancel it now before
+ * it is found in truncate().
+ */
+static struct jremref *
+cancel_diradd_dotdot(ip, dirrem, jremref)
+ struct inode *ip;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+{
+ struct pagedep *pagedep;
+ struct diradd *dap;
+ struct worklist *wk;
+
+ if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
+ &pagedep) == 0)
+ return (jremref);
+ dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
+ if (dap == NULL)
+ return (jremref);
+ cancel_diradd(dap, dirrem, jremref, NULL, NULL);
+ /*
+ * Mark any journal work as belonging to the parent so it is freed
+ * with the .. reference.
+ */
+ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
+ wk->wk_state |= MKDIR_PARENT;
+ return (NULL);
+}
+
+/*
+ * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
+ * replace it with a dirrem/diradd pair as a result of re-parenting a
+ * directory. This ensures that we don't simultaneously have a mkdir and
+ * a diradd for the same .. entry.
+ */
+static struct jremref *
+cancel_mkdir_dotdot(ip, dirrem, jremref)
+ struct inode *ip;
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+{
+ struct inodedep *inodedep;
+ struct jaddref *jaddref;
+ struct mkdir *mkdir;
+ struct diradd *dap;
+
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0)
+ panic("cancel_mkdir_dotdot: Lost inodedep");
+ dap = inodedep->id_mkdiradd;
+ if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
+ return (jremref);
+ for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
+ mkdir = LIST_NEXT(mkdir, md_mkdirs))
+ if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
+ break;
+ if (mkdir == NULL)
+ panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
+ if ((jaddref = mkdir->md_jaddref) != NULL) {
+ mkdir->md_jaddref = NULL;
+ jaddref->ja_state &= ~MKDIR_PARENT;
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
+ &inodedep) == 0)
+ panic("cancel_mkdir_dotdot: Lost parent inodedep");
+ if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
+ journal_jremref(dirrem, jremref, inodedep);
+ jremref = NULL;
+ }
}
+ if (mkdir->md_state & ONWORKLIST)
+ WORKLIST_REMOVE(&mkdir->md_list);
+ mkdir->md_state |= ALLCOMPLETE;
+ complete_mkdir(mkdir);
+ return (jremref);
+}
+
+static void
+journal_jremref(dirrem, jremref, inodedep)
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct inodedep *inodedep;
+{
+
+ if (inodedep == NULL)
+ if (inodedep_lookup(jremref->jr_list.wk_mp,
+ jremref->jr_ref.if_ino, 0, &inodedep) == 0)
+ panic("journal_jremref: Lost inodedep");
+ LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
+ TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
+ add_to_journal(&jremref->jr_list);
+}
+
+static void
+dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
+ struct dirrem *dirrem;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+{
+ struct inodedep *inodedep;
+
+ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
+ &inodedep) == 0)
+ panic("dirrem_journal: Lost inodedep");
+ journal_jremref(dirrem, jremref, inodedep);
+ if (dotremref)
+ journal_jremref(dirrem, dotremref, inodedep);
+ if (dotdotremref)
+ journal_jremref(dirrem, dotdotremref, NULL);
}
/*
@@ -3303,12 +6913,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
struct diradd *dap;
struct dirrem *dirrem;
struct pagedep *pagedep;
+ struct jremref *jremref;
+ struct jremref *dotremref;
+ struct jremref *dotdotremref;
+ struct vnode *dvp;
/*
* Whiteouts have no deletion dependencies.
*/
if (ip == NULL)
panic("newdirrem: whiteout");
+ dvp = ITOV(dp);
/*
* If we are over our limit, try to improve the situation.
* Limiting the number of dirrem structures will also limit
@@ -3321,34 +6936,75 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
FREE_LOCK(&lk);
dirrem = malloc(sizeof(struct dirrem),
M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
- workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
+ workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
+ LIST_INIT(&dirrem->dm_jremrefhd);
+ LIST_INIT(&dirrem->dm_jwork);
dirrem->dm_state = isrmdir ? RMDIR : 0;
dirrem->dm_oldinum = ip->i_number;
*prevdirremp = NULL;
-
+ /*
+ * Allocate remove reference structures to track journal write
+ * dependencies. We will always have one for the link and
+ * when doing directories we will always have one more for dot.
+ * When renaming a directory we skip the dotdot link change so
+ * this is not needed.
+ */
+ jremref = dotremref = dotdotremref = NULL;
+ if (DOINGSUJ(dvp)) {
+ if (isrmdir) {
+ jremref = newjremref(dirrem, dp, ip, dp->i_offset,
+ ip->i_effnlink + 2);
+ dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
+ ip->i_effnlink + 1);
+ } else
+ jremref = newjremref(dirrem, dp, ip, dp->i_offset,
+ ip->i_effnlink + 1);
+ if (isrmdir > 1) {
+ dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
+ dp->i_effnlink + 1);
+ dotdotremref->jr_state |= MKDIR_PARENT;
+ }
+ }
ACQUIRE_LOCK(&lk);
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
- if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
+ if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
+ &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dirrem->dm_pagedep = pagedep;
/*
+ * If we're renaming a .. link to a new directory, cancel any
+	 * existing MKDIR_PARENT mkdir.  If it has already been canceled,
+	 * the jremref is preserved for any potential diradd in this
+	 * location.  This cannot coincide with a rmdir.
+ */
+ if (dp->i_offset == DOTDOT_OFFSET) {
+ if (isrmdir)
+ panic("newdirrem: .. directory change during remove?");
+ jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
+ }
+ /*
+	 * If we're removing a directory, search for the .. dependency now and
+ * cancel it. Any pending journal work will be added to the dirrem
+ * to be completed when the workitem remove completes.
+ */
+ if (isrmdir > 1)
+ dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
+ /*
* Check for a diradd dependency for the same directory entry.
* If present, then both dependencies become obsolete and can
- * be de-allocated. Check for an entry on both the pd_dirraddhd
- * list and the pd_pendinghd list.
+ * be de-allocated.
*/
-
- LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
- if (dap->da_offset == offset)
- break;
+ dap = diradd_lookup(pagedep, offset);
if (dap == NULL) {
-
- LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
- if (dap->da_offset == offset)
- break;
- if (dap == NULL)
- return (dirrem);
+ /*
+ * Link the jremref structures into the dirrem so they are
+ * written prior to the pagedep.
+ */
+ if (jremref)
+ dirrem_journal(dirrem, jremref, dotremref,
+ dotdotremref);
+ return (dirrem);
}
/*
* Must be ATTACHED at this point.
@@ -3373,7 +7029,17 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
* Mark it COMPLETE so we can delete its inode immediately.
*/
dirrem->dm_state |= COMPLETE;
- free_diradd(dap);
+ cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
+#ifdef SUJ_DEBUG
+ if (isrmdir == 0) {
+ struct worklist *wk;
+
+ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
+ if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
+ panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
+ }
+#endif
+
return (dirrem);
}
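
A minimal sketch of the reference accounting above (userspace C, hypothetical helper name, not part of the patch): how many jremref structures newdirrem() allocates for each kind of removal.

    #include <stdio.h>

    /*
     * isrmdir == 0 is a plain unlink, 1 is rmdir, and > 1 marks a
     * directory rename, which must also cover the ".." change.
     */
    static int
    jremref_count(int isrmdir)
    {
        int n = 1;              /* the removed name itself */

        if (isrmdir)
            n++;                /* the "." self-reference */
        if (isrmdir > 1)
            n++;                /* the ".." parent reference */
        return (n);
    }

    int
    main(void)
    {
        printf("unlink: %d, rmdir: %d, dir rename: %d\n",
            jremref_count(0), jremref_count(1), jremref_count(2));
        return (0);
    }
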
@@ -3407,6 +7073,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
struct dirrem *dirrem, *prevdirrem;
struct pagedep *pagedep;
struct inodedep *inodedep;
+ struct jaddref *jaddref;
struct mount *mp;
offset = blkoff(dp->i_fs, dp->i_offset);
@@ -3422,6 +7089,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
dap->da_offset = offset;
dap->da_newinum = newinum;
+ LIST_INIT(&dap->da_jwork);
}
/*
@@ -3454,11 +7122,21 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dm_next);
} else {
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
FREE_LOCK(&lk);
return;
}
+ /*
+ * Add the dirrem to the inodedep's pending remove list for quick
+ * discovery later. A valid nlinkdelta ensures that this lookup
+ * will not fail.
+ */
+ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
+ panic("softdep_setup_directory_change: Lost inodedep.");
+ dirrem->dm_state |= ONDEPLIST;
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
/*
* If the COMPLETE flag is clear, then there were no active
@@ -3483,15 +7161,29 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap->da_pagedep = pagedep;
}
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ if (LIST_EMPTY(&dirrem->dm_jremrefhd))
+ add_to_worklist(&dirrem->dm_list, 0);
}
/*
- * Link into its inodedep. Put it on the id_bufwait list if the inode
+	 * Look up the jaddref for this journal entry. We must finish
+	 * initializing it and make the diradd write dependent on it.
+	 * If we're not journaling, put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
- if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
- (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
+ if (mp->mnt_kern_flag & MNTK_SUJ) {
+ jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
+ inoreflst);
+ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
+ ("softdep_setup_directory_change: bad jaddref %p",
+ jaddref));
+ jaddref->ja_diroff = dp->i_offset;
+ jaddref->ja_diradd = dap;
+ LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
+ dap, da_pdlist);
+ add_to_journal(&jaddref->ja_list);
+ } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
dap->da_state |= COMPLETE;
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
@@ -3500,6 +7192,13 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
}
+ /*
+ * If we're making a new name for a directory that has not been
+	 * committed, we need to move the dot and dotdot references to
+ * this new name.
+ */
+ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
+ merge_diradd(inodedep, dap);
FREE_LOCK(&lk);
}
@@ -3516,8 +7215,7 @@ softdep_change_linkcnt(ip)
struct inodedep *inodedep;
ACQUIRE_LOCK(&lk);
- (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
- DEPALLOC, &inodedep);
+ inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
if (ip->i_nlink < ip->i_effnlink)
panic("softdep_change_linkcnt: bad delta");
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
@@ -3574,6 +7272,305 @@ softdep_releasefile(ip)
}
/*
+ * Attach a sbdep dependency to the superblock buf so that we can keep
+ * track of the head of the linked list of referenced but unlinked inodes.
+ */
+void
+softdep_setup_sbupdate(ump, fs, bp)
+ struct ufsmount *ump;
+ struct fs *fs;
+ struct buf *bp;
+{
+ struct sbdep *sbdep;
+ struct worklist *wk;
+
+ if ((fs->fs_flags & FS_SUJ) == 0)
+ return;
+ LIST_FOREACH(wk, &bp->b_dep, wk_list)
+ if (wk->wk_type == D_SBDEP)
+ break;
+ if (wk != NULL)
+ return;
+ sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
+ workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
+ sbdep->sb_fs = fs;
+ sbdep->sb_ump = ump;
+ ACQUIRE_LOCK(&lk);
+ WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Return the first unlinked inodedep which is ready to be the head of the
+ * list. The inodedep and all those after it must have valid next pointers.
+ */
+static struct inodedep *
+first_unlinked_inodedep(ump)
+ struct ufsmount *ump;
+{
+ struct inodedep *inodedep;
+ struct inodedep *idp;
+
+ for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
+ inodedep; inodedep = idp) {
+ if ((inodedep->id_state & UNLINKNEXT) == 0)
+ return (NULL);
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
+ break;
+ if ((inodedep->id_state & UNLINKPREV) == 0)
+ panic("first_unlinked_inodedep: prev != next");
+ }
+ if (inodedep == NULL)
+ return (NULL);
+
+ return (inodedep);
+}
+
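
The backward walk above uses only standard tail-queue primitives; here is a minimal userspace sketch of the same scan, assuming a BSD-style <sys/queue.h>, with a NEXT flag standing in for UNLINKNEXT ("my inode number is already committed in my predecessor").

    #include <sys/queue.h>
    #include <stdio.h>

    #define NEXT 0x01               /* stands in for UNLINKNEXT */

    struct node {
        int state;
        TAILQ_ENTRY(node) link;
    };
    TAILQ_HEAD(nodelst, node);

    /*
     * Walk backward from the tail; every element behind the result
     * must already have its on-disk next pointer committed.
     */
    static struct node *
    first_ready(struct nodelst *head)
    {
        struct node *np, *pp;

        for (np = TAILQ_LAST(head, nodelst); np != NULL; np = pp) {
            if ((np->state & NEXT) == 0)
                return (NULL);
            pp = TAILQ_PREV(np, nodelst, link);
            if (pp == NULL || (pp->state & NEXT) == 0)
                break;              /* earliest fully linked node */
        }
        return (np);
    }

    int
    main(void)
    {
        struct nodelst head = TAILQ_HEAD_INITIALIZER(head);
        struct node n[3];
        int i;

        for (i = 0; i < 3; i++) {
            n[i].state = NEXT;
            TAILQ_INSERT_TAIL(&head, &n[i], link);
        }
        printf("first ready: node %d\n", (int)(first_ready(&head) - n));
        return (0);
    }
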
+/*
+ * Set the sujfree unlinked head pointer prior to writing a superblock.
+ */
+static void
+initiate_write_sbdep(sbdep)
+ struct sbdep *sbdep;
+{
+ struct inodedep *inodedep;
+ struct fs *bpfs;
+ struct fs *fs;
+
+ bpfs = sbdep->sb_fs;
+ fs = sbdep->sb_ump->um_fs;
+ inodedep = first_unlinked_inodedep(sbdep->sb_ump);
+ if (inodedep) {
+ fs->fs_sujfree = inodedep->id_ino;
+ inodedep->id_state |= UNLINKPREV;
+ } else
+ fs->fs_sujfree = 0;
+ bpfs->fs_sujfree = fs->fs_sujfree;
+}
+
+/*
+ * After a superblock is written determine whether it must be written again
+ * due to a changing unlinked list head.
+ */
+static int
+handle_written_sbdep(sbdep, bp)
+ struct sbdep *sbdep;
+ struct buf *bp;
+{
+ struct inodedep *inodedep;
+ struct mount *mp;
+ struct fs *fs;
+
+ fs = sbdep->sb_fs;
+ mp = UFSTOVFS(sbdep->sb_ump);
+ inodedep = first_unlinked_inodedep(sbdep->sb_ump);
+ if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
+ (inodedep == NULL && fs->fs_sujfree != 0)) {
+ bdirty(bp);
+ return (1);
+ }
+ WORKITEM_FREE(sbdep, D_SBDEP);
+ if (fs->fs_sujfree == 0)
+ return (0);
+ if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
+ panic("handle_written_sbdep: lost inodedep");
+ /*
+	 * Now that we have a record of this inode in stable store, allow it
+ * to be written to free up pending work. Inodes may see a lot of
+ * write activity after they are unlinked which we must not hold up.
+ */
+ for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
+ if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
+ panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
+ inodedep, inodedep->id_state);
+ if (inodedep->id_state & UNLINKONLIST)
+ break;
+ inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
+ }
+
+ return (0);
+}
+
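
The return convention above is shared by the handle_written_*() routines: returning 1 asks the completion path to redirty the buffer and reattach the work item, so the check runs again after the follow-up write. A compact userspace sketch with hypothetical stand-in types:

    #include <stdio.h>

    struct xbuf { int dirty; };
    struct work { int stale; };     /* head moved during the write */

    /* Returns 1 if the buffer must be rewritten with wk reattached. */
    static int
    handle_written(struct work *wk, struct xbuf *bp)
    {
        if (wk->stale) {
            bp->dirty = 1;          /* bdirty(bp) analogue */
            return (1);
        }
        return (0);                 /* WORKITEM_FREE() analogue */
    }

    int
    main(void)
    {
        struct xbuf bp = { 0 };
        struct work wk = { 1 };

        while (handle_written(&wk, &bp)) {
            printf("rewriting superblock\n");
            bp.dirty = 0;           /* the follow-up write happened */
            wk.stale = 0;           /* and the head is now stable */
        }
        return (0);
    }
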
+/*
+ * Mark an inodedep as unlinked and insert it into the in-memory unlinked
+ * list.
+ */
+static void
+unlinked_inodedep(mp, inodedep)
+ struct mount *mp;
+ struct inodedep *inodedep;
+{
+ struct ufsmount *ump;
+
+ if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
+ return;
+ ump = VFSTOUFS(mp);
+ ump->um_fs->fs_fmod = 1;
+ inodedep->id_state |= UNLINKED;
+ TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
+}
+
+/*
+ * Remove an inodedep from the unlinked inodedep list. This may require
+ * disk writes if the inode has made it that far.
+ */
+static void
+clear_unlinked_inodedep(inodedep)
+ struct inodedep *inodedep;
+{
+ struct ufsmount *ump;
+ struct inodedep *idp;
+ struct inodedep *idn;
+ struct fs *fs;
+ struct buf *bp;
+ ino_t ino;
+ ino_t nino;
+ ino_t pino;
+ int error;
+
+ ump = VFSTOUFS(inodedep->id_list.wk_mp);
+ fs = ump->um_fs;
+ ino = inodedep->id_ino;
+ error = 0;
+ for (;;) {
+ /*
+		 * If nothing has yet been written, simply remove us from
+		 * the in-memory list and return. This is the most common
+ * case where handle_workitem_remove() loses the final
+ * reference.
+ */
+ if ((inodedep->id_state & UNLINKLINKS) == 0)
+ break;
+ /*
+ * If we have a NEXT pointer and no PREV pointer we can simply
+ * clear NEXT's PREV and remove ourselves from the list. Be
+ * careful not to clear PREV if the superblock points at
+ * next as well.
+ */
+ idn = TAILQ_NEXT(inodedep, id_unlinked);
+ if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
+ if (idn && fs->fs_sujfree != idn->id_ino)
+ idn->id_state &= ~UNLINKPREV;
+ break;
+ }
+ /*
+ * Here we have an inodedep which is actually linked into
+ * the list. We must remove it by forcing a write to the
+ * link before us, whether it be the superblock or an inode.
+ * Unfortunately the list may change while we're waiting
+ * on the buf lock for either resource so we must loop until
+		 * we lock the right one. If both the superblock and an
+ * inode point to this inode we must clear the inode first
+ * followed by the superblock.
+ */
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ pino = 0;
+ if (idp && (idp->id_state & UNLINKNEXT))
+ pino = idp->id_ino;
+ FREE_LOCK(&lk);
+ if (pino == 0)
+ bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
+ (int)fs->fs_sbsize, 0, 0, 0);
+ else
+ error = bread(ump->um_devvp,
+ fsbtodb(fs, ino_to_fsba(fs, pino)),
+ (int)fs->fs_bsize, NOCRED, &bp);
+ ACQUIRE_LOCK(&lk);
+ if (error)
+ break;
+ /* If the list has changed restart the loop. */
+ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
+ nino = 0;
+ if (idp && (idp->id_state & UNLINKNEXT))
+ nino = idp->id_ino;
+ if (nino != pino ||
+ (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
+ FREE_LOCK(&lk);
+ brelse(bp);
+ ACQUIRE_LOCK(&lk);
+ continue;
+ }
+ /*
+		 * Remove us from the in-memory list. After this we cannot
+ * access the inodedep.
+ */
+ idn = TAILQ_NEXT(inodedep, id_unlinked);
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
+ /*
+ * Determine the next inode number.
+ */
+ nino = 0;
+ if (idn) {
+ /*
+ * If next isn't on the list we can just clear prev's
+ * state and schedule it to be fixed later. No need
+ * to synchronously write if we're not in the real
+ * list.
+ */
+ if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
+ idp->id_state &= ~UNLINKNEXT;
+ if ((idp->id_state & ONWORKLIST) == 0)
+ WORKLIST_INSERT(&bp->b_dep,
+ &idp->id_list);
+ FREE_LOCK(&lk);
+ bawrite(bp);
+ ACQUIRE_LOCK(&lk);
+ return;
+ }
+ nino = idn->id_ino;
+ }
+ FREE_LOCK(&lk);
+ /*
+ * The predecessor's next pointer is manually updated here
+ * so that the NEXT flag is never cleared for an element
+ * that is in the list.
+ */
+ if (pino == 0) {
+ bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
+ ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
+ softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
+ bp);
+ } else if (fs->fs_magic == FS_UFS1_MAGIC)
+ ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, pino))->di_freelink = nino;
+ else
+ ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, pino))->di_freelink = nino;
+ /*
+ * If the bwrite fails we have no recourse to recover. The
+ * filesystem is corrupted already.
+ */
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ /*
+ * If the superblock pointer still needs to be cleared force
+ * a write here.
+ */
+ if (fs->fs_sujfree == ino) {
+ FREE_LOCK(&lk);
+ bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
+ (int)fs->fs_sbsize, 0, 0, 0);
+ bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
+ ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
+ softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
+ bp);
+ bwrite(bp);
+ ACQUIRE_LOCK(&lk);
+ }
+ if (fs->fs_sujfree != ino)
+ return;
+ panic("clear_unlinked_inodedep: Failed to clear free head");
+ }
+ if (inodedep->id_ino == fs->fs_sujfree)
+ panic("clear_unlinked_inodedep: Freeing head of free list");
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
+ return;
+}
+
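The write ordering above, predecessor first and the superblock head last, can be modeled in a few lines. A userspace sketch with a hypothetical write_block() in place of the real buffer-cache writes:

    #include <stdio.h>

    struct dnode {
        int ino;
        int freelink;               /* on-disk next pointer */
    };

    static int head_free;           /* stands in for fs_sujfree */

    static void
    write_block(const char *what, int ino)
    {
        /* A real implementation would bwrite() a buffer here. */
        printf("write %s for ino %d\n", what, ino);
    }

    /*
     * Splice victim out of the on-disk singly linked list.  The
     * predecessor (if any) reaches stable storage first; only then
     * is the head moved, so a crash never leaves the head pointing
     * past an element whose link was not yet valid.
     */
    static void
    unlink_node(struct dnode *prev, struct dnode *victim)
    {
        if (prev != NULL) {
            prev->freelink = victim->freelink;
            write_block("inode block", prev->ino);
        }
        if (head_free == victim->ino) {
            head_free = victim->freelink;
            write_block("superblock", victim->ino);
        }
    }

    int
    main(void)
    {
        struct dnode a = { 7, 9 }, b = { 9, 0 };

        head_free = a.ino;
        unlink_node(&a, &b);        /* interior element */
        unlink_node(NULL, &a);      /* then the head */
        printf("head is now %d\n", head_free);
        return (0);
    }
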
+/*
* This workitem decrements the inode's link count.
* If the link count reaches zero, the file is removed.
*/
@@ -3584,22 +7581,54 @@ handle_workitem_remove(dirrem, xp)
{
struct thread *td = curthread;
struct inodedep *inodedep;
+ struct workhead dotdotwk;
+ struct worklist *wk;
+ struct ufsmount *ump;
+ struct mount *mp;
struct vnode *vp;
struct inode *ip;
ino_t oldinum;
int error;
+ if (dirrem->dm_state & ONWORKLIST)
+ panic("handle_workitem_remove: dirrem %p still on worklist",
+ dirrem);
+ oldinum = dirrem->dm_oldinum;
+ mp = dirrem->dm_list.wk_mp;
+ ump = VFSTOUFS(mp);
if ((vp = xp) == NULL &&
- (error = ffs_vgetf(dirrem->dm_list.wk_mp,
- dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) {
+ (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
+ FFSV_FORCEINSMQ)) != 0) {
softdep_error("handle_workitem_remove: vget", error);
return;
}
ip = VTOI(vp);
ACQUIRE_LOCK(&lk);
- if ((inodedep_lookup(dirrem->dm_list.wk_mp,
- dirrem->dm_oldinum, 0, &inodedep)) == 0)
+ if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
panic("handle_workitem_remove: lost inodedep");
+ if (dirrem->dm_state & ONDEPLIST)
+ LIST_REMOVE(dirrem, dm_inonext);
+ KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
+ ("handle_workitem_remove: Journal entries not written."));
+
+ /*
+ * Move all dependencies waiting on the remove to complete
+ * from the dirrem to the inode inowait list to be completed
+ * after the inode has been updated and written to disk. Any
+ * marked MKDIR_PARENT are saved to be completed when the .. ref
+ * is removed.
+ */
+ LIST_INIT(&dotdotwk);
+ while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ if (wk->wk_state & MKDIR_PARENT) {
+ wk->wk_state &= ~MKDIR_PARENT;
+ WORKLIST_INSERT(&dotdotwk, wk);
+ continue;
+ }
+ WORKLIST_INSERT(&inodedep->id_inowait, wk);
+ }
+ LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
/*
* Normal file deletion.
*/
@@ -3609,12 +7638,16 @@ handle_workitem_remove(dirrem, xp)
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
+ if (ip->i_nlink == 0)
+ unlinked_inodedep(mp, inodedep);
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
num_dirrem -= 1;
+ KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
+ ("handle_workitem_remove: worklist not empty. %s",
+ TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
WORKITEM_FREE(dirrem, D_DIRREM);
FREE_LOCK(&lk);
- vput(vp);
- return;
+ goto out;
}
/*
* Directory deletion. Decrement reference count for both the
@@ -3628,6 +7661,8 @@ handle_workitem_remove(dirrem, xp)
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad dir delta");
+ if (ip->i_nlink == 0)
+ unlinked_inodedep(mp, inodedep);
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
@@ -3639,36 +7674,47 @@ handle_workitem_remove(dirrem, xp)
* directory should not change. Thus we skip the followup dirrem.
*/
if (dirrem->dm_state & DIRCHG) {
+ KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
+ ("handle_workitem_remove: DIRCHG and worklist not empty."));
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
FREE_LOCK(&lk);
- vput(vp);
- return;
+ goto out;
}
+ dirrem->dm_state = ONDEPLIST;
+ dirrem->dm_oldinum = dirrem->dm_dirinum;
/*
- * If the inodedep does not exist, then the zero'ed inode has
- * been written to disk. If the allocated inode has never been
- * written to disk, then the on-disk inode is zero'ed. In either
- * case we can remove the file immediately.
+ * Place the dirrem on the parent's diremhd list.
*/
- dirrem->dm_state = 0;
- oldinum = dirrem->dm_oldinum;
- dirrem->dm_oldinum = dirrem->dm_dirinum;
- if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
- 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
+ if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
+ panic("handle_workitem_remove: lost dir inodedep");
+ LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
+ /*
+ * If the allocated inode has never been written to disk, then
+ * the on-disk inode is zero'ed and we can remove the file
+	 * immediately. When journaling, if the inode has been marked
+	 * unlinked and not DEPCOMPLETE, we know it can never be written.
+ */
+ inodedep_lookup(mp, oldinum, 0, &inodedep);
+ if (inodedep == NULL ||
+ (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
+ check_inode_unwritten(inodedep)) {
if (xp != NULL)
- add_to_worklist(&dirrem->dm_list);
+ add_to_worklist(&dirrem->dm_list, 0);
FREE_LOCK(&lk);
- vput(vp);
- if (xp == NULL)
+ if (xp == NULL) {
+ vput(vp);
handle_workitem_remove(dirrem, NULL);
+ }
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
FREE_LOCK(&lk);
ip->i_flag |= IN_CHANGE;
+out:
ffs_update(vp, 0);
- vput(vp);
+ if (xp == NULL)
+ vput(vp);
}
/*
@@ -3689,6 +7735,7 @@ static void
handle_workitem_freefile(freefile)
struct freefile *freefile;
{
+ struct workhead wkhd;
struct fs *fs;
struct inodedep *idp;
struct ufsmount *ump;
@@ -3701,13 +7748,15 @@ handle_workitem_freefile(freefile)
error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
FREE_LOCK(&lk);
if (error)
- panic("handle_workitem_freefile: inodedep survived");
+ panic("handle_workitem_freefile: inodedep %p survived", idp);
#endif
UFS_LOCK(ump);
fs->fs_pendinginodes -= 1;
UFS_UNLOCK(ump);
+ LIST_INIT(&wkhd);
+ LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
- freefile->fx_oldinum, freefile->fx_mode)) != 0)
+ freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
softdep_error("handle_workitem_freefile", error);
ACQUIRE_LOCK(&lk);
WORKITEM_FREE(freefile, D_FREEFILE);
@@ -3757,8 +7806,10 @@ softdep_disk_io_initiation(bp)
{
struct worklist *wk;
struct worklist marker;
- struct indirdep *indirdep;
struct inodedep *inodedep;
+ struct freeblks *freeblks;
+ struct jfreeblk *jfreeblk;
+ struct newblk *newblk;
/*
* We only care about write operations. There should never
@@ -3767,6 +7818,10 @@ softdep_disk_io_initiation(bp)
if (bp->b_iocmd != BIO_WRITE)
panic("softdep_disk_io_initiation: not write");
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("softdep_disk_io_initiation: Writing buffer with "
+ "background write in progress: %p", bp);
+
marker.wk_type = D_LAST + 1; /* Not a normal workitem */
PHOLD(curproc); /* Don't swap out kernel stack */
@@ -3792,46 +7847,58 @@ softdep_disk_io_initiation(bp)
continue;
case D_INDIRDEP:
- indirdep = WK_INDIRDEP(wk);
- if (indirdep->ir_state & GOINGAWAY)
- panic("disk_io_initiation: indirdep gone");
+ initiate_write_indirdep(WK_INDIRDEP(wk), bp);
+ continue;
+
+ case D_BMSAFEMAP:
+ initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
+ continue;
+
+ case D_JSEG:
+ WK_JSEG(wk)->js_buf = NULL;
+ continue;
+
+ case D_FREEBLKS:
+ freeblks = WK_FREEBLKS(wk);
+ jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
/*
- * If there are no remaining dependencies, this
- * will be writing the real pointers, so the
- * dependency can be freed.
+ * We have to wait for the jfreeblks to be journaled
+ * before we can write an inodeblock with updated
+ * pointers. Be careful to arrange the marker so
+ * we revisit the jfreeblk if it's not removed by
+ * the first jwait().
*/
- if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
- struct buf *bp;
-
- bp = indirdep->ir_savebp;
- bp->b_flags |= B_INVAL | B_NOCACHE;
- /* inline expand WORKLIST_REMOVE(wk); */
- wk->wk_state &= ~ONWORKLIST;
- LIST_REMOVE(wk, wk_list);
- WORKITEM_FREE(indirdep, D_INDIRDEP);
- FREE_LOCK(&lk);
- brelse(bp);
- ACQUIRE_LOCK(&lk);
- continue;
+ if (jfreeblk != NULL) {
+ LIST_REMOVE(&marker, wk_list);
+ LIST_INSERT_BEFORE(wk, &marker, wk_list);
+ jwait(&jfreeblk->jf_list);
}
+ continue;
+ case D_ALLOCDIRECT:
+ case D_ALLOCINDIR:
/*
- * Replace up-to-date version with safe version.
+ * We have to wait for the jnewblk to be journaled
+ * before we can write to a block otherwise the
+ * contents may be confused with an earlier file
+ * at recovery time. Handle the marker as described
+ * above.
*/
- FREE_LOCK(&lk);
- indirdep->ir_saveddata = malloc(bp->b_bcount,
- M_INDIRDEP, M_SOFTDEP_FLAGS);
- ACQUIRE_LOCK(&lk);
- indirdep->ir_state &= ~ATTACHED;
- indirdep->ir_state |= UNDONE;
- bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
- bcopy(indirdep->ir_savebp->b_data, bp->b_data,
- bp->b_bcount);
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk != NULL) {
+ LIST_REMOVE(&marker, wk_list);
+ LIST_INSERT_BEFORE(wk, &marker, wk_list);
+ jwait(&newblk->nb_jnewblk->jn_list);
+ }
+ continue;
+
+ case D_SBDEP:
+ initiate_write_sbdep(WK_SBDEP(wk));
continue;
case D_MKDIR:
- case D_BMSAFEMAP:
- case D_ALLOCDIRECT:
- case D_ALLOCINDIR:
+ case D_FREEWORK:
+ case D_FREEDEP:
+ case D_JSEGDEP:
continue;
default:
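
The marker dance above is a general pattern for scanning a list when a wait may drop the lock and mutate the list: re-anchoring the marker before the current element guarantees it is revisited if the wait did not remove it. A minimal userspace sketch (no real locking; jwait_stub() stands in for jwait()):

    #include <sys/queue.h>
    #include <stdio.h>

    struct item {
        int type;                   /* 1: has an unwritten journal dep */
        LIST_ENTRY(item) link;
    };
    LIST_HEAD(itemhd, item);

    static void
    jwait_stub(struct item *ip)
    {
        /* Stands in for jwait(): may sleep and may unlink ip. */
        ip->type = 0;
    }

    static void
    scan(struct itemhd *head)
    {
        struct item marker;
        struct item *ip;

        marker.type = -1;           /* not a normal item */
        LIST_INSERT_HEAD(head, &marker, link);
        while ((ip = LIST_NEXT(&marker, link)) != NULL) {
            /* Advance the marker past ip before processing it. */
            LIST_REMOVE(&marker, link);
            LIST_INSERT_AFTER(ip, &marker, link);
            if (ip->type == 1) {
                /* Re-anchor before ip so it is revisited after
                 * the (lock-dropping) wait if it survives. */
                LIST_REMOVE(&marker, link);
                LIST_INSERT_BEFORE(ip, &marker, link);
                jwait_stub(ip);
            }
        }
        LIST_REMOVE(&marker, link);
    }

    int
    main(void)
    {
        struct itemhd head = LIST_HEAD_INITIALIZER(head);
        struct item a = { .type = 1 }, b = { .type = 0 };

        LIST_INSERT_HEAD(&head, &b, link);
        LIST_INSERT_HEAD(&head, &a, link);
        scan(&head);
        printf("scan complete\n");
        return (0);
    }
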
@@ -3855,6 +7922,9 @@ initiate_write_filepage(pagedep, bp)
struct pagedep *pagedep;
struct buf *bp;
{
+ struct jremref *jremref;
+ struct jmvref *jmvref;
+ struct dirrem *dirrem;
struct diradd *dap;
struct direct *ep;
int i;
@@ -3869,6 +7939,22 @@ initiate_write_filepage(pagedep, bp)
return;
}
pagedep->pd_state |= IOSTARTED;
+ /*
+ * Wait for all journal remove dependencies to hit the disk.
+	 * We cannot allow any potentially conflicting directory adds
+	 * to be visible before removes, and rollback is too difficult.
+	 * lk may be dropped and re-acquired; however, we hold the buf
+	 * locked so the dependency cannot go away.
+ */
+ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
+ while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jremref->jr_list);
+ }
+ while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
+ stat_jwait_filepage++;
+ jwait(&jmvref->jm_list);
+ }
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
ep = (struct direct *)
@@ -3905,6 +7991,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
struct allocdirect *adp, *lastadp;
struct ufs1_dinode *dp;
struct ufs1_dinode *sip;
+ struct inoref *inoref;
struct fs *fs;
ufs_lbn_t i;
#ifdef INVARIANTS
@@ -3918,6 +8005,17 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
fs = inodedep->id_fs;
dp = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
+
+ /*
+ * If we're on the unlinked list but have not yet written our
+	 * next pointer, initialize it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ dp->di_freelink = inon ? inon->id_ino : 0;
+ }
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
@@ -3933,6 +8031,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*inodedep->id_savedino1 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
dp->di_gen = inodedep->id_savedino1->di_gen;
+ dp->di_freelink = inodedep->id_savedino1->di_freelink;
return;
}
/*
@@ -3940,32 +8039,40 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = 0;
- if (TAILQ_EMPTY(&inodedep->id_inoupdt))
+ inodedep->id_savednlink = dp->di_nlink;
+ if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
+ TAILQ_EMPTY(&inodedep->id_inoreflst))
return;
/*
+ * Revert the link count to that of the first unwritten journal entry.
+ */
+ inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
+ if (inoref)
+ dp->di_nlink = inoref->if_nlink;
+ /*
* Set the dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (adp->ad_lbn < NDADDR &&
- dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (adp->ad_offset < NDADDR &&
+ dp->di_db[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %d != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- dp->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ dp->di_db[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- if (adp->ad_lbn >= NDADDR &&
- dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
+ if (adp->ad_offset >= NDADDR &&
+ dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
panic("%s: indirect pointer #%jd mismatch %d != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn - NDADDR,
- dp->di_ib[adp->ad_lbn - NDADDR],
+ (intmax_t)adp->ad_offset - NDADDR,
+ dp->di_ib[adp->ad_offset - NDADDR],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -3981,14 +8088,14 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- if (adp->ad_lbn >= NDADDR)
+ if (adp->ad_offset >= NDADDR)
break;
- dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
+ dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep1");
@@ -4012,8 +8119,8 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
@@ -4030,7 +8137,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp)
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
- dp->di_ib[adp->ad_lbn - NDADDR] = 0;
+ dp->di_ib[adp->ad_offset - NDADDR] = 0;
}
/*
@@ -4051,6 +8158,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
struct allocdirect *adp, *lastadp;
struct ufs2_dinode *dp;
struct ufs2_dinode *sip;
+ struct inoref *inoref;
struct fs *fs;
ufs_lbn_t i;
#ifdef INVARIANTS
@@ -4064,6 +8172,29 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
fs = inodedep->id_fs;
dp = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
+
+ /*
+ * If we're on the unlinked list but have not yet written our
+	 * next pointer, initialize it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ dp->di_freelink = inon ? inon->id_ino : 0;
+ }
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
+ (UNLINKED | UNLINKNEXT)) {
+ struct inodedep *inon;
+ ino_t freelink;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ freelink = inon ? inon->id_ino : 0;
+ if (freelink != dp->di_freelink)
+ panic("ino %p(0x%X) %d, %d != %d",
+ inodedep, inodedep->id_state, inodedep->id_ino,
+ freelink, dp->di_freelink);
+ }
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
@@ -4079,6 +8210,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*inodedep->id_savedino2 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
dp->di_gen = inodedep->id_savedino2->di_gen;
+ dp->di_freelink = inodedep->id_savedino2->di_freelink;
return;
}
/*
@@ -4086,25 +8218,34 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = dp->di_extsize;
+ inodedep->id_savednlink = dp->di_nlink;
if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
- TAILQ_EMPTY(&inodedep->id_extupdt))
+ TAILQ_EMPTY(&inodedep->id_extupdt) &&
+ TAILQ_EMPTY(&inodedep->id_inoreflst))
return;
/*
+ * Revert the link count to that of the first unwritten journal entry.
+ */
+ inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
+ if (inoref)
+ dp->di_nlink = inoref->if_nlink;
+
+ /*
* Set the ext data dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- (intmax_t)dp->di_extb[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ (intmax_t)dp->di_extb[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -4120,12 +8261,12 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
+ dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NXADDR; i++) {
#ifdef INVARIANTS
if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep1");
@@ -4142,8 +8283,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_extb[i] != 0)
break;
dp->di_extsize = (i + 1) * fs->fs_bsize;
@@ -4154,24 +8295,24 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
- if (deplist != 0 && prevlbn >= adp->ad_lbn)
+ if (deplist != 0 && prevlbn >= adp->ad_offset)
panic("softdep_write_inodeblock: lbn order");
- prevlbn = adp->ad_lbn;
- if (adp->ad_lbn < NDADDR &&
- dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
+ prevlbn = adp->ad_offset;
+ if (adp->ad_offset < NDADDR &&
+ dp->di_db[adp->ad_offset] != adp->ad_newblkno)
panic("%s: direct pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock",
- (intmax_t)adp->ad_lbn,
- (intmax_t)dp->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ (intmax_t)dp->di_db[adp->ad_offset],
(intmax_t)adp->ad_newblkno);
- if (adp->ad_lbn >= NDADDR &&
- dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
+ if (adp->ad_offset >= NDADDR &&
+ dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
panic("%s indirect pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock:",
- (intmax_t)adp->ad_lbn - NDADDR,
- (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
+ (intmax_t)adp->ad_offset - NDADDR,
+ (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
(intmax_t)adp->ad_newblkno);
- deplist |= 1 << adp->ad_lbn;
+ deplist |= 1 << adp->ad_offset;
if ((adp->ad_state & ATTACHED) == 0)
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
@@ -4187,14 +8328,14 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
- if (adp->ad_lbn >= NDADDR)
+ if (adp->ad_offset >= NDADDR)
break;
- dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
+ dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
- dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
- for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
+ dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
+ for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
panic("softdep_write_inodeblock: lost dep2");
@@ -4218,8 +8359,8 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
- dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
- for (i = lastadp->ad_lbn; i >= 0; i--)
+ dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
+ for (i = lastadp->ad_offset; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
@@ -4236,7 +8377,355 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
- dp->di_ib[adp->ad_lbn - NDADDR] = 0;
+ dp->di_ib[adp->ad_offset - NDADDR] = 0;
+}
+
+/*
+ * Cancel an indirdep as a result of truncation. Release all of the
+ * children allocindirs and place their journal work on the appropriate
+ * list.
+ */
+static void
+cancel_indirdep(indirdep, bp, inodedep, freeblks)
+ struct indirdep *indirdep;
+ struct buf *bp;
+ struct inodedep *inodedep;
+ struct freeblks *freeblks;
+{
+ struct allocindir *aip;
+
+ /*
+ * None of the indirect pointers will ever be visible,
+ * so they can simply be tossed. GOINGAWAY ensures
+ * that allocated pointers will be saved in the buffer
+ * cache until they are freed. Note that they will
+ * only be able to be found by their physical address
+ * since the inode mapping the logical address will
+ * be gone. The save buffer used for the safe copy
+ * was allocated in setup_allocindir_phase2 using
+ * the physical address so it could be used for this
+ * purpose. Hence we swap the safe copy with the real
+ * copy, allowing the safe copy to be freed and holding
+ * on to the real copy for later use in indir_trunc.
+ */
+ if (indirdep->ir_state & GOINGAWAY)
+ panic("cancel_indirdep: already gone");
+ if (indirdep->ir_state & ONDEPLIST) {
+ indirdep->ir_state &= ~ONDEPLIST;
+ LIST_REMOVE(indirdep, ir_next);
+ }
+ indirdep->ir_state |= GOINGAWAY;
+ VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
+ while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
+ cancel_allocindir(aip, inodedep, freeblks);
+ bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
+ WORKLIST_REMOVE(&indirdep->ir_list);
+ WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
+ indirdep->ir_savebp = NULL;
+}
+
+/*
+ * Free an indirdep once it no longer has new pointers to track.
+ */
+static void
+free_indirdep(indirdep)
+ struct indirdep *indirdep;
+{
+
+ KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
+ ("free_indirdep: Journal work not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
+ ("free_indirdep: Complete head not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
+ ("free_indirdep: write head not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
+ ("free_indirdep: done head not empty."));
+ KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
+ ("free_indirdep: deplist head not empty."));
+ KASSERT(indirdep->ir_savebp == NULL,
+ ("free_indirdep: %p ir_savebp != NULL", indirdep));
+ KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
+ ("free_indirdep: %p still on deplist.", indirdep));
+ if (indirdep->ir_state & ONWORKLIST)
+ WORKLIST_REMOVE(&indirdep->ir_list);
+ WORKITEM_FREE(indirdep, D_INDIRDEP);
+}
+
+/*
+ * Called before a write to an indirdep. This routine is responsible for
+ * rolling back pointers to a safe state which includes only those
+ * allocindirs which have been completed.
+ */
+static void
+initiate_write_indirdep(indirdep, bp)
+ struct indirdep *indirdep;
+ struct buf *bp;
+{
+
+ if (indirdep->ir_state & GOINGAWAY)
+ panic("disk_io_initiation: indirdep gone");
+
+ /*
+ * If there are no remaining dependencies, this will be writing
+ * the real pointers.
+ */
+ if (LIST_EMPTY(&indirdep->ir_deplisthd))
+ return;
+ /*
+ * Replace up-to-date version with safe version.
+ */
+ FREE_LOCK(&lk);
+ indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
+ M_SOFTDEP_FLAGS);
+ ACQUIRE_LOCK(&lk);
+ indirdep->ir_state &= ~ATTACHED;
+ indirdep->ir_state |= UNDONE;
+ bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
+ bcopy(indirdep->ir_savebp->b_data, bp->b_data,
+ bp->b_bcount);
+}
+
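
A minimal userspace sketch of the save/substitute/restore cycle above, with a hypothetical xbuf in place of struct buf: the up-to-date copy is parked aside before I/O, the safe copy goes to disk, and the completion half rolls the real contents forward and redirties the buffer.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct xbuf {
        char *data;
        size_t size;
        char *saved;                /* ir_saveddata analogue */
        int dirty;
    };

    /* Before I/O: save the up-to-date copy, substitute the safe one. */
    static void
    start_write(struct xbuf *bp, const char *safe)
    {
        bp->saved = malloc(bp->size);
        memcpy(bp->saved, bp->data, bp->size);
        memcpy(bp->data, safe, bp->size);
    }

    /* After I/O: roll forward and mark for another write. */
    static void
    write_done(struct xbuf *bp)
    {
        memcpy(bp->data, bp->saved, bp->size);
        free(bp->saved);
        bp->saved = NULL;
        bp->dirty = 1;              /* real pointers still unwritten */
    }

    int
    main(void)
    {
        char blk[8] = "newptrs";
        struct xbuf bp = { blk, sizeof(blk), NULL, 0 };

        start_write(&bp, "oldptrs"); /* disk sees only safe data */
        write_done(&bp);
        printf("%s dirty=%d\n", bp.data, bp.dirty);
        return (0);
    }
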
+/*
+ * Called when an inode has been cleared in a cg bitmap. This finally
+ * eliminates any canceled jaddrefs
+ */
+void
+softdep_setup_inofree(mp, bp, ino, wkhd)
+ struct mount *mp;
+ struct buf *bp;
+ ino_t ino;
+ struct workhead *wkhd;
+{
+ struct worklist *wk, *wkn;
+ struct inodedep *inodedep;
+ uint8_t *inosused;
+ struct cg *cgp;
+ struct fs *fs;
+
+ ACQUIRE_LOCK(&lk);
+ fs = VFSTOUFS(mp)->um_fs;
+ cgp = (struct cg *)bp->b_data;
+ inosused = cg_inosused(cgp);
+ if (isset(inosused, ino % fs->fs_ipg))
+ panic("softdep_setup_inofree: inode %d not freed.", ino);
+ if (inodedep_lookup(mp, ino, 0, &inodedep))
+ panic("softdep_setup_inofree: ino %d has existing inodedep %p",
+ ino, inodedep);
+ if (wkhd) {
+ LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
+ if (wk->wk_type != D_JADDREF)
+ continue;
+ WORKLIST_REMOVE(wk);
+ /*
+ * We can free immediately even if the jaddref
+			 * isn't attached in a background write, as the
+			 * bitmaps are now reconciled.
+ */
+ wk->wk_state |= COMPLETE | ATTACHED;
+ free_jaddref(WK_JADDREF(wk));
+ }
+ jwork_move(&bp->b_dep, wkhd);
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
+ * Called via ffs_blkfree() after a set of frags has been cleared from a cg
+ * map. Any dependencies waiting for the write to clear are added to the
+ * buf's list and any jnewblks that are being canceled are discarded
+ * immediately.
+ */
+void
+softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
+ struct mount *mp;
+ struct buf *bp;
+ ufs2_daddr_t blkno;
+ int frags;
+ struct workhead *wkhd;
+{
+ struct jnewblk *jnewblk;
+ struct worklist *wk, *wkn;
+#ifdef SUJ_DEBUG
+ struct bmsafemap *bmsafemap;
+ struct fs *fs;
+ uint8_t *blksfree;
+ struct cg *cgp;
+ ufs2_daddr_t jstart;
+ ufs2_daddr_t jend;
+ ufs2_daddr_t end;
+ long bno;
+ int i;
+#endif
+
+ ACQUIRE_LOCK(&lk);
+ /*
+ * Detach any jnewblks which have been canceled. They must linger
+ * until the bitmap is cleared again by ffs_blkfree() to prevent
+ * an unjournaled allocation from hitting the disk.
+ */
+ if (wkhd) {
+ LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
+ if (wk->wk_type != D_JNEWBLK)
+ continue;
+ jnewblk = WK_JNEWBLK(wk);
+ KASSERT(jnewblk->jn_state & GOINGAWAY,
+ ("softdep_setup_blkfree: jnewblk not canceled."));
+ WORKLIST_REMOVE(wk);
+#ifdef SUJ_DEBUG
+ /*
+ * Assert that this block is free in the bitmap
+ * before we discard the jnewblk.
+ */
+ fs = VFSTOUFS(mp)->um_fs;
+ cgp = (struct cg *)bp->b_data;
+ blksfree = cg_blksfree(cgp);
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags;
+ i < jnewblk->jn_frags; i++) {
+ if (isset(blksfree, bno + i))
+ continue;
+ panic("softdep_setup_blkfree: not free");
+ }
+#endif
+ /*
+			 * Even if it's not attached, we can free immediately
+ * as the new bitmap is correct.
+ */
+ wk->wk_state |= COMPLETE | ATTACHED;
+ free_jnewblk(jnewblk);
+ }
+ /*
+		 * The buf must be locked by the caller, otherwise these could
+ * be added while it's being written and the write would
+ * complete them before they made it to disk.
+ */
+ jwork_move(&bp->b_dep, wkhd);
+ }
+
+#ifdef SUJ_DEBUG
+ /*
+ * Assert that we are not freeing a block which has an outstanding
+ * allocation dependency.
+ */
+ fs = VFSTOUFS(mp)->um_fs;
+ bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
+ end = blkno + frags;
+ LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
+ /*
+ * Don't match against blocks that will be freed when the
+ * background write is done.
+ */
+ if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
+ (COMPLETE | DEPCOMPLETE))
+ continue;
+ jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
+ jend = jnewblk->jn_blkno + jnewblk->jn_frags;
+ if ((blkno >= jstart && blkno < jend) ||
+ (end > jstart && end <= jend)) {
+ printf("state 0x%X %jd - %d %d dep %p\n",
+ jnewblk->jn_state, jnewblk->jn_blkno,
+ jnewblk->jn_oldfrags, jnewblk->jn_frags,
+ jnewblk->jn_newblk);
+ panic("softdep_setup_blkfree: "
+ "%jd-%jd(%d) overlaps with %jd-%jd",
+ blkno, end, frags, jstart, jend);
+ }
+ }
+#endif
+ FREE_LOCK(&lk);
+}
+
+static void
+initiate_write_bmsafemap(bmsafemap, bp)
+ struct bmsafemap *bmsafemap;
+ struct buf *bp; /* The cg block. */
+{
+ struct jaddref *jaddref;
+ struct jnewblk *jnewblk;
+ uint8_t *inosused;
+ uint8_t *blksfree;
+ struct cg *cgp;
+ struct fs *fs;
+ int cleared;
+ ino_t ino;
+ long bno;
+ int i;
+
+ if (bmsafemap->sm_state & IOSTARTED)
+ panic("initiate_write_bmsafemap: Already started\n");
+ bmsafemap->sm_state |= IOSTARTED;
+ /*
+ * Clear any inode allocations which are pending journal writes.
+ */
+ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ inosused = cg_inosused(cgp);
+ LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
+ ino = jaddref->ja_ino % fs->fs_ipg;
+ /*
+			 * If this is a background copy, the inode may not
+ * be marked used yet.
+ */
+ if (isset(inosused, ino)) {
+ if ((jaddref->ja_mode & IFMT) == IFDIR)
+ cgp->cg_cs.cs_ndir--;
+ cgp->cg_cs.cs_nifree++;
+ clrbit(inosused, ino);
+ jaddref->ja_state &= ~ATTACHED;
+ jaddref->ja_state |= UNDONE;
+ stat_jaddref++;
+ } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
+ panic("initiate_write_bmsafemap: inode %d "
+ "marked free", jaddref->ja_ino);
+ }
+ }
+ /*
+ * Clear any block allocations which are pending journal writes.
+ */
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ blksfree = cg_blksfree(cgp);
+ LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ cleared = 0;
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (isclr(blksfree, bno + i)) {
+ cleared = 1;
+ setbit(blksfree, bno + i);
+ }
+ }
+ /*
+ * We may not clear the block if it's a background
+ * copy. In that case there is no reason to detach
+ * it.
+ */
+ if (cleared) {
+ stat_jnewblk++;
+ jnewblk->jn_state &= ~ATTACHED;
+ jnewblk->jn_state |= UNDONE;
+ } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
+ panic("initiate_write_bmsafemap: block %jd "
+ "marked free", jnewblk->jn_blkno);
+ }
+ }
+ /*
+ * Move allocation lists to the written lists so they can be
+ * cleared once the block write is complete.
+ */
+ LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
+ inodedep, id_deps);
+ LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
+ newblk, nb_deps);
}
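
The closing LIST_SWAP calls above carry the whole trick: the pending dependency lists are handed to "written" lists in O(1), so a dependency added while the block is in flight lands on the now-empty pending list and cannot be completed by the wrong I/O. A userspace sketch, assuming a FreeBSD-style <sys/queue.h> that provides LIST_SWAP:

    #include <sys/queue.h>
    #include <stdio.h>

    struct dep {
        int id;
        LIST_ENTRY(dep) link;
    };
    LIST_HEAD(dephd, dep);

    int
    main(void)
    {
        struct dephd pending = LIST_HEAD_INITIALIZER(pending);
        struct dephd written = LIST_HEAD_INITIALIZER(written);
        struct dep a = { .id = 1 }, b = { .id = 2 }, *dp;

        LIST_INSERT_HEAD(&pending, &a, link);
        /* Write initiation: O(1) handoff of the whole list. */
        LIST_SWAP(&pending, &written, dep, link);
        /* A dependency arriving mid-write stays pending. */
        LIST_INSERT_HEAD(&pending, &b, link);
        /* Write completion: only handed-off entries complete. */
        LIST_FOREACH(dp, &written, link)
            printf("dep %d is now DEPCOMPLETE\n", dp->id);
        return (0);
    }
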
/*
@@ -4246,6 +8735,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
* a request completion). It should be called early in this
* procedure, before the block is made available to other
* processes or other routines are called.
+ *
*/
static void
softdep_disk_write_complete(bp)
@@ -4254,12 +8744,7 @@ softdep_disk_write_complete(bp)
struct worklist *wk;
struct worklist *owk;
struct workhead reattach;
- struct newblk *newblk;
- struct allocindir *aip;
- struct allocdirect *adp;
- struct indirdep *indirdep;
- struct inodedep *inodedep;
- struct bmsafemap *bmsafemap;
+ struct buf *sbp;
/*
* If an error occurred while doing the write, then the data
@@ -4271,8 +8756,9 @@ softdep_disk_write_complete(bp)
/*
* This lock must not be released anywhere in this code segment.
*/
- ACQUIRE_LOCK(&lk);
+ sbp = NULL;
owk = NULL;
+ ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
WORKLIST_REMOVE(wk);
if (wk == owk)
@@ -4291,33 +8777,8 @@ softdep_disk_write_complete(bp)
continue;
case D_BMSAFEMAP:
- bmsafemap = WK_BMSAFEMAP(wk);
- while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
- newblk->nb_state |= DEPCOMPLETE;
- newblk->nb_bmsafemap = NULL;
- LIST_REMOVE(newblk, nb_deps);
- }
- while ((adp =
- LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
- adp->ad_state |= DEPCOMPLETE;
- adp->ad_buf = NULL;
- LIST_REMOVE(adp, ad_deps);
- handle_allocdirect_partdone(adp);
- }
- while ((aip =
- LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
- aip->ai_state |= DEPCOMPLETE;
- aip->ai_buf = NULL;
- LIST_REMOVE(aip, ai_deps);
- handle_allocindir_partdone(aip);
- }
- while ((inodedep =
- LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
- inodedep->id_state |= DEPCOMPLETE;
- LIST_REMOVE(inodedep, id_deps);
- inodedep->id_buf = NULL;
- }
- WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
+ WORKLIST_INSERT(&reattach, wk);
continue;
case D_MKDIR:
@@ -4325,35 +8786,45 @@ softdep_disk_write_complete(bp)
continue;
case D_ALLOCDIRECT:
- adp = WK_ALLOCDIRECT(wk);
- adp->ad_state |= COMPLETE;
- handle_allocdirect_partdone(adp);
+ wk->wk_state |= COMPLETE;
+ handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
continue;
case D_ALLOCINDIR:
- aip = WK_ALLOCINDIR(wk);
- aip->ai_state |= COMPLETE;
- handle_allocindir_partdone(aip);
+ wk->wk_state |= COMPLETE;
+ handle_allocindir_partdone(WK_ALLOCINDIR(wk));
continue;
case D_INDIRDEP:
- indirdep = WK_INDIRDEP(wk);
- if (indirdep->ir_state & GOINGAWAY)
- panic("disk_write_complete: indirdep gone");
- bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
- free(indirdep->ir_saveddata, M_INDIRDEP);
- indirdep->ir_saveddata = 0;
- indirdep->ir_state &= ~UNDONE;
- indirdep->ir_state |= ATTACHED;
- while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
- handle_allocindir_partdone(aip);
- if (aip == LIST_FIRST(&indirdep->ir_donehd))
- panic("disk_write_complete: not gone");
- }
- WORKLIST_INSERT(&reattach, wk);
- if ((bp->b_flags & B_DELWRI) == 0)
- stat_indir_blk_ptrs++;
- bdirty(bp);
+ if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
+ WORKLIST_INSERT(&reattach, wk);
+ continue;
+
+ case D_FREEBLKS:
+ wk->wk_state |= COMPLETE;
+ if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(wk, 1);
+ continue;
+
+ case D_FREEWORK:
+ handle_written_freework(WK_FREEWORK(wk));
+ break;
+
+ case D_FREEDEP:
+ free_freedep(WK_FREEDEP(wk));
+ continue;
+
+ case D_JSEGDEP:
+ free_jsegdep(WK_JSEGDEP(wk));
+ continue;
+
+ case D_JSEG:
+ handle_written_jseg(WK_JSEG(wk), bp);
+ continue;
+
+ case D_SBDEP:
+ if (handle_written_sbdep(WK_SBDEP(wk), bp))
+ WORKLIST_INSERT(&reattach, wk);
continue;
default:
@@ -4370,6 +8841,8 @@ softdep_disk_write_complete(bp)
WORKLIST_INSERT(&bp->b_dep, wk);
}
FREE_LOCK(&lk);
+ if (sbp)
+ brelse(sbp);
}
/*
@@ -4378,18 +8851,17 @@ softdep_disk_write_complete(bp)
* splbio interrupts blocked.
*/
static void
-handle_allocdirect_partdone(adp)
+handle_allocdirect_partdone(adp, wkhd)
struct allocdirect *adp; /* the completed allocdirect */
+	struct workhead *wkhd;		/* Work to do when the inode is written. */
{
struct allocdirectlst *listhead;
struct allocdirect *listadp;
struct inodedep *inodedep;
- long bsize, delay;
+ long bsize;
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- if (adp->ad_buf != NULL)
- panic("handle_allocdirect_partdone: dangling dep");
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
@@ -4439,25 +8911,27 @@ handle_allocdirect_partdone(adp)
return;
}
/*
- * If we have found the just finished dependency, then free
+ * If we have found the just finished dependency, then queue
* it along with anything that follows it that is complete.
- * If the inode still has a bitmap dependency, then it has
- * never been written to disk, hence the on-disk inode cannot
- * reference the old fragment so we can free it without delay.
+ * Since the pointer has not yet been written in the inode
+ * as the dependency prevents it, place the allocdirect on the
+ * bufwait list where it will be freed once the pointer is
+ * valid.
*/
- delay = (inodedep->id_state & DEPCOMPLETE);
+ if (wkhd == NULL)
+ wkhd = &inodedep->id_bufwait;
for (; adp; adp = listadp) {
listadp = TAILQ_NEXT(adp, ad_next);
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- free_allocdirect(listhead, adp, delay);
+ TAILQ_REMOVE(listhead, adp, ad_next);
+ WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
}
}
/*
- * Called from within softdep_disk_write_complete above. Note that
- * this routine is always called from interrupt level with further
- * splbio interrupts blocked.
+ * Called from within softdep_disk_write_complete above. This routine
+ * completes successfully written allocindirs.
*/
static void
handle_allocindir_partdone(aip)
@@ -4467,11 +8941,9 @@ handle_allocindir_partdone(aip)
if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
- if (aip->ai_buf != NULL)
- panic("handle_allocindir_partdone: dangling dependency");
indirdep = aip->ai_indirdep;
+ LIST_REMOVE(aip, ai_next);
if (indirdep->ir_state & UNDONE) {
- LIST_REMOVE(aip, ai_next);
LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
return;
}
@@ -4481,13 +8953,130 @@ handle_allocindir_partdone(aip)
else
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
aip->ai_newblkno;
- LIST_REMOVE(aip, ai_next);
- if (aip->ai_freefrag != NULL)
- add_to_worklist(&aip->ai_freefrag->ff_list);
- WORKITEM_FREE(aip, D_ALLOCINDIR);
+ /*
+ * Await the pointer write before freeing the allocindir.
+ */
+ LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
}
/*
+ * Release segments held on a jwork list.
+ */
+static void
+handle_jwork(wkhd)
+ struct workhead *wkhd;
+{
+ struct worklist *wk;
+
+ while ((wk = LIST_FIRST(wkhd)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ switch (wk->wk_type) {
+ case D_JSEGDEP:
+ free_jsegdep(WK_JSEGDEP(wk));
+ continue;
+ default:
+ panic("handle_jwork: Unknown type %s\n",
+ TYPENAME(wk->wk_type));
+ }
+ }
+}
+
+/*
+ * Handle the bufwait list on an inode when it is safe to release items
+ * held there. This normally happens after an inode block is written but
+ * may be delayed and handled later if there are pending journal items that
+ * are not yet safe to be released.
+ */
+static struct freefile *
+handle_bufwait(inodedep, refhd)
+ struct inodedep *inodedep;
+ struct workhead *refhd;
+{
+ struct jaddref *jaddref;
+ struct freefile *freefile;
+ struct worklist *wk;
+
+ freefile = NULL;
+ while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ switch (wk->wk_type) {
+ case D_FREEFILE:
+ /*
+ * We defer adding freefile to the worklist
+ * until all other additions have been made to
+ * ensure that it will be done after all the
+ * old blocks have been freed.
+ */
+ if (freefile != NULL)
+ panic("handle_bufwait: freefile");
+ freefile = WK_FREEFILE(wk);
+ continue;
+
+ case D_MKDIR:
+ handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
+ continue;
+
+ case D_DIRADD:
+ diradd_inode_written(WK_DIRADD(wk), inodedep);
+ continue;
+
+ case D_FREEFRAG:
+ wk->wk_state |= COMPLETE;
+ if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
+ add_to_worklist(wk, 0);
+ continue;
+
+ case D_DIRREM:
+ wk->wk_state |= COMPLETE;
+ add_to_worklist(wk, 0);
+ continue;
+
+ case D_ALLOCDIRECT:
+ case D_ALLOCINDIR:
+ free_newblk(WK_NEWBLK(wk));
+ continue;
+
+ case D_JNEWBLK:
+ wk->wk_state |= COMPLETE;
+ free_jnewblk(WK_JNEWBLK(wk));
+ continue;
+
+ /*
+ * Save freed journal segments and add references on
+		 * the supplied list, which will delay their release
+ * until the cg bitmap is cleared on disk.
+ */
+ case D_JSEGDEP:
+ if (refhd == NULL)
+ free_jsegdep(WK_JSEGDEP(wk));
+ else
+ WORKLIST_INSERT(refhd, wk);
+ continue;
+
+ case D_JADDREF:
+ jaddref = WK_JADDREF(wk);
+ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
+ if_deps);
+ /*
+ * Transfer any jaddrefs to the list to be freed with
+ * the bitmap if we're handling a removed file.
+ */
+ if (refhd == NULL) {
+ wk->wk_state |= COMPLETE;
+ free_jaddref(jaddref);
+ } else
+ WORKLIST_INSERT(refhd, wk);
+ continue;
+
+ default:
+ panic("handle_bufwait: Unknown type %p(%s)",
+ wk, TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ }
+ return (freefile);
+}
+/*
* Called from within softdep_disk_write_complete above to restore
* in-memory inode block contents to their most up-to-date state. Note
* that this routine is always called from interrupt level with further
@@ -4498,12 +9087,17 @@ handle_written_inodeblock(inodedep, bp)
struct inodedep *inodedep;
struct buf *bp; /* buffer containing the inode block */
{
- struct worklist *wk, *filefree;
+ struct freefile *freefile;
struct allocdirect *adp, *nextadp;
struct ufs1_dinode *dp1 = NULL;
struct ufs2_dinode *dp2 = NULL;
+ struct workhead wkhd;
int hadchanges, fstype;
+ ino_t freelink;
+ LIST_INIT(&wkhd);
+ hadchanges = 0;
+ freefile = NULL;
if ((inodedep->id_state & IOSTARTED) == 0)
panic("handle_written_inodeblock: not started");
inodedep->id_state &= ~IOSTARTED;
@@ -4511,11 +9105,32 @@ handle_written_inodeblock(inodedep, bp)
fstype = UFS1;
dp1 = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
+ freelink = dp1->di_freelink;
} else {
fstype = UFS2;
dp2 = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
+ freelink = dp2->di_freelink;
+ }
+ /*
+	 * If we wrote a valid freelink pointer during the last write,
+ * record it here.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ struct inodedep *inon;
+
+ inon = TAILQ_NEXT(inodedep, id_unlinked);
+ if ((inon == NULL && freelink == 0) ||
+ (inon && inon->id_ino == freelink)) {
+ if (inon)
+ inon->id_state |= UNLINKPREV;
+ inodedep->id_state |= UNLINKNEXT;
+ } else
+ hadchanges = 1;
}
+ /* Leave this inodeblock dirty until it's in the list. */
+ if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED)
+ hadchanges = 1;
/*
* If we had to rollback the inode allocation because of
* bitmaps being incomplete, then simply restore it.
@@ -4524,6 +9139,7 @@ handle_written_inodeblock(inodedep, bp)
* corresponding updates written to disk.
*/
if (inodedep->id_savedino1 != NULL) {
+ hadchanges = 1;
if (fstype == UFS1)
*dp1 = *inodedep->id_savedino1;
else
@@ -4533,6 +9149,13 @@ handle_written_inodeblock(inodedep, bp)
if ((bp->b_flags & B_DELWRI) == 0)
stat_inode_bitmap++;
bdirty(bp);
+ /*
+		 * If the inode is clear here and GOINGAWAY, it will never
+ * be written. Process the bufwait and clear any pending
+ * work which may include the freefile.
+ */
+ if (inodedep->id_state & GOINGAWAY)
+ goto bufwait;
return (1);
}
inodedep->id_state |= COMPLETE;
@@ -4540,50 +9163,49 @@ handle_written_inodeblock(inodedep, bp)
* Roll forward anything that had to be rolled back before
* the inode could be updated.
*/
- hadchanges = 0;
for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED)
panic("handle_written_inodeblock: new entry");
if (fstype == UFS1) {
- if (adp->ad_lbn < NDADDR) {
- if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
+ if (adp->ad_offset < NDADDR) {
+ if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
panic("%s %s #%jd mismatch %d != %jd",
"handle_written_inodeblock:",
"direct pointer",
- (intmax_t)adp->ad_lbn,
- dp1->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset,
+ dp1->di_db[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
+ dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
} else {
- if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
+ if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
panic("%s: %s #%jd allocated as %d",
"handle_written_inodeblock",
"indirect pointer",
- (intmax_t)adp->ad_lbn - NDADDR,
- dp1->di_ib[adp->ad_lbn - NDADDR]);
- dp1->di_ib[adp->ad_lbn - NDADDR] =
+ (intmax_t)adp->ad_offset - NDADDR,
+ dp1->di_ib[adp->ad_offset - NDADDR]);
+ dp1->di_ib[adp->ad_offset - NDADDR] =
adp->ad_newblkno;
}
} else {
- if (adp->ad_lbn < NDADDR) {
- if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
+ if (adp->ad_offset < NDADDR) {
+ if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
panic("%s: %s #%jd %s %jd != %jd",
"handle_written_inodeblock",
"direct pointer",
- (intmax_t)adp->ad_lbn, "mismatch",
- (intmax_t)dp2->di_db[adp->ad_lbn],
+ (intmax_t)adp->ad_offset, "mismatch",
+ (intmax_t)dp2->di_db[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
+ dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
} else {
- if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
+ if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
panic("%s: %s #%jd allocated as %jd",
"handle_written_inodeblock",
"indirect pointer",
- (intmax_t)adp->ad_lbn - NDADDR,
+ (intmax_t)adp->ad_offset - NDADDR,
(intmax_t)
- dp2->di_ib[adp->ad_lbn - NDADDR]);
- dp2->di_ib[adp->ad_lbn - NDADDR] =
+ dp2->di_ib[adp->ad_offset - NDADDR]);
+ dp2->di_ib[adp->ad_offset - NDADDR] =
adp->ad_newblkno;
}
}
@@ -4595,13 +9217,13 @@ handle_written_inodeblock(inodedep, bp)
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED)
panic("handle_written_inodeblock: new entry");
- if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
+ if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
panic("%s: direct pointers #%jd %s %jd != %jd",
"handle_written_inodeblock",
- (intmax_t)adp->ad_lbn, "mismatch",
- (intmax_t)dp2->di_extb[adp->ad_lbn],
+ (intmax_t)adp->ad_offset, "mismatch",
+ (intmax_t)dp2->di_extb[adp->ad_offset],
(intmax_t)adp->ad_oldblkno);
- dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
+ dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
adp->ad_state &= ~UNDONE;
adp->ad_state |= ATTACHED;
hadchanges = 1;
@@ -4613,12 +9235,23 @@ handle_written_inodeblock(inodedep, bp)
*/
if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
panic("handle_written_inodeblock: bad size");
+ if (inodedep->id_savednlink > LINK_MAX)
+ panic("handle_written_inodeblock: Invalid link count "
+ "%d for inodedep %p", inodedep->id_savednlink, inodedep);
if (fstype == UFS1) {
+ if (dp1->di_nlink != inodedep->id_savednlink) {
+ dp1->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
if (dp1->di_size != inodedep->id_savedsize) {
dp1->di_size = inodedep->id_savedsize;
hadchanges = 1;
}
} else {
+ if (dp2->di_nlink != inodedep->id_savednlink) {
+ dp2->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
if (dp2->di_size != inodedep->id_savedsize) {
dp2->di_size = inodedep->id_savedsize;
hadchanges = 1;
@@ -4630,6 +9263,7 @@ handle_written_inodeblock(inodedep, bp)
}
inodedep->id_savedsize = -1;
inodedep->id_savedextsize = -1;
+ inodedep->id_savednlink = -1;
/*
* If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
@@ -4637,69 +9271,49 @@ handle_written_inodeblock(inodedep, bp)
*/
if (hadchanges)
bdirty(bp);
+bufwait:
/*
* Process any allocdirects that completed during the update.
*/
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
- handle_allocdirect_partdone(adp);
+ handle_allocdirect_partdone(adp, &wkhd);
if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
- handle_allocdirect_partdone(adp);
+ handle_allocdirect_partdone(adp, &wkhd);
/*
* Process deallocations that were held pending until the
* inode had been written to disk. Freeing of the inode
* is delayed until after all blocks have been freed to
* avoid creation of new <vfsid, inum, lbn> triples
- * before the old ones have been deleted.
+ * before the old ones have been deleted. Completely
+ * unlinked inodes are not processed until the unlinked
+ * inode list is written or the last reference is removed.
*/
- filefree = NULL;
- while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
- WORKLIST_REMOVE(wk);
- switch (wk->wk_type) {
-
- case D_FREEFILE:
- /*
- * We defer adding filefree to the worklist until
- * all other additions have been made to ensure
- * that it will be done after all the old blocks
- * have been freed.
- */
- if (filefree != NULL)
- panic("handle_written_inodeblock: filefree");
- filefree = wk;
- continue;
-
- case D_MKDIR:
- handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
- continue;
-
- case D_DIRADD:
- diradd_inode_written(WK_DIRADD(wk), inodedep);
- continue;
-
- case D_FREEBLKS:
- wk->wk_state |= COMPLETE;
- if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
- continue;
- /* -- fall through -- */
- case D_FREEFRAG:
- case D_DIRREM:
- add_to_worklist(wk);
- continue;
-
- case D_NEWDIRBLK:
- free_newdirblk(WK_NEWDIRBLK(wk));
- continue;
-
- default:
- panic("handle_written_inodeblock: Unknown type %s",
- TYPENAME(wk->wk_type));
- /* NOTREACHED */
+ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
+ freefile = handle_bufwait(inodedep, NULL);
+ if (freefile && !LIST_EMPTY(&wkhd)) {
+ WORKLIST_INSERT(&wkhd, &freefile->fx_list);
+ freefile = NULL;
}
}
- if (filefree != NULL) {
+ /*
+ * Move rolled forward dependency completions to the bufwait list
+ * now that those that were already written have been processed.
+ */
+ if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
+ panic("handle_written_inodeblock: bufwait but no changes");
+ jwork_move(&inodedep->id_bufwait, &wkhd);
+
+ if (freefile != NULL) {
+ /*
+ * If the inode is goingaway it was never written. Fake up
+ * the state here so free_inodedep() can succeed.
+ */
+ if (inodedep->id_state & GOINGAWAY)
+ inodedep->id_state |= COMPLETE | DEPCOMPLETE;
if (free_inodedep(inodedep) == 0)
- panic("handle_written_inodeblock: live inodedep");
- add_to_worklist(filefree);
+ panic("handle_written_inodeblock: live inodedep %p",
+ inodedep);
+ add_to_worklist(&freefile->fx_list, 0);
return (0);
}
@@ -4707,12 +9321,101 @@ handle_written_inodeblock(inodedep, bp)
* If no outstanding dependencies, free it.
*/
if (free_inodedep(inodedep) ||
- (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
- TAILQ_FIRST(&inodedep->id_extupdt) == 0))
+ (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
+ TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
+ TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
+ LIST_FIRST(&inodedep->id_bufwait) == 0))
return (0);
return (hadchanges);
}
+static int
+handle_written_indirdep(indirdep, bp, bpp)
+ struct indirdep *indirdep;
+ struct buf *bp;
+ struct buf **bpp;
+{
+ struct allocindir *aip;
+ int chgs;
+
+ if (indirdep->ir_state & GOINGAWAY)
+		panic("handle_written_indirdep: indirdep gone");
+ chgs = 0;
+ /*
+ * If there were rollbacks revert them here.
+ */
+ if (indirdep->ir_saveddata) {
+ bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
+ free(indirdep->ir_saveddata, M_INDIRDEP);
+ indirdep->ir_saveddata = 0;
+ chgs = 1;
+ }
+ indirdep->ir_state &= ~UNDONE;
+ indirdep->ir_state |= ATTACHED;
+ /*
+	 * the indirdep's pointer is not yet written. Otherwise
+ * the the indirdep's pointer is not yet written. Otherwise
+ * free them here.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
+ LIST_REMOVE(aip, ai_next);
+ if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
+ LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
+ ai_next);
+ continue;
+ }
+ free_newblk(&aip->ai_block);
+ }
+ /*
+ * Move allocindirs that have finished dependency processing from
+ * the done list to the write list after updating the pointers.
+ */
+ while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
+ handle_allocindir_partdone(aip);
+ if (aip == LIST_FIRST(&indirdep->ir_donehd))
+			panic("handle_written_indirdep: not gone");
+ chgs = 1;
+ }
+ /*
+ * If this indirdep has been detached from its newblk during
+ * I/O we need to keep this dep attached to the buffer so
+ * deallocate_dependencies can find it and properly resolve
+ * any outstanding dependencies.
+ */
+ if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
+ chgs = 1;
+ if ((bp->b_flags & B_DELWRI) == 0)
+ stat_indir_blk_ptrs++;
+ /*
+ * If there were no changes we can discard the savedbp and detach
+ * ourselves from the buf. We are only carrying completed pointers
+ * in this case.
+ */
+ if (chgs == 0) {
+ struct buf *sbp;
+
+ sbp = indirdep->ir_savebp;
+ sbp->b_flags |= B_INVAL | B_NOCACHE;
+ indirdep->ir_savebp = NULL;
+ if (*bpp != NULL)
+ panic("handle_written_indirdep: bp already exists.");
+ *bpp = sbp;
+ } else
+ bdirty(bp);
+ /*
+ * If there are no fresh dependencies and none waiting on writes
+ * we can free the indirdep.
+ */
+ if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
+ if (indirdep->ir_state & ONDEPLIST)
+ LIST_REMOVE(indirdep, ir_next);
+ free_indirdep(indirdep);
+ return (0);
+ }
+
+ return (chgs);
+}
+
/*
* Process a diradd entry after its dependent inode has been written.
* This routine must be called with splbio interrupts blocked.
@@ -4722,50 +9425,200 @@ diradd_inode_written(dap, inodedep)
struct diradd *dap;
struct inodedep *inodedep;
{
- struct pagedep *pagedep;
dap->da_state |= COMPLETE;
- if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
- if (dap->da_state & DIRCHG)
- pagedep = dap->da_previous->dm_pagedep;
- else
- pagedep = dap->da_pagedep;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
- }
+ complete_diradd(dap);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
- * Handle the completion of a mkdir dependency.
+ * Returns true if the bmsafemap will have rollbacks when written. Must
+ * only be called with lk and the buf lock on the cg held.
+ */
+static int
+bmsafemap_rollbacks(bmsafemap)
+ struct bmsafemap *bmsafemap;
+{
+
+ return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
+ !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
+}
+
+/*
+ * Complete a write to a bmsafemap structure. Roll forward any bitmap
+ * changes if it's not a background write. Set all written dependencies
+ * to DEPCOMPLETE and free the structure if possible.
+ */
+static int
+handle_written_bmsafemap(bmsafemap, bp)
+ struct bmsafemap *bmsafemap;
+ struct buf *bp;
+{
+ struct newblk *newblk;
+ struct inodedep *inodedep;
+ struct jaddref *jaddref, *jatmp;
+ struct jnewblk *jnewblk, *jntmp;
+ uint8_t *inosused;
+ uint8_t *blksfree;
+ struct cg *cgp;
+ struct fs *fs;
+ ino_t ino;
+ long bno;
+ int chgs;
+ int i;
+
+ if ((bmsafemap->sm_state & IOSTARTED) == 0)
+		panic("handle_written_bmsafemap: Not started");
+ chgs = 0;
+ bmsafemap->sm_state &= ~IOSTARTED;
+ /*
+ * Restore unwritten inode allocation pending jaddref writes.
+ */
+ if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ inosused = cg_inosused(cgp);
+ LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
+ ja_bmdeps, jatmp) {
+ if ((jaddref->ja_state & UNDONE) == 0)
+ continue;
+ ino = jaddref->ja_ino % fs->fs_ipg;
+ if (isset(inosused, ino))
+ panic("handle_written_bmsafemap: "
+ "re-allocated inode");
+ if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
+ if ((jaddref->ja_mode & IFMT) == IFDIR)
+ cgp->cg_cs.cs_ndir++;
+ cgp->cg_cs.cs_nifree--;
+ setbit(inosused, ino);
+ chgs = 1;
+ }
+ jaddref->ja_state &= ~UNDONE;
+ jaddref->ja_state |= ATTACHED;
+ free_jaddref(jaddref);
+ }
+ }
+ /*
+ * Restore any block allocations which are pending journal writes.
+ */
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
+ cgp = (struct cg *)bp->b_data;
+ fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
+ blksfree = cg_blksfree(cgp);
+ LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
+ jntmp) {
+ if ((jnewblk->jn_state & UNDONE) == 0)
+ continue;
+ bno = dtogd(fs, jnewblk->jn_blkno);
+ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
+ i++) {
+ if (bp->b_xflags & BX_BKGRDMARKER)
+ break;
+ if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
+ isclr(blksfree, bno + i))
+ panic("handle_written_bmsafemap: "
+ "re-allocated fragment");
+ clrbit(blksfree, bno + i);
+ chgs = 1;
+ }
+ jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
+ jnewblk->jn_state |= ATTACHED;
+ free_jnewblk(jnewblk);
+ }
+ }
+ while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
+ newblk->nb_state |= DEPCOMPLETE;
+ newblk->nb_state &= ~ONDEPLIST;
+ newblk->nb_bmsafemap = NULL;
+ LIST_REMOVE(newblk, nb_deps);
+ if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
+ handle_allocdirect_partdone(
+ WK_ALLOCDIRECT(&newblk->nb_list), NULL);
+ else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
+ handle_allocindir_partdone(
+ WK_ALLOCINDIR(&newblk->nb_list));
+ else if (newblk->nb_list.wk_type != D_NEWBLK)
+ panic("handle_written_bmsafemap: Unexpected type: %s",
+ TYPENAME(newblk->nb_list.wk_type));
+ }
+ while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
+ inodedep->id_state |= DEPCOMPLETE;
+ inodedep->id_state &= ~ONDEPLIST;
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_bmsafemap = NULL;
+ }
+ if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
+ LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
+ LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
+ LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
+ if (chgs)
+ bdirty(bp);
+ LIST_REMOVE(bmsafemap, sm_hash);
+ WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
+ return (0);
+ }
+ bdirty(bp);
+ return (1);
+}
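
/*
 * [Editorial sketch, not part of this change] A timeline inferred from
 * the UNDONE handling above: if the cylinder-group buffer is written
 * while the journal records for new allocations are still unwritten,
 * initiate_write_bmsafemap() backs those allocations out of the
 * on-disk copy and marks the jaddref/jnewblk UNDONE.  When the write
 * completes, the loops above roll each one forward again, e.g. for an
 * inode allocation:
 *
 *	setbit(inosused, ino);		restore the bitmap bit
 *	cgp->cg_cs.cs_nifree--;		restore the summary count
 *
 * and chgs is set so the buffer is redirtied and the true map
 * eventually reaches disk.
 */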
+
+/*
+ * Try to free a mkdir dependency.
*/
static void
-handle_written_mkdir(mkdir, type)
+complete_mkdir(mkdir)
struct mkdir *mkdir;
- int type;
{
struct diradd *dap;
- struct pagedep *pagedep;
- if (mkdir->md_state != type)
- panic("handle_written_mkdir: bad type");
+ if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
+ return;
+ LIST_REMOVE(mkdir, md_mkdirs);
dap = mkdir->md_diradd;
- dap->da_state &= ~type;
- if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
+ dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
+ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
dap->da_state |= DEPCOMPLETE;
- if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
- if (dap->da_state & DIRCHG)
- pagedep = dap->da_previous->dm_pagedep;
- else
- pagedep = dap->da_pagedep;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
+ complete_diradd(dap);
}
- LIST_REMOVE(mkdir, md_mkdirs);
WORKITEM_FREE(mkdir, D_MKDIR);
}
/*
+ * Handle the completion of a mkdir dependency.
+ */
+static void
+handle_written_mkdir(mkdir, type)
+ struct mkdir *mkdir;
+ int type;
+{
+
+ if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
+ panic("handle_written_mkdir: bad type");
+ mkdir->md_state |= COMPLETE;
+ complete_mkdir(mkdir);
+}
+
+static void
+free_pagedep(pagedep)
+ struct pagedep *pagedep;
+{
+ int i;
+
+ if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
+ return;
+ for (i = 0; i < DAHASHSZ; i++)
+ if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_dirremhd))
+ return;
+ if (!LIST_EMPTY(&pagedep->pd_pendinghd))
+ return;
+ LIST_REMOVE(pagedep, pd_hash);
+ WORKITEM_FREE(pagedep, D_PAGEDEP);
+}
+
+/*
* Called from within softdep_disk_write_complete above.
* A write operation was just completed. Removed inodes can
* now be freed and associated block pointers may be committed.
@@ -4790,8 +9643,11 @@ handle_written_filepage(pagedep, bp)
*/
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
LIST_REMOVE(dirrem, dm_next);
+ dirrem->dm_state |= COMPLETE;
dirrem->dm_dirinum = pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
+ ("handle_written_filepage: Journal entries not written."));
+ add_to_worklist(&dirrem->dm_list, 0);
}
/*
* Free any directory additions that have been committed.
@@ -4800,7 +9656,7 @@ handle_written_filepage(pagedep, bp)
*/
if ((pagedep->pd_state & NEWBLOCK) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
- free_diradd(dap);
+ free_diradd(dap, NULL);
/*
* Uncommitted directory entries must be restored.
*/
@@ -4845,7 +9701,8 @@ handle_written_filepage(pagedep, bp)
* Otherwise it will remain to track any new entries on
* the page in case they are fsync'ed.
*/
- if ((pagedep->pd_state & NEWBLOCK) == 0) {
+ if ((pagedep->pd_state & NEWBLOCK) == 0 &&
+ LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
@@ -4880,8 +9737,8 @@ softdep_load_inodeblock(ip)
*/
ip->i_effnlink = ip->i_nlink;
ACQUIRE_LOCK(&lk);
- if (inodedep_lookup(UFSTOVFS(ip->i_ump),
- ip->i_number, 0, &inodedep) == 0) {
+ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
+ &inodedep) == 0) {
FREE_LOCK(&lk);
return;
}
@@ -4908,11 +9765,26 @@ softdep_update_inodeblock(ip, bp, waitfor)
int waitfor; /* nonzero => update must be allowed */
{
struct inodedep *inodedep;
+ struct inoref *inoref;
struct worklist *wk;
struct mount *mp;
struct buf *ibp;
+ struct fs *fs;
int error;
+ mp = UFSTOVFS(ip->i_ump);
+ fs = ip->i_fs;
+ /*
+ * Preserve the freelink that is on disk. clear_unlinked_inodedep()
+ * does not have access to the in-core ip so must write directly into
+ * the inode block buffer when setting freelink.
+ */
+ if (fs->fs_magic == FS_UFS1_MAGIC)
+ DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number))->di_freelink);
+ else
+ DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
+ ino_to_fsbo(fs, ip->i_number))->di_freelink);
/*
* If the effective link count is not equal to the actual link
* count, then we must track the difference in an inodedep while
@@ -4920,8 +9792,8 @@ softdep_update_inodeblock(ip, bp, waitfor)
* if there is no existing inodedep, then there are no dependencies
* to track.
*/
- mp = UFSTOVFS(ip->i_ump);
ACQUIRE_LOCK(&lk);
+again:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
if (ip->i_effnlink != ip->i_nlink)
@@ -4931,6 +9803,20 @@ softdep_update_inodeblock(ip, bp, waitfor)
if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
/*
+ * If we're flushing all dependencies we must also move any waiting
+ * for journal writes onto the bufwait list prior to I/O.
+ */
+ if (waitfor) {
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto again;
+ }
+ }
+ }
+ /*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
*/
@@ -4945,10 +9831,12 @@ softdep_update_inodeblock(ip, bp, waitfor)
*/
merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
- handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
+ handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
+ NULL);
merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
if (!TAILQ_EMPTY(&inodedep->id_extupdt))
- handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
+ handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
+ NULL);
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
@@ -4971,11 +9859,11 @@ softdep_update_inodeblock(ip, bp, waitfor)
return;
}
retry:
- if ((inodedep->id_state & DEPCOMPLETE) != 0) {
+ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
FREE_LOCK(&lk);
return;
}
- ibp = inodedep->id_buf;
+ ibp = inodedep->id_bmsafemap->sm_buf;
ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
if (ibp == NULL) {
/*
@@ -5007,13 +9895,13 @@ merge_inode_lists(newlisthead, oldlisthead)
newadp = TAILQ_FIRST(newlisthead);
for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
- if (listadp->ad_lbn < newadp->ad_lbn) {
+ if (listadp->ad_offset < newadp->ad_offset) {
listadp = TAILQ_NEXT(listadp, ad_next);
continue;
}
TAILQ_REMOVE(newlisthead, newadp, ad_next);
TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
- if (listadp->ad_lbn == newadp->ad_lbn) {
+ if (listadp->ad_offset == newadp->ad_offset) {
allocdirect_merge(oldlisthead, newadp,
listadp);
listadp = newadp;
@@ -5036,6 +9924,7 @@ softdep_fsync(vp)
{
struct inodedep *inodedep;
struct pagedep *pagedep;
+ struct inoref *inoref;
struct worklist *wk;
struct diradd *dap;
struct mount *mp;
@@ -5052,17 +9941,25 @@ softdep_fsync(vp)
fs = ip->i_fs;
mp = vp->v_mount;
ACQUIRE_LOCK(&lk);
+restart:
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
return (0);
}
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (!LIST_EMPTY(&inodedep->id_inowait) ||
- !LIST_EMPTY(&inodedep->id_bufwait) ||
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
!TAILQ_EMPTY(&inodedep->id_newinoupdt))
- panic("softdep_fsync: pending ops");
+ panic("softdep_fsync: pending ops %p", inodedep);
for (error = 0, flushparent = 0; ; ) {
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
break;
@@ -5254,8 +10151,8 @@ int
softdep_sync_metadata(struct vnode *vp)
{
struct pagedep *pagedep;
- struct allocdirect *adp;
struct allocindir *aip;
+ struct newblk *newblk;
struct buf *bp, *nbp;
struct worklist *wk;
struct bufobj *bo;
@@ -5319,27 +10216,16 @@ loop:
switch (wk->wk_type) {
case D_ALLOCDIRECT:
- adp = WK_ALLOCDIRECT(wk);
- if (adp->ad_state & DEPCOMPLETE)
- continue;
- nbp = adp->ad_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
case D_ALLOCINDIR:
- aip = WK_ALLOCINDIR(wk);
- if (aip->ai_state & DEPCOMPLETE)
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ goto restart;
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- nbp = aip->ai_buf;
+ nbp = newblk->nb_bmsafemap->sm_buf;
nbp = getdirtybuf(nbp, &lk, waitfor);
if (nbp == NULL)
continue;
@@ -5355,10 +10241,17 @@ loop:
case D_INDIRDEP:
restart:
- LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
- if (aip->ai_state & DEPCOMPLETE)
+ LIST_FOREACH(aip,
+ &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
+ newblk = (struct newblk *)aip;
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ goto restart;
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- nbp = aip->ai_buf;
+ nbp = newblk->nb_bmsafemap->sm_buf;
nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
if (nbp == NULL)
goto restart;
@@ -5371,14 +10264,6 @@ loop:
}
continue;
- case D_INODEDEP:
- if ((error = flush_inodedep_deps(wk->wk_mp,
- WK_INODEDEP(wk)->id_ino)) != 0) {
- FREE_LOCK(&lk);
- break;
- }
- continue;
-
case D_PAGEDEP:
/*
* We are trying to sync a directory that may
@@ -5400,48 +10285,6 @@ loop:
}
continue;
- case D_MKDIR:
- /*
- * This case should never happen if the vnode has
- * been properly sync'ed. However, if this function
- * is used at a place where the vnode has not yet
- * been sync'ed, this dependency can show up. So,
- * rather than panic, just flush it.
- */
- nbp = WK_MKDIR(wk)->md_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
- case D_BMSAFEMAP:
- /*
- * This case should never happen if the vnode has
- * been properly sync'ed. However, if this function
- * is used at a place where the vnode has not yet
- * been sync'ed, this dependency can show up. So,
- * rather than panic, just flush it.
- */
- nbp = WK_BMSAFEMAP(wk)->sm_buf;
- nbp = getdirtybuf(nbp, &lk, waitfor);
- if (nbp == NULL)
- continue;
- FREE_LOCK(&lk);
- if (waitfor == MNT_NOWAIT) {
- bawrite(nbp);
- } else if ((error = bwrite(nbp)) != 0) {
- break;
- }
- ACQUIRE_LOCK(&lk);
- continue;
-
default:
panic("softdep_sync_metadata: Unknown type %s",
TYPENAME(wk->wk_type));
@@ -5489,7 +10332,8 @@ loop:
BO_LOCK(bo);
drain_output(vp);
BO_UNLOCK(bo);
- return (0);
+	return (ffs_update(vp, 1));
}
/*
@@ -5502,6 +10346,7 @@ flush_inodedep_deps(mp, ino)
ino_t ino;
{
struct inodedep *inodedep;
+ struct inoref *inoref;
int error, waitfor;
/*
@@ -5522,8 +10367,17 @@ flush_inodedep_deps(mp, ino)
return (error);
FREE_LOCK(&lk);
ACQUIRE_LOCK(&lk);
+restart:
if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
return (0);
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
@@ -5555,13 +10409,20 @@ flush_deplist(listhead, waitfor, errorp)
int *errorp;
{
struct allocdirect *adp;
+ struct newblk *newblk;
struct buf *bp;
mtx_assert(&lk, MA_OWNED);
TAILQ_FOREACH(adp, listhead, ad_next) {
- if (adp->ad_state & DEPCOMPLETE)
+ newblk = (struct newblk *)adp;
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ return (1);
+ }
+ if (newblk->nb_state & DEPCOMPLETE)
continue;
- bp = adp->ad_buf;
+ bp = newblk->nb_bmsafemap->sm_buf;
bp = getdirtybuf(bp, &lk, waitfor);
if (bp == NULL) {
if (waitfor == MNT_NOWAIT)
@@ -5582,6 +10443,101 @@ flush_deplist(listhead, waitfor, errorp)
}
/*
+ * Flush dependencies associated with an allocdirect block.
+ */
+static int
+flush_newblk_dep(vp, mp, lbn)
+ struct vnode *vp;
+ struct mount *mp;
+ ufs_lbn_t lbn;
+{
+ struct newblk *newblk;
+ struct bufobj *bo;
+ struct inode *ip;
+ struct buf *bp;
+ ufs2_daddr_t blkno;
+ int error;
+
+ error = 0;
+ bo = &vp->v_bufobj;
+ ip = VTOI(vp);
+ blkno = DIP(ip, i_db[lbn]);
+ if (blkno == 0)
+ panic("flush_newblk_dep: Missing block");
+ ACQUIRE_LOCK(&lk);
+ /*
+ * Loop until all dependencies related to this block are satisfied.
+ * We must be careful to restart after each sleep in case a write
+ * completes some part of this process for us.
+ */
+ for (;;) {
+ if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
+ FREE_LOCK(&lk);
+ break;
+ }
+ if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
+			panic("flush_newblk_dep: Bad newblk %p", newblk);
+ /*
+ * Flush the journal.
+ */
+ if (newblk->nb_jnewblk != NULL) {
+ stat_jwait_newblk++;
+ jwait(&newblk->nb_jnewblk->jn_list);
+ continue;
+ }
+ /*
+ * Write the bitmap dependency.
+ */
+ if ((newblk->nb_state & DEPCOMPLETE) == 0) {
+ bp = newblk->nb_bmsafemap->sm_buf;
+ bp = getdirtybuf(bp, &lk, MNT_WAIT);
+ if (bp == NULL)
+ continue;
+ FREE_LOCK(&lk);
+ error = bwrite(bp);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
+ continue;
+ }
+ /*
+ * Write the buffer.
+ */
+ FREE_LOCK(&lk);
+ BO_LOCK(bo);
+ bp = gbincore(bo, lbn);
+ if (bp != NULL) {
+ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
+ LK_INTERLOCK, BO_MTX(bo));
+ if (error == ENOLCK) {
+ ACQUIRE_LOCK(&lk);
+ continue; /* Slept, retry */
+ }
+ if (error != 0)
+ break; /* Failed */
+ if (bp->b_flags & B_DELWRI) {
+ bremfree(bp);
+ error = bwrite(bp);
+ if (error)
+ break;
+ } else
+ BUF_UNLOCK(bp);
+ } else
+ BO_UNLOCK(bo);
+ /*
+ * We have to wait for the direct pointers to
+ * point at the newdirblk before the dependency
+ * will go away.
+ */
+ error = ffs_update(vp, MNT_WAIT);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
+ }
+ return (error);
+}
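
/*
 * [Editorial usage note] flush_pagedep_deps() below invokes
 * flush_newblk_dep(vp, mp, 0) to push a new directory's first block
 * (holding "." and "..") to stable storage before the directory's
 * name is committed in its parent, replacing the old scheme of two
 * ffs_syncvnode() passes over the whole vnode.
 */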
+
+/*
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
* Called with splbio blocked.
*/
@@ -5592,16 +10548,16 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
struct diraddhd *diraddhdp;
{
struct inodedep *inodedep;
+ struct inoref *inoref;
struct ufsmount *ump;
struct diradd *dap;
struct vnode *vp;
- struct bufobj *bo;
int error = 0;
struct buf *bp;
ino_t inum;
- struct worklist *wk;
ump = VFSTOUFS(mp);
+restart:
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
/*
* Flush ourselves if this directory entry
@@ -5609,7 +10565,7 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
*/
if (dap->da_state & MKDIR_PARENT) {
FREE_LOCK(&lk);
- if ((error = ffs_update(pvp, 1)) != 0)
+ if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
break;
ACQUIRE_LOCK(&lk);
/*
@@ -5623,84 +10579,52 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
/*
* A newly allocated directory must have its "." and
* ".." entries written out before its name can be
- * committed in its parent. We do not want or need
- * the full semantics of a synchronous ffs_syncvnode as
- * that may end up here again, once for each directory
- * level in the filesystem. Instead, we push the blocks
- * and wait for them to clear. We have to fsync twice
- * because the first call may choose to defer blocks
- * that still have dependencies, but deferral will
- * happen at most once.
+ * committed in its parent.
*/
inum = dap->da_newinum;
+ if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
+ panic("flush_pagedep_deps: lost inode1");
+ /*
+ * Wait for any pending journal adds to complete so we don't
+ * cause rollbacks while syncing.
+ */
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ stat_jwait_inode++;
+ jwait(&inoref->if_list);
+ goto restart;
+ }
+ }
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(&lk);
if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
FFSV_FORCEINSMQ)))
break;
- if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
- (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
- vput(vp);
- break;
- }
- bo = &vp->v_bufobj;
- BO_LOCK(bo);
- drain_output(vp);
+ error = flush_newblk_dep(vp, mp, 0);
/*
- * If first block is still dirty with a D_MKDIR
- * dependency then it needs to be written now.
+ * If we still have the dependency we might need to
+ * update the vnode to sync the new link count to
+ * disk.
*/
- for (;;) {
- error = 0;
- bp = gbincore(bo, 0);
- if (bp == NULL)
- break; /* First block not present */
- error = BUF_LOCK(bp,
- LK_EXCLUSIVE |
- LK_SLEEPFAIL |
- LK_INTERLOCK,
- BO_MTX(bo));
- BO_LOCK(bo);
- if (error == ENOLCK)
- continue; /* Slept, retry */
- if (error != 0)
- break; /* Failed */
- if ((bp->b_flags & B_DELWRI) == 0) {
- BUF_UNLOCK(bp);
- break; /* Buffer not dirty */
- }
- for (wk = LIST_FIRST(&bp->b_dep);
- wk != NULL;
- wk = LIST_NEXT(wk, wk_list))
- if (wk->wk_type == D_MKDIR)
- break;
- if (wk == NULL)
- BUF_UNLOCK(bp); /* Dependency gone */
- else {
- /*
- * D_MKDIR dependency remains,
- * must write buffer to stable
- * storage.
- */
- BO_UNLOCK(bo);
- bremfree(bp);
- error = bwrite(bp);
- BO_LOCK(bo);
- }
- break;
- }
- BO_UNLOCK(bo);
+ if (error == 0 && dap == LIST_FIRST(diraddhdp))
+ error = ffs_update(vp, MNT_WAIT);
vput(vp);
if (error != 0)
- break; /* Flushing of first block failed */
+ break;
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
*/
if (dap != LIST_FIRST(diraddhdp))
continue;
- if (dap->da_state & MKDIR_BODY)
- panic("flush_pagedep_deps: MKDIR_BODY");
+ if (dap->da_state & MKDIR_BODY) {
+ inodedep_lookup(UFSTOVFS(ump), inum, 0,
+ &inodedep);
+ panic("flush_pagedep_deps: MKDIR_BODY "
+ "inodedep %p dap %p vp %p",
+ inodedep, dap, vp);
+ }
}
/*
* Flush the inode on which the directory entry depends.
@@ -5719,8 +10643,8 @@ retry:
* If the inode still has bitmap dependencies,
* push them to disk.
*/
- if ((inodedep->id_state & DEPCOMPLETE) == 0) {
- bp = inodedep->id_buf;
+ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
+ bp = inodedep->id_bmsafemap->sm_buf;
bp = getdirtybuf(bp, &lk, MNT_WAIT);
if (bp == NULL)
goto retry;
@@ -5733,24 +10657,29 @@ retry:
}
/*
* If the inode is still sitting in a buffer waiting
- * to be written, push it to disk.
+ * to be written or waiting for the link count to be
+ * adjusted update it here to flush it to disk.
*/
- FREE_LOCK(&lk);
- if ((error = bread(ump->um_devvp,
- fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
- (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
- brelse(bp);
- break;
+ if (dap == LIST_FIRST(diraddhdp)) {
+ FREE_LOCK(&lk);
+ if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
+ FFSV_FORCEINSMQ)))
+ break;
+ error = ffs_update(vp, MNT_WAIT);
+ vput(vp);
+ if (error)
+ break;
+ ACQUIRE_LOCK(&lk);
}
- if ((error = bwrite(bp)) != 0)
- break;
- ACQUIRE_LOCK(&lk);
/*
* If we have failed to get rid of all the dependencies
* then something is seriously wrong.
*/
- if (dap == LIST_FIRST(diraddhdp))
- panic("flush_pagedep_deps: flush failed");
+ if (dap == LIST_FIRST(diraddhdp)) {
+ inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
+ panic("flush_pagedep_deps: failed to flush "
+ "inodedep %p ino %d dap %p", inodedep, inum, dap);
+ }
}
if (error)
ACQUIRE_LOCK(&lk);
@@ -5828,6 +10757,7 @@ softdep_request_cleanup(fs, vp)
return (0);
UFS_UNLOCK(ump);
ACQUIRE_LOCK(&lk);
+ process_removes(vp);
if (ump->softdep_on_worklist > 0 &&
process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
stat_worklist_push += 1;
@@ -6100,10 +11030,15 @@ softdep_count_dependencies(bp, wantcount)
int wantcount;
{
struct worklist *wk;
+ struct bmsafemap *bmsafemap;
struct inodedep *inodedep;
struct indirdep *indirdep;
+ struct freeblks *freeblks;
struct allocindir *aip;
struct pagedep *pagedep;
+ struct dirrem *dirrem;
+ struct newblk *newblk;
+ struct mkdir *mkdir;
struct diradd *dap;
int i, retval;
@@ -6132,6 +11067,12 @@ softdep_count_dependencies(bp, wantcount)
if (!wantcount)
goto out;
}
+ if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
+ /* Add reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
continue;
case D_INDIRDEP:
@@ -6147,6 +11088,14 @@ softdep_count_dependencies(bp, wantcount)
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
+ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
+ if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
+ /* Journal remove ref dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ }
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
@@ -6159,14 +11108,62 @@ softdep_count_dependencies(bp, wantcount)
continue;
case D_BMSAFEMAP:
+ bmsafemap = WK_BMSAFEMAP(wk);
+ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
+ /* Add reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
+ /* Allocate block dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_FREEBLKS:
+ freeblks = WK_FREEBLKS(wk);
+ if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
+ /* Freeblk journal dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
case D_ALLOCDIRECT:
case D_ALLOCINDIR:
+ newblk = WK_NEWBLK(wk);
+ if (newblk->nb_jnewblk) {
+ /* Journal allocate dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
case D_MKDIR:
+ mkdir = WK_MKDIR(wk);
+ if (mkdir->md_jaddref) {
+ /* Journal reference dependency. */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_FREEWORK:
+ case D_FREEDEP:
+ case D_JSEGDEP:
+ case D_JSEG:
+ case D_SBDEP:
/* never a dependency on these blocks */
continue;
default:
- panic("softdep_check_for_rollback: Unexpected type %s",
+ panic("softdep_count_dependencies: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
@@ -6382,6 +11379,45 @@ softdep_error(func, error)
#ifdef DDB
+static void
+inodedep_print(struct inodedep *inodedep, int verbose)
+{
+ db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
+ " saveino %p\n",
+ inodedep, inodedep->id_fs, inodedep->id_state,
+ (intmax_t)inodedep->id_ino,
+ (intmax_t)fsbtodb(inodedep->id_fs,
+ ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
+ inodedep->id_nlinkdelta, inodedep->id_savednlink,
+ inodedep->id_savedino1);
+
+ if (verbose == 0)
+ return;
+
+ db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
+ "mkdiradd %p\n",
+ LIST_FIRST(&inodedep->id_pendinghd),
+ LIST_FIRST(&inodedep->id_bufwait),
+ LIST_FIRST(&inodedep->id_inowait),
+ TAILQ_FIRST(&inodedep->id_inoreflst),
+ inodedep->id_mkdiradd);
+ db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
+ TAILQ_FIRST(&inodedep->id_inoupdt),
+ TAILQ_FIRST(&inodedep->id_newinoupdt),
+ TAILQ_FIRST(&inodedep->id_extupdt),
+ TAILQ_FIRST(&inodedep->id_newextupdt));
+}
+
+DB_SHOW_COMMAND(inodedep, db_show_inodedep)
+{
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+	inodedep_print((struct inodedep *)addr, 1);
+}
+
DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
struct inodedep_hashhead *inodedephd;
@@ -6395,15 +11431,62 @@ DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
LIST_FOREACH(inodedep, inodedephd, id_hash) {
if (fs != NULL && fs != inodedep->id_fs)
continue;
- db_printf("%p fs %p st %x ino %jd inoblk %jd\n",
- inodedep, inodedep->id_fs, inodedep->id_state,
- (intmax_t)inodedep->id_ino,
- (intmax_t)fsbtodb(inodedep->id_fs,
- ino_to_fsba(inodedep->id_fs, inodedep->id_ino)));
+ inodedep_print(inodedep, 0);
}
}
}
+DB_SHOW_COMMAND(worklist, db_show_worklist)
+{
+ struct worklist *wk;
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+ wk = (struct worklist *)addr;
+	db_printf("worklist: %p type %s state 0x%X\n",
+ wk, TYPENAME(wk->wk_type), wk->wk_state);
+}
+
+DB_SHOW_COMMAND(workhead, db_show_workhead)
+{
+ struct workhead *wkhd;
+ struct worklist *wk;
+ int i;
+
+ if (have_addr == 0) {
+ db_printf("Address required\n");
+ return;
+ }
+ wkhd = (struct workhead *)addr;
+ wk = LIST_FIRST(wkhd);
+	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
+		db_printf("worklist: %p type %s state 0x%X\n",
+		    wk, TYPENAME(wk->wk_type), wk->wk_state);
+	if (i == 100)
+		db_printf("workhead overflow\n");
+}
+
+DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
+{
+ struct jaddref *jaddref;
+ struct diradd *diradd;
+ struct mkdir *mkdir;
+
+ LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
+ diradd = mkdir->md_diradd;
+ db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
+ mkdir, mkdir->md_state, diradd, diradd->da_state);
+ if ((jaddref = mkdir->md_jaddref) != NULL)
+ db_printf(" jaddref %p jaddref state 0x%X",
+ jaddref, jaddref->ja_state);
+ db_printf("\n");
+ }
+}
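
/*
 * [Editorial usage sketch, assuming a kernel built with options DDB]
 * The commands defined above hang off the debugger's "show" table and
 * are invoked from the debugger prompt, for example (the addresses
 * are hypothetical):
 *
 *	db> show inodedeps
 *	db> show inodedep 0xc42d1e00
 *	db> show workhead 0xc42d1e58
 *	db> show mkdirs
 *
 * "show inodedep" prints the verbose form of the line that
 * "show inodedeps" prints for every entry.
 */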
+
#endif /* DDB */
#endif /* SOFTUPDATES */
diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c
index e34bc1372a2e..e2460a36be2d 100644
--- a/sys/ufs/ffs/ffs_subr.c
+++ b/sys/ufs/ffs/ffs_subr.c
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
#ifndef _KERNEL
#include <ufs/ufs/dinode.h>
#include <ufs/ffs/fs.h>
-#include "fsck.h"
#else
#include <sys/systm.h>
#include <sys/lock.h>
@@ -223,7 +222,38 @@ ffs_isblock(fs, cp, h)
mask = 0x01 << (h & 0x7);
return ((cp[h >> 3] & mask) == mask);
default:
+#ifdef _KERNEL
panic("ffs_isblock");
+#endif
+ break;
+ }
+ return (0);
+}
+
+/*
+ * check if a block is free
+ */
+int
+ffs_isfreeblock(fs, cp, h)
+ struct fs *fs;
+ u_char *cp;
+ ufs1_daddr_t h;
+{
+
+ switch ((int)fs->fs_frag) {
+ case 8:
+ return (cp[h] == 0);
+ case 4:
+ return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+ case 2:
+ return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+ case 1:
+ return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+ default:
+#ifdef _KERNEL
+ panic("ffs_isfreeblock");
+#endif
+ break;
}
return (0);
}
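
/*
 * [Editorial worked example] With fs_frag == 4 each byte of the map
 * packs two 4-bit block maps, so testing block h == 5 reduces to the
 * high nibble of byte 2:
 *
 *	cp[5 >> 1] & (0x0f << ((5 & 0x1) << 2))  ==  cp[2] & 0xf0
 *
 * ffs_isfreeblock() is the complement of ffs_isblock() above: it is
 * true only when every map bit for the block is clear, whereas
 * ffs_isblock() requires them all to be set.
 */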
@@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h)
cp[h >> 3] &= ~(0x01 << (h & 0x7));
return;
default:
+#ifdef _KERNEL
panic("ffs_clrblock");
+#endif
+ break;
}
}
@@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h)
cp[h >> 3] |= (0x01 << (h & 0x7));
return;
default:
+#ifdef _KERNEL
panic("ffs_setblock");
+#endif
+ break;
+ }
+}
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ */
+void
+ffs_clusteracct(fs, cgp, blkno, cnt)
+ struct fs *fs;
+ struct cg *cgp;
+ ufs1_daddr_t blkno;
+ int cnt;
+{
+ int32_t *sump;
+ int32_t *lp;
+ u_char *freemapp, *mapp;
+ int i, start, end, forw, back, map, bit;
+
+ if (fs->fs_contigsumsize <= 0)
+ return;
+ freemapp = cg_clustersfree(cgp);
+ sump = cg_clustersum(cgp);
+ /*
+ * Allocate or clear the actual block.
+ */
+ if (cnt > 0)
+ setbit(freemapp, blkno);
+ else
+ clrbit(freemapp, blkno);
+ /*
+ * Find the size of the cluster going forward.
+ */
+ start = blkno + 1;
+ end = start + fs->fs_contigsumsize;
+ if (end >= cgp->cg_nclusterblks)
+ end = cgp->cg_nclusterblks;
+ mapp = &freemapp[start / NBBY];
+ map = *mapp++;
+ bit = 1 << (start % NBBY);
+ for (i = start; i < end; i++) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != (NBBY - 1)) {
+ bit <<= 1;
+ } else {
+ map = *mapp++;
+ bit = 1;
+ }
+ }
+ forw = i - start;
+ /*
+ * Find the size of the cluster going backward.
+ */
+ start = blkno - 1;
+ end = start - fs->fs_contigsumsize;
+ if (end < 0)
+ end = -1;
+ mapp = &freemapp[start / NBBY];
+ map = *mapp--;
+ bit = 1 << (start % NBBY);
+ for (i = start; i > end; i--) {
+ if ((map & bit) == 0)
+ break;
+ if ((i & (NBBY - 1)) != 0) {
+ bit >>= 1;
+ } else {
+ map = *mapp--;
+ bit = 1 << (NBBY - 1);
+ }
}
+ back = start - i;
+ /*
+ * Account for old cluster and the possibly new forward and
+ * back clusters.
+ */
+ i = back + forw + 1;
+ if (i > fs->fs_contigsumsize)
+ i = fs->fs_contigsumsize;
+ sump[i] += cnt;
+ if (back > 0)
+ sump[back] -= cnt;
+ if (forw > 0)
+ sump[forw] -= cnt;
+ /*
+ * Update cluster summary information.
+ */
+ lp = &sump[fs->fs_contigsumsize];
+ for (i = fs->fs_contigsumsize; i > 0; i--)
+ if (*lp-- > 0)
+ break;
+ fs->fs_maxcluster[cgp->cg_cgx] = i;
}
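
/*
 * [Editorial worked example] Freeing block b (cnt == 1) when b-1 is
 * already free (back == 1) and b+1..b+2 are free (forw == 2) merges
 * three runs into one cluster of length 4, so the accounting above
 * performs (assuming fs_contigsumsize >= 4):
 *
 *	sump[back + forw + 1] += cnt;		sump[4] += 1
 *	sump[back] -= cnt;			sump[1] -= 1
 *	sump[forw] -= cnt;			sump[2] -= 1
 *
 * Allocation (cnt == -1) applies the same arithmetic in reverse.
 */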
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 8aa9f9c53a4c..e40336863248 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -79,7 +79,6 @@ static int ffs_reload(struct mount *, struct thread *);
static int ffs_mountfs(struct vnode *, struct mount *, struct thread *);
static void ffs_oldfscompat_read(struct fs *, struct ufsmount *,
ufs2_daddr_t);
-static void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
static void ffs_ifree(struct ufsmount *ump, struct inode *ip);
static vfs_init_t ffs_init;
static vfs_uninit_t ffs_uninit;
@@ -299,7 +298,8 @@ ffs_mount(struct mount *mp)
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if ((mp->mnt_flag & MNT_FORCE) ||
- ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+ ((fs->fs_flags &
+ (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf("WARNING: %s was not %s\n",
fs->fs_fsmnt, "properly dismounted");
@@ -307,6 +307,9 @@ ffs_mount(struct mount *mp)
printf(
"WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n",
fs->fs_fsmnt);
+ if (fs->fs_flags & FS_SUJ)
+ printf(
+"WARNING: Forced mount will invalidated journal contents\n");
return (EPERM);
}
}
@@ -330,17 +333,18 @@ ffs_mount(struct mount *mp)
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
MNT_IUNLOCK(mp);
- fs->fs_clean = 0;
- if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
- vn_finished_write(mp);
- return (error);
- }
+ fs->fs_mtime = time_second;
/* check to see if we need to start softdep */
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, td->td_ucred))){
vn_finished_write(mp);
return (error);
}
+ fs->fs_clean = 0;
+ if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
+ vn_finished_write(mp);
+ return (error);
+ }
if (fs->fs_snapinum[0] != 0)
ffs_snapshot_mount(mp);
vn_finished_write(mp);
@@ -665,7 +669,6 @@ ffs_mountfs(devvp, mp, td)
if (mp->mnt_iosize_max > MAXPHYS)
mp->mnt_iosize_max = MAXPHYS;
- devvp->v_bufobj.bo_private = cp;
devvp->v_bufobj.bo_ops = &ffs_ops;
fs = NULL;
@@ -706,7 +709,7 @@ ffs_mountfs(devvp, mp, td)
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
if (ronly || (mp->mnt_flag & MNT_FORCE) ||
- ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+ ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
(fs->fs_flags & FS_DOSOFTDEP))) {
printf(
"WARNING: %s was not properly dismounted\n",
@@ -715,6 +718,9 @@ ffs_mountfs(devvp, mp, td)
printf(
"WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n",
fs->fs_fsmnt);
+ if (fs->fs_flags & FS_SUJ)
+ printf(
+"WARNING: Forced mount will invalidated journal contents\n");
error = EPERM;
goto out;
}
@@ -897,6 +903,7 @@ ffs_mountfs(devvp, mp, td)
*/
bzero(fs->fs_fsmnt, MAXMNTLEN);
strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
+ mp->mnt_stat.f_iosize = fs->fs_bsize;
if( mp->mnt_flag & MNT_ROOTFS) {
/*
@@ -908,6 +915,7 @@ ffs_mountfs(devvp, mp, td)
}
if (ronly == 0) {
+ fs->fs_mtime = time_second;
if ((fs->fs_flags & FS_DOSOFTDEP) &&
(error = softdep_mount(devvp, mp, fs, cred)) != 0) {
free(fs->fs_csp, M_UFSMNT);
@@ -938,7 +946,6 @@ ffs_mountfs(devvp, mp, td)
* This would all happen while the filesystem was busy/not
* available, so would effectively be "atomic".
*/
- mp->mnt_stat.f_iosize = fs->fs_bsize;
(void) ufs_extattr_autostart(mp, td);
#endif /* !UFS_EXTATTR_AUTOSTART */
#endif /* !UFS_EXTATTR */
@@ -1038,7 +1045,7 @@ ffs_oldfscompat_read(fs, ump, sblockloc)
* XXX - Parts get retired eventually.
* Unfortunately new bits get added.
*/
-static void
+void
ffs_oldfscompat_write(fs, ump)
struct fs *fs;
struct ufsmount *ump;
@@ -1133,6 +1140,7 @@ ffs_unmount(mp, mntflags)
fs->fs_pendinginodes = 0;
}
UFS_UNLOCK(ump);
+ softdep_unmount(mp);
if (fs->fs_ronly == 0) {
fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
error = ffs_sbupdate(ump, MNT_WAIT, 0);
@@ -1574,16 +1582,6 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
DIP_SET(ip, i_gen, ip->i_gen);
}
}
- /*
- * Ensure that uid and gid are correct. This is a temporary
- * fix until fsck has been changed to do the update.
- */
- if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
- fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
- ip->i_uid = ip->i_din1->di_ouid; /* XXX */
- ip->i_gid = ip->i_din1->di_ogid; /* XXX */
- } /* XXX */
-
#ifdef MAC
if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
/*
@@ -1727,6 +1725,8 @@ ffs_sbupdate(mp, waitfor, suspended)
}
fs->fs_fmod = 0;
fs->fs_time = time_second;
+ if (fs->fs_flags & FS_DOSOFTDEP)
+ softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp);
bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
if (suspended)
@@ -1868,9 +1868,6 @@ ffs_bufwrite(struct buf *bp)
}
BO_UNLOCK(bp->b_bufobj);
- /* Mark the buffer clean */
- bundirty(bp);
-
/*
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
@@ -1911,9 +1908,16 @@ ffs_bufwrite(struct buf *bp)
newbp->b_flags &= ~B_INVAL;
#ifdef SOFTUPDATES
- /* move over the dependencies */
- if (!LIST_EMPTY(&bp->b_dep))
- softdep_move_dependencies(bp, newbp);
+ /*
+ * Move over the dependencies. If there are rollbacks,
+ * leave the parent buffer dirtied as it will need to
+ * be written again.
+ */
+ if (LIST_EMPTY(&bp->b_dep) ||
+ softdep_move_dependencies(bp, newbp) == 0)
+ bundirty(bp);
+#else
+ bundirty(bp);
#endif
/*
@@ -1926,7 +1930,10 @@ ffs_bufwrite(struct buf *bp)
*/
bqrelse(bp);
bp = newbp;
- }
+ } else
+ /* Mark the buffer clean */
+ bundirty(bp);
+
/* Let the normal bufwrite do the rest for us */
normal_write:
@@ -1940,6 +1947,7 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
struct vnode *vp;
int error;
struct buf *tbp;
+ int nocopy;
vp = bo->__bo_vnode;
if (bp->b_iocmd == BIO_WRITE) {
@@ -1947,8 +1955,9 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
(bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("ffs_geom_strategy: bad I/O");
- bp->b_flags &= ~B_VALIDSUSPWRT;
- if ((vp->v_vflag & VV_COPYONWRITE) &&
+ nocopy = bp->b_flags & B_NOCOPY;
+ bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
+ if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
vp->v_rdev->si_snapdata != NULL) {
if ((bp->b_flags & B_CLUSTER) != 0) {
runningbufwakeup(bp);
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 464a7613e162..e6617cbcdfa8 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -225,6 +225,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor)
wait = (waitfor == MNT_WAIT);
lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
bo = &vp->v_bufobj;
+ ip->i_flag &= ~IN_NEEDSYNC;
/*
* Flush all dirty buffers associated with a vnode.
diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h
index 5452e2be6de2..e863b961c620 100644
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@@ -340,7 +340,9 @@ struct fs {
u_int32_t fs_avgfilesize; /* expected average file size */
u_int32_t fs_avgfpdir; /* expected # of files per directory */
int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */
- int32_t fs_sparecon32[26]; /* reserved for future constants */
+ ufs_time_t fs_mtime; /* Last mount or fsck time. */
+ int32_t fs_sujfree; /* SUJ free list */
+ int32_t fs_sparecon32[23]; /* reserved for future constants */
int32_t fs_flags; /* see FS_ flags below */
int32_t fs_contigsumsize; /* size of cluster summary array */
int32_t fs_maxsymlinklen; /* max length of an internal symlink */
@@ -408,12 +410,13 @@ CTASSERT(sizeof(struct fs) == 1376);
#define FS_UNCLEAN 0x0001 /* filesystem not clean at mount */
#define FS_DOSOFTDEP 0x0002 /* filesystem using soft dependencies */
#define FS_NEEDSFSCK 0x0004 /* filesystem needs sync fsck before mount */
-#define FS_INDEXDIRS 0x0008 /* kernel supports indexed directories */
+#define FS_SUJ 0x0008 /* Filesystem using softupdate journal */
#define FS_ACLS 0x0010 /* file system has POSIX.1e ACLs enabled */
#define FS_MULTILABEL 0x0020 /* file system is MAC multi-label */
#define FS_GJOURNAL 0x0040 /* gjournaled file system */
#define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */
#define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */
+#define FS_INDEXDIRS 0x0200 /* kernel supports indexed directories */
/*
* Macros to access bits in the fs_active array.
@@ -603,7 +606,31 @@ struct cg {
? (fs)->fs_bsize \
: (fragroundup(fs, blkoff(fs, (size)))))
-
+/*
+ * Indirect lbns are aligned on NDADDR addresses where single indirects
+ * are the negated address of the lowest lbn reachable, double indirects
+ * are this lbn - 1 and triple indirects are this lbn - 2. This yields
+ * an unusual bit order to determine level.
+ */
+static inline int
+lbn_level(ufs_lbn_t lbn)
+{
+ if (lbn >= 0)
+		return (0);
+ switch (lbn & 0x3) {
+ case 0:
+ return (0);
+ case 1:
+ break;
+ case 2:
+ return (2);
+ case 3:
+ return (1);
+ default:
+ break;
+ }
+ return (-1);
+}
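
/*
 * [Editorial worked examples, assuming NDADDR == 12 and NINDIR a
 * multiple of four] The first single indirect has lbn -NDADDR, so
 * lbn_level(-12) == 0 (-12 & 0x3 == 0); the double indirect at -13
 * gives level 1 (-13 & 0x3 == 3) and the triple at -14 gives level 2
 * (-14 & 0x3 == 2).  Data lbns (>= 0) are level 0 as well.
 */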
/*
* Number of inodes in a secondary storage block/fragment.
*/
@@ -615,6 +642,108 @@ struct cg {
*/
#define NINDIR(fs) ((fs)->fs_nindir)
+/*
+ * Softdep journal record format.
+ */
+
+#define JOP_ADDREF 1 /* Add a reference to an inode. */
+#define JOP_REMREF 2 /* Remove a reference from an inode. */
+#define JOP_NEWBLK 3 /* Allocate a block. */
+#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */
+#define	JOP_MVREF	5	/* Move a reference from one offset to another. */
+#define JOP_TRUNC 6 /* Partial truncation record. */
+
+#define JREC_SIZE 32 /* Record and segment header size. */
+
+#define SUJ_MIN (4 * 1024 * 1024) /* Minimum journal size */
+#define SUJ_MAX (32 * 1024 * 1024) /* Maximum journal size */
+#define SUJ_FILE ".sujournal" /* Journal file name */
+
+/*
+ * Size of the segment record header. There is at most one for each disk
+ * block in the journal. The segment header is followed by an array of
+ * records. fsck depends on the first element in each record being 'op'
+ * and the second being 'ino'. Segments may span multiple disk blocks but
+ * the header is present on each.
+ */
+struct jsegrec {
+ uint64_t jsr_seq; /* Our sequence number */
+ uint64_t jsr_oldest; /* Oldest valid sequence number */
+ uint16_t jsr_cnt; /* Count of valid records */
+ uint16_t jsr_blocks; /* Count of DEV_BSIZE blocks. */
+	uint32_t jsr_crc;		/* 32-bit CRC of the valid space */
+ ufs_time_t jsr_time; /* timestamp for mount instance */
+};
+
+/*
+ * Reference record. Records a single link count modification.
+ */
+struct jrefrec {
+ uint32_t jr_op;
+ ino_t jr_ino;
+ ino_t jr_parent;
+ uint16_t jr_nlink;
+ uint16_t jr_mode;
+ off_t jr_diroff;
+ uint64_t jr_unused;
+};
+
+/*
+ * Move record. Records a reference moving within a directory block. The
+ * nlink is unchanged but we must search both locations.
+ */
+struct jmvrec {
+ uint32_t jm_op;
+ ino_t jm_ino;
+ ino_t jm_parent;
+ uint16_t jm_unused;
+ off_t jm_oldoff;
+ off_t jm_newoff;
+};
+
+/*
+ * Block record. A set of frags or a tree of blocks starting at an
+ * indirect is freed, or a set of frags is allocated.
+ */
+struct jblkrec {
+ uint32_t jb_op;
+ uint32_t jb_ino;
+ ufs2_daddr_t jb_blkno;
+ ufs_lbn_t jb_lbn;
+ uint16_t jb_frags;
+ uint16_t jb_oldfrags;
+ uint32_t jb_unused;
+};
+
+/*
+ * Truncation record. Records a partial truncation so that it may be
+ * completed later.
+ */
+struct jtrncrec {
+ uint32_t jt_op;
+ uint32_t jt_ino;
+ off_t jt_size;
+ uint32_t jt_extsize;
+ uint32_t jt_pad[3];
+};
+
+union jrec {
+ struct jsegrec rec_jsegrec;
+ struct jrefrec rec_jrefrec;
+ struct jmvrec rec_jmvrec;
+ struct jblkrec rec_jblkrec;
+ struct jtrncrec rec_jtrncrec;
+};
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct jsegrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jrefrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jmvrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jblkrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE);
+CTASSERT(sizeof(union jrec) == JREC_SIZE);
+#endif
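
/*
 * [Editorial note, derived from the sizes above] JREC_SIZE is 32
 * bytes, so a DEV_BSIZE (512-byte) journal block holds 16 records;
 * because every disk block begins with a jsegrec header, at most 15
 * of them carry payload records.
 */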
+
extern int inside[], around[];
extern u_char *fragtbl[];
diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h
index b00183bcfd2c..5d8a8691b170 100644
--- a/sys/ufs/ffs/softdep.h
+++ b/sys/ufs/ffs/softdep.h
@@ -94,22 +94,29 @@
* The ONWORKLIST flag shows whether the structure is currently linked
* onto a worklist.
*/
-#define ATTACHED 0x0001
-#define UNDONE 0x0002
-#define COMPLETE 0x0004
-#define DEPCOMPLETE 0x0008
-#define MKDIR_PARENT 0x0010 /* diradd & mkdir only */
-#define MKDIR_BODY 0x0020 /* diradd & mkdir only */
-#define RMDIR 0x0040 /* dirrem only */
-#define DIRCHG 0x0080 /* diradd & dirrem only */
-#define GOINGAWAY 0x0100 /* indirdep only */
-#define IOSTARTED 0x0200 /* inodedep & pagedep only */
-#define SPACECOUNTED 0x0400 /* inodedep only */
-#define NEWBLOCK 0x0800 /* pagedep only */
-#define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */
-#define UFS1FMT 0x2000 /* indirdep only */
-#define EXTDATA 0x4000 /* allocdirect only */
-#define ONWORKLIST 0x8000
+#define ATTACHED 0x000001
+#define UNDONE 0x000002
+#define COMPLETE 0x000004
+#define DEPCOMPLETE 0x000008
+#define MKDIR_PARENT 0x000010 /* diradd, mkdir, jaddref, jsegdep only */
+#define MKDIR_BODY 0x000020 /* diradd, mkdir, jaddref only */
+#define RMDIR 0x000040 /* dirrem only */
+#define DIRCHG 0x000080 /* diradd, dirrem only */
+#define GOINGAWAY 0x000100 /* indirdep, jremref only */
+#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */
+#define SPACECOUNTED 0x000400 /* inodedep only */
+#define NEWBLOCK 0x000800 /* pagedep, jaddref only */
+#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */
+#define UFS1FMT 0x002000 /* indirdep only */
+#define EXTDATA 0x004000 /* allocdirect only */
+#define ONWORKLIST 0x008000
+#define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */
+#define ONDEPLIST 0x020000 /* Structure is on a dependency list. */
+#define UNLINKED 0x040000 /* inodedep has been unlinked. */
+#define UNLINKNEXT 0x080000 /* inodedep has valid di_freelink */
+#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */
+#define UNLINKONLIST 0x200000 /* inodedep is in the unlinked list on disk */
+#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV)
#define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE)
@@ -135,25 +142,38 @@
* and the macros below changed to use it.
*/
struct worklist {
- struct mount *wk_mp; /* Mount we live in */
LIST_ENTRY(worklist) wk_list; /* list of work requests */
- unsigned short wk_type; /* type of request */
- unsigned short wk_state; /* state flags */
+ struct mount *wk_mp; /* Mount we live in */
+ unsigned int wk_type:8, /* type of request */
+ wk_state:24; /* state flags */
};
#define WK_DATA(wk) ((void *)(wk))
#define WK_PAGEDEP(wk) ((struct pagedep *)(wk))
#define WK_INODEDEP(wk) ((struct inodedep *)(wk))
#define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk))
+#define WK_NEWBLK(wk) ((struct newblk *)(wk))
#define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk))
#define WK_INDIRDEP(wk) ((struct indirdep *)(wk))
#define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk))
#define WK_FREEFRAG(wk) ((struct freefrag *)(wk))
#define WK_FREEBLKS(wk) ((struct freeblks *)(wk))
+#define WK_FREEWORK(wk) ((struct freework *)(wk))
#define WK_FREEFILE(wk) ((struct freefile *)(wk))
#define WK_DIRADD(wk) ((struct diradd *)(wk))
#define WK_MKDIR(wk) ((struct mkdir *)(wk))
#define WK_DIRREM(wk) ((struct dirrem *)(wk))
#define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk))
+#define WK_JADDREF(wk) ((struct jaddref *)(wk))
+#define WK_JREMREF(wk) ((struct jremref *)(wk))
+#define WK_JMVREF(wk) ((struct jmvref *)(wk))
+#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk))
+#define WK_JSEG(wk) ((struct jseg *)(wk))
+#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk))
+#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk))
+#define WK_FREEDEP(wk) ((struct freedep *)(wk))
+#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk))
+#define WK_SBDEP(wk) ((struct sbdep *)wk)
+#define WK_JTRUNC(wk) ((struct jtrunc *)(wk))
/*
* Various types of lists
@@ -165,6 +185,15 @@ LIST_HEAD(inodedephd, inodedep);
LIST_HEAD(allocindirhd, allocindir);
LIST_HEAD(allocdirecthd, allocdirect);
TAILQ_HEAD(allocdirectlst, allocdirect);
+LIST_HEAD(indirdephd, indirdep);
+LIST_HEAD(jaddrefhd, jaddref);
+LIST_HEAD(jremrefhd, jremref);
+LIST_HEAD(jmvrefhd, jmvref);
+LIST_HEAD(jnewblkhd, jnewblk);
+LIST_HEAD(jfreeblkhd, jfreeblk);
+LIST_HEAD(freeworkhd, freework);
+TAILQ_HEAD(jseglst, jseg);
+TAILQ_HEAD(inoreflst, inoref);
/*
* The "pagedep" structure tracks the various dependencies related to
@@ -192,9 +221,11 @@ struct pagedep {
LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */
ino_t pd_ino; /* associated file */
ufs_lbn_t pd_lbn; /* block within file */
+ struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */
struct dirremhd pd_dirremhd; /* dirrem's waiting for page */
struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */
struct diraddhd pd_pendinghd; /* directory entries awaiting write */
+ struct jmvrefhd pd_jmvrefhd; /* Dependent journal writes. */
};
/*
@@ -248,13 +279,18 @@ struct inodedep {
struct worklist id_list; /* buffer holding inode block */
# define id_state id_list.wk_state /* inode dependency state */
LIST_ENTRY(inodedep) id_hash; /* hashed lookup */
+ TAILQ_ENTRY(inodedep) id_unlinked; /* Unlinked but ref'd inodes */
struct fs *id_fs; /* associated filesystem */
ino_t id_ino; /* dependent inode */
nlink_t id_nlinkdelta; /* saved effective link count */
+ nlink_t id_savednlink; /* Link saved during rollback */
LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */
- struct buf *id_buf; /* related bmsafemap (if pending) */
+ struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */
+ struct diradd *id_mkdiradd; /* diradd for a mkdir. */
+ struct inoreflst id_inoreflst; /* Inode reference adjustments. */
long id_savedextsize; /* ext size saved during rollback */
off_t id_savedsize; /* file size saved during rollback */
+ struct dirremhd id_dirremhd; /* Removals pending. */
struct workhead id_pendinghd; /* entries awaiting directory write */
struct workhead id_bufwait; /* operations after inode written */
struct workhead id_inowait; /* operations waiting inode update */
@@ -271,23 +307,6 @@ struct inodedep {
#define id_savedino2 id_un.idu_savedino2
/*
- * A "newblk" structure is attached to a bmsafemap structure when a block
- * or fragment is allocated from a cylinder group. Its state is set to
- * DEPCOMPLETE when its cylinder group map is written. It is consumed by
- * an associated allocdirect or allocindir allocation which will attach
- * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag
- * is not set (i.e., its cylinder group map has not been written).
- */
-struct newblk {
- LIST_ENTRY(newblk) nb_hash; /* hashed lookup */
- struct fs *nb_fs; /* associated filesystem */
- int nb_state; /* state of bitmap dependency */
- ufs2_daddr_t nb_newblkno; /* allocated block number */
- LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */
- struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */
-};
-
-/*
* A "bmsafemap" structure maintains a list of dependency structures
* that depend on the update of a particular cylinder group map.
* It has lists for newblks, allocdirects, allocindirs, and inodedeps.
@@ -299,11 +318,41 @@ struct newblk {
*/
struct bmsafemap {
struct worklist sm_list; /* cylgrp buffer */
+# define sm_state sm_list.wk_state
+ int sm_cg;
+ LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */
struct buf *sm_buf; /* associated buffer */
struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */
+ struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */
struct allocindirhd sm_allocindirhd; /* allocindir deps */
+ struct allocindirhd sm_allocindirwr; /* writing allocindir deps */
struct inodedephd sm_inodedephd; /* inodedep deps */
+ struct inodedephd sm_inodedepwr; /* writing inodedep deps */
struct newblkhd sm_newblkhd; /* newblk deps */
+ struct newblkhd sm_newblkwr; /* writing newblk deps */
+ struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */
+ struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */
+};
+
+/*
+ * A "newblk" structure is attached to a bmsafemap structure when a block
+ * or fragment is allocated from a cylinder group. Its state is set to
+ * DEPCOMPLETE when its cylinder group map is written. It is converted to
+ * an allocdirect or allocindir allocation once the allocator calls the
+ * appropriate setup function.
+ */
+struct newblk {
+ struct worklist nb_list;
+# define nb_state nb_list.wk_state
+ LIST_ENTRY(newblk) nb_hash; /* hashed lookup */
+ LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblks */
+ struct jnewblk *nb_jnewblk; /* New block journal entry. */
+ struct bmsafemap *nb_bmsafemap;/* cylgrp dep (if pending) */
+ struct freefrag *nb_freefrag; /* fragment to be freed (if any) */
+ struct indirdephd nb_indirdeps; /* Children indirect blocks. */
+ struct workhead nb_newdirblk; /* dir block to notify when written */
+ struct workhead nb_jwork; /* Journal work pending. */
+ ufs2_daddr_t nb_newblkno; /* new value of block pointer */
};
/*
@@ -334,20 +383,18 @@ struct bmsafemap {
* and inodedep->id_pendinghd lists.
*/
struct allocdirect {
- struct worklist ad_list; /* buffer holding block */
-# define ad_state ad_list.wk_state /* block pointer state */
+ struct newblk ad_block; /* Common block logic */
+# define ad_state ad_block.nb_list.wk_state /* block pointer state */
TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */
- ufs_lbn_t ad_lbn; /* block within file */
- ufs2_daddr_t ad_newblkno; /* new value of block pointer */
- ufs2_daddr_t ad_oldblkno; /* old value of block pointer */
- long ad_newsize; /* size of new block */
- long ad_oldsize; /* size of old block */
- LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */
- struct buf *ad_buf; /* cylgrp buffer (if pending) */
struct inodedep *ad_inodedep; /* associated inodedep */
- struct freefrag *ad_freefrag; /* fragment to be freed (if any) */
- struct workhead ad_newdirblk; /* dir block to notify when written */
+ ufs2_daddr_t ad_oldblkno; /* old value of block pointer */
+ int ad_offset; /* Pointer offset in parent. */
+ long ad_newsize; /* size of new block */
+ long ad_oldsize; /* size of old block */
};
+#define ad_newblkno ad_block.nb_newblkno
+#define ad_freefrag ad_block.nb_freefrag
+#define ad_newdirblk ad_block.nb_newdirblk
/*
* A single "indirdep" structure manages all allocation dependencies for
@@ -369,10 +416,14 @@ struct allocdirect {
struct indirdep {
struct worklist ir_list; /* buffer holding indirect block */
# define ir_state ir_list.wk_state /* indirect block pointer state */
- caddr_t ir_saveddata; /* buffer cache contents */
+ LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */
+ caddr_t ir_saveddata; /* buffer cache contents */
struct buf *ir_savebp; /* buffer holding safe copy */
+ struct allocindirhd ir_completehd; /* waiting for indirdep complete */
+ struct allocindirhd ir_writehd; /* Waiting for the pointer write. */
struct allocindirhd ir_donehd; /* done waiting to update safecopy */
struct allocindirhd ir_deplisthd; /* allocindir deps for this block */
+ struct workhead ir_jwork; /* Journal work pending. */
};
/*
@@ -389,16 +440,25 @@ struct indirdep {
* can then be freed as it is no longer applicable.
*/
struct allocindir {
- struct worklist ai_list; /* buffer holding indirect block */
-# define ai_state ai_list.wk_state /* indirect block pointer state */
+ struct newblk ai_block; /* Common block area */
+# define ai_state ai_block.nb_list.wk_state /* indirect pointer state */
LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */
- int ai_offset; /* pointer offset in indirect block */
- ufs2_daddr_t ai_newblkno; /* new block pointer value */
- ufs2_daddr_t ai_oldblkno; /* old block pointer value */
- struct freefrag *ai_freefrag; /* block to be freed when complete */
struct indirdep *ai_indirdep; /* address of associated indirdep */
- LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */
- struct buf *ai_buf; /* cylgrp buffer (if pending) */
+ ufs2_daddr_t ai_oldblkno; /* old value of block pointer */
+ int ai_offset; /* Pointer offset in parent. */
+};
+#define ai_newblkno ai_block.nb_newblkno
+#define ai_freefrag ai_block.nb_freefrag
+#define ai_newdirblk ai_block.nb_newdirblk
+
+/*
+ * The allblk union is used to size the newblk structure on allocation so
+ * that it may be any one of three types.
+ */
+union allblk {
+ struct allocindir ab_allocindir;
+ struct allocdirect ab_allocdirect;
+ struct newblk ab_newblk;
};
/*
@@ -406,14 +466,13 @@ struct allocindir {
* allocated fragment is replaced with a larger fragment, rather than extended.
* The "freefrag" structure is constructed and attached when the replacement
* block is first allocated. It is processed after the inode claiming the
- * bigger block that replaces it has been written to disk. Note that the
- * ff_state field is is used to store the uid, so may lose data. However,
- * the uid is used only in printing an error message, so is not critical.
- * Keeping it in a short keeps the data structure down to 32 bytes.
+ * bigger block that replaces it has been written to disk.
*/
struct freefrag {
struct worklist ff_list; /* id_inowait or delayed worklist */
-# define ff_state ff_list.wk_state /* owning user; should be uid_t */
+# define ff_state ff_list.wk_state
+ struct jfreefrag *ff_jfreefrag; /* Associated journal entry. */
+ struct workhead ff_jwork; /* Journal work pending. */
ufs2_daddr_t ff_blkno; /* fragment physical block number */
long ff_fragsize; /* size of fragment being deleted */
ino_t ff_inum; /* owning inode number */
@@ -423,20 +482,57 @@ struct freefrag {
* A "freeblks" structure is attached to an "inodedep" when the
* corresponding file's length is reduced to zero. It records all
* the information needed to free the blocks of a file after its
- * zero'ed inode has been written to disk.
+ * zero'ed inode has been written to disk. The actual work is done
+ * by child freework structures which are responsible for individual
+ * inode pointers while freeblks is responsible for retiring the
+ * entire operation when it is complete and holding common members.
*/
struct freeblks {
struct worklist fb_list; /* id_inowait or delayed worklist */
# define fb_state fb_list.wk_state /* inode and dirty block state */
+ struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */
+ struct workhead fb_freeworkhd; /* Work items pending */
+ struct workhead fb_jwork; /* Journal work pending */
ino_t fb_previousinum; /* inode of previous owner of blocks */
uid_t fb_uid; /* uid of previous owner of blocks */
struct vnode *fb_devvp; /* filesystem device vnode */
- long fb_oldextsize; /* previous ext data size */
- off_t fb_oldsize; /* previous file size */
ufs2_daddr_t fb_chkcnt; /* used to check cnt of blks released */
- ufs2_daddr_t fb_dblks[NDADDR]; /* direct blk ptrs to deallocate */
- ufs2_daddr_t fb_iblks[NIADDR]; /* indirect blk ptrs to deallocate */
- ufs2_daddr_t fb_eblks[NXADDR]; /* indirect blk ptrs to deallocate */
+ int fb_ref; /* Children outstanding. */
+};
+
+/*
+ * A "freework" structure handles the release of a tree of blocks or a single
+ * block. Each indirect block in a tree is allocated its own freework
+ * structure so that the indrect block may be freed only when all of its
+ * children are freed. In this way we enforce the rule that an allocated
+ * block must have a valid path to a root that is journaled. Each child
+ * block acquires a reference and when the ref hits zero the parent ref
+ * is decremented. If there is no parent the freeblks ref is decremented.
+ */
+struct freework {
+ struct worklist fw_list;
+# define fw_state fw_list.wk_state
+ LIST_ENTRY(freework) fw_next; /* Queue for freeblks. */
+ struct freeblks *fw_freeblks; /* Root of operation. */
+ struct freework *fw_parent; /* Parent indirect. */
+ ufs2_daddr_t fw_blkno; /* Our block #. */
+ ufs_lbn_t fw_lbn; /* Original lbn before free. */
+ int fw_frags; /* Number of frags. */
+ int fw_ref; /* Number of children out. */
+ int fw_off; /* Current working position. */
+ struct workhead fw_jwork; /* Journal work pending. */
+};
+
+/*
+ * A "freedep" structure is allocated to track the completion of a bitmap
+ * write for a freework. One freedep may cover many freed blocks so long
+ * as they reside in the same cylinder group. When the cg is written
+ * the freedep decrements the ref on the freework which may permit it
+ * to be freed as well.
+ */
+struct freedep {
+ struct worklist fd_list;
+ struct freework *fd_freework; /* Parent freework. */
};
/*
@@ -450,6 +546,7 @@ struct freefile {
mode_t fx_mode; /* mode of inode */
ino_t fx_oldinum; /* inum of the unlinked file */
struct vnode *fx_devvp; /* filesystem device vnode */
+ struct workhead fx_jwork; /* journal work pending. */
};
/*
@@ -482,12 +579,11 @@ struct freefile {
* than zero.
*
* The overlaying of da_pagedep and da_previous is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. If a
- * da_previous entry is present, the pointer to its pagedep is available
- * in the associated dirrem entry. If the DIRCHG flag is set, the
- * da_previous entry is valid; if not set the da_pagedep entry is valid.
- * The DIRCHG flag never changes; it is set when the structure is created
- * if appropriate and is never cleared.
+ * structure down. If a da_previous entry is present, the pointer to its
+ * pagedep is available in the associated dirrem entry. If the DIRCHG flag
+ * is set, the da_previous entry is valid; if not set the da_pagedep entry
+ * is valid. The DIRCHG flag never changes; it is set when the structure
+ * is created if appropriate and is never cleared.
*/
struct diradd {
struct worklist da_list; /* id_inowait or id_pendinghd list */
@@ -499,6 +595,7 @@ struct diradd {
struct dirrem *dau_previous; /* entry being replaced in dir change */
struct pagedep *dau_pagedep; /* pagedep dependency for addition */
} da_un;
+ struct workhead da_jwork; /* Journal work awaiting completion. */
};
#define da_previous da_un.dau_previous
#define da_pagedep da_un.dau_pagedep
@@ -525,12 +622,13 @@ struct diradd {
* mkdir structures that reference it. The deletion would be faster if the
* diradd structure were simply augmented to have two pointers that referenced
* the associated mkdir's. However, this would increase the size of the diradd
- * structure from 32 to 64-bits to speed a very infrequent operation.
+ * structure to speed a very infrequent operation.
*/
struct mkdir {
struct worklist md_list; /* id_inowait or buffer holding dir */
# define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */
struct diradd *md_diradd; /* associated diradd */
+ struct jaddref *md_jaddref; /* dependent jaddref. */
struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */
LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */
};
@@ -542,20 +640,19 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
* list of the pagedep for the directory page that contains the entry.
* It is processed after the directory page with the deleted entry has
* been written to disk.
- *
- * The overlaying of dm_pagedep and dm_dirinum is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. It works
- * because they are never used concurrently.
*/
struct dirrem {
struct worklist dm_list; /* delayed worklist */
# define dm_state dm_list.wk_state /* state of the old directory entry */
LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */
+ LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */
+ struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */
ino_t dm_oldinum; /* inum of the removed dir entry */
union {
struct pagedep *dmu_pagedep; /* pagedep dependency for remove */
ino_t dmu_dirinum; /* parent inode number (for rmdir) */
} dm_un;
+ struct workhead dm_jwork; /* Journal work awaiting completion. */
};
#define dm_pagedep dm_un.dmu_pagedep
#define dm_dirinum dm_un.dmu_dirinum
@@ -577,9 +674,200 @@ struct dirrem {
* blocks using a similar scheme with the allocindir structures. Rather
* than adding this level of complexity, we simply write those newly
* allocated indirect blocks synchronously as such allocations are rare.
+ * For a new directory, the . and .. links are tracked with a mkdir
+ * rather than a pagedep; the newdirblk tracks that mkdir so it can be
+ * released once the directory block is written. A workhead is used
+ * to simplify canceling a mkdir that is removed by a subsequent dirrem.
*/
struct newdirblk {
struct worklist db_list; /* id_inowait or pg_newdirblk */
# define db_state db_list.wk_state /* unused */
struct pagedep *db_pagedep; /* associated pagedep */
+ struct workhead db_mkdir;
+};
+
+/*
+ * The inoref structure holds the elements common to jaddref and jremref
+ * so they may easily be queued in-order on the inodedep.
+ */
+struct inoref {
+ struct worklist if_list;
+# define if_state if_list.wk_state
+ TAILQ_ENTRY(inoref) if_deps; /* Links for inodedep. */
+ struct jsegdep *if_jsegdep;
+ off_t if_diroff; /* Directory offset. */
+ ino_t if_ino; /* Inode number. */
+ ino_t if_parent; /* Parent inode number. */
+ nlink_t if_nlink; /* nlink before addition. */
+ uint16_t if_mode; /* File mode, needed for IFMT. */
+};
+
+/*
+ * A "jaddref" structure tracks a new reference (link count) on an inode
+ * and prevents the link count increase and bitmap allocation until a
+ * journal entry can be written. Once the journal entry is written,
+ * the inode is put on the pendinghd of the bmsafemap and a diradd or
+ * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE
+ * flag is used to indicate that all of the required information for writing
+ * the journal entry is present. MKDIR_BODY and MKDIR_PARENT are used to
+ * differentiate . and .. links from regular file names. NEWBLOCK indicates
+ * a bitmap is still pending. If a new reference is canceled by a delete
+ * prior to writing the journal the jaddref write is canceled and the
+ * structure persists to prevent any disk-visible changes until it is
+ * ultimately released when the file is freed or the link is dropped again.
+ */
+struct jaddref {
+ struct inoref ja_ref;
+# define ja_list ja_ref.if_list /* Journal pending or jseg entries. */
+# define ja_state ja_ref.if_list.wk_state
+ LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */
+ union {
+ struct diradd *jau_diradd; /* Pending diradd. */
+ struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */
+ } ja_un;
+};
+#define ja_diradd ja_un.jau_diradd
+#define ja_mkdir ja_un.jau_mkdir
+#define ja_diroff ja_ref.if_diroff
+#define ja_ino ja_ref.if_ino
+#define ja_parent ja_ref.if_parent
+#define ja_mode ja_ref.if_mode
+
+/*
+ * A "jremref" structure tracks a removed reference (unlink) on an
+ * inode and prevents the directory remove from proceeding until the
+ * journal entry is written. Once the journal has been written the remove
+ * may proceed as normal.
+ */
+struct jremref {
+ struct inoref jr_ref;
+# define jr_list jr_ref.if_list /* Journal pending or jseg entries. */
+# define jr_state jr_ref.if_list.wk_state
+ LIST_ENTRY(jremref) jr_deps; /* Links for pagedep. */
+ struct dirrem *jr_dirrem; /* Back pointer to dirrem. */
+};
+
+struct jmvref {
+ struct worklist jm_list;
+ LIST_ENTRY(jmvref) jm_deps;
+ struct pagedep *jm_pagedep;
+ ino_t jm_parent;
+ ino_t jm_ino;
+ off_t jm_oldoff;
+ off_t jm_newoff;
+};
+
+/*
+ * A "jnewblk" structure tracks a newly allocated block or fragment and
+ * prevents the direct or indirect block pointer as well as the cg bitmap
+ * from being written until it is logged. After it is logged the jsegdep
+ * is attached to the allocdirect or allocindir until the operation is
+ * completed or reverted. If the operation is reverted prior to the journal
+ * write the jnewblk structure is maintained to prevent the bitmaps from
+ * reaching the disk. Ultimately the jnewblk structure will be passed
+ * to the free routine as the in memory cg is modified back to the free
+ * state at which time it can be released.
+ */
+struct jnewblk {
+ struct worklist jn_list;
+# define jn_state jn_list.wk_state
+ struct jsegdep *jn_jsegdep;
+ LIST_ENTRY(jnewblk) jn_deps; /* All jnewblks on bmsafemap */
+ struct newblk *jn_newblk;
+ ino_t jn_ino;
+ ufs_lbn_t jn_lbn;
+ ufs2_daddr_t jn_blkno;
+ int jn_oldfrags;
+ int jn_frags;
+};
+
+/*
+ * A "jfreeblk" structure tracks the journal write for freeing a block
+ * or tree of blocks. The block pointer must not be cleared in the inode
+ * or indirect prior to the jfreeblk being written.
+ */
+struct jfreeblk {
+ struct worklist jf_list;
+# define jf_state jf_list.wk_state
+ struct jsegdep *jf_jsegdep;
+ struct freeblks *jf_freeblks;
+ LIST_ENTRY(jfreeblk) jf_deps;
+ ino_t jf_ino;
+ ufs_lbn_t jf_lbn;
+ ufs2_daddr_t jf_blkno;
+ int jf_frags;
+};
+
+/*
+ * A "jfreefrag" tracks the freeing of a single block when a fragment is
+ * extended or an indirect page is replaced. It is not part of a larger
+ * freeblks operation.
+ */
+struct jfreefrag {
+ struct worklist fr_list;
+# define fr_state fr_list.wk_state
+ struct jsegdep *fr_jsegdep;
+ struct freefrag *fr_freefrag;
+ ino_t fr_ino;
+ ufs_lbn_t fr_lbn;
+ ufs2_daddr_t fr_blkno;
+ int fr_frags;
+};
+
+/*
+ * A "jtrunc" journals the intent to truncate an inode to a non-zero
+ * value. This is done synchronously prior to the synchronous partial
+ * truncation process. The jsegdep is not released until the truncation
+ * is complete and the truncated inode is fsync'd.
+ */
+struct jtrunc {
+ struct worklist jt_list;
+ struct jsegdep *jt_jsegdep;
+ ino_t jt_ino;
+ off_t jt_size;
+ int jt_extsize;
+};
+
+/*
+ * A "jsegdep" structure tracks a single reference to a written journal
+ * segment so the journal space can be reclaimed when all dependencies
+ * have been written.
+ */
+struct jsegdep {
+ struct worklist jd_list;
+# define jd_state jd_list.wk_state
+ struct jseg *jd_seg;
+};
+
+/*
+ * A "jseg" structure contains all of the journal records written in a
+ * single disk write. jaddref and jremref structures are linked into
+ * js_entries so they may be completed when the write completes. The
+ * js_deps array contains as many entries as there are ref counts to
+ * reduce the number of allocations required per journal write to one.
+ */
+struct jseg {
+ struct worklist js_list; /* b_deps link for journal */
+# define js_state js_list.wk_state
+ struct workhead js_entries; /* Entries awaiting write */
+ TAILQ_ENTRY(jseg) js_next;
+ struct jblocks *js_jblocks; /* Back pointer to block/seg list */
+ struct buf *js_buf; /* Buffer while unwritten */
+ uint64_t js_seq;
+ int js_size; /* Allocated size in bytes */
+ int js_cnt; /* Total items allocated */
+ int js_refs; /* Count of items pending completion */
+};
+
+/*
+ * A 'sbdep' structure tracks the head of the unlinked inode list and
+ * superblock writes. This makes sure the superblock is always pointing at
+ * the first possible unlinked inode for the suj recovery process. If a
+ * block write completes and we discover a new head is available the buf
+ * is dirtied and the dep is kept.
+ */
+struct sbdep {
+ struct worklist sb_list; /* b_dep linkage */
+ struct fs *sb_fs; /* Filesystem pointer within buf. */
+ struct ufsmount *sb_ump;
};
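
The freework comment above describes a reference-count cascade: each child
decrements its parent when it completes, and a root with no parent
decrements the freeblks. A minimal sketch of that rule, assuming a
hypothetical handle_complete_freeblks() helper (the real logic lives in
ffs_softdep.c):

	static void
	freework_release(struct freework *fw)
	{
		struct freeblks *fb = fw->fw_freeblks;

		if (--fw->fw_ref > 0)
			return;			/* children still outstanding */
		if (fw->fw_parent != NULL)
			freework_release(fw->fw_parent); /* cascade up the tree */
		else if (--fb->fb_ref == 0)
			handle_complete_freeblks(fb);	/* hypothetical: retire op */
	}
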
diff --git a/sys/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h
index 7f9e7c56496e..c75257c8e62d 100644
--- a/sys/ufs/ufs/dinode.h
+++ b/sys/ufs/ufs/dinode.h
@@ -146,7 +146,8 @@ struct ufs2_dinode {
ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */
ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */
u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */
- int64_t di_spare[2]; /* 240: Reserved; currently unused */
+ ino_t di_freelink; /* 240: SUJ: Next unlinked inode. */
+ uint32_t di_spare[3]; /* 244: Reserved; currently unused */
};
/*
@@ -167,9 +168,7 @@ struct ufs2_dinode {
struct ufs1_dinode {
u_int16_t di_mode; /* 0: IFMT, permissions; see below. */
int16_t di_nlink; /* 2: File link count. */
- union {
- u_int16_t oldids[2]; /* 4: Ffs: old user and group ids. */
- } di_u;
+ ino_t di_freelink; /* 4: SUJ: Next unlinked inode. */
u_int64_t di_size; /* 8: File byte count. */
int32_t di_atime; /* 16: Last access time. */
int32_t di_atimensec; /* 20: Last access time. */
@@ -186,7 +185,5 @@ struct ufs1_dinode {
u_int32_t di_gid; /* 116: File group. */
u_int64_t di_modrev; /* 120: i_modrev for NFSv4 */
};
-#define di_ogid di_u.oldids[1]
-#define di_ouid di_u.oldids[0]
#endif /* _UFS_UFS_DINODE_H_ */
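
The repurposed di_freelink field turns the on-disk inodes into a singly
linked list of unlinked-but-referenced files. A sketch of how a recovery
pass might walk that chain, assuming hypothetical read_dinode() and
clear_unlinked_inode() helpers (this is not fsck_ffs code):

	void
	walk_unlinked(ino_t head)
	{
		struct ufs2_dinode dip;
		ino_t ino, next;

		for (ino = head; ino != 0; ino = next) {
			read_dinode(ino, &dip);		/* hypothetical: load dinode */
			next = dip.di_freelink;
			clear_unlinked_inode(ino);	/* hypothetical: finish removal */
		}
	}

The list head is recorded in the superblock; the sbdep structure above
keeps that pointer valid across superblock writes.
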
diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h
index 565580e60460..295b12975e25 100644
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@@ -120,7 +120,7 @@ struct inode {
#define IN_CHANGE 0x0002 /* Inode change time update request. */
#define IN_UPDATE 0x0004 /* Modification time update request. */
#define IN_MODIFIED 0x0008 /* Inode has been modified. */
-#define IN_RENAME 0x0010 /* Inode is being renamed. */
+#define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */
#define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */
#define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */
#define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the
@@ -175,6 +175,7 @@ struct indir {
/* Determine if soft dependencies are being done */
#define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
#define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
+#define DOINGSUJ(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_SUJ)
/* This overlays the fid structure (see mount.h). */
struct ufid {
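
DOINGSUJ() layers on DOINGSOFTDEP(): a SUJ mount is a soft-updates mount
whose metadata changes are additionally journaled, so callers typically
test the flags in that order. An illustrative sketch only, assuming vp is
a locked UFS vnode:

	if (DOINGSUJ(vp)) {
		/* journaled soft updates: write the intent record first */
	} else if (DOINGSOFTDEP(vp)) {
		/* classic soft updates dependency ordering */
	} else {
		/* synchronous (or MNTK_ASYNC) metadata writes */
	}
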
diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c
index c85fdc8980f6..d7c1d0ddb821 100644
--- a/sys/ufs/ufs/ufs_dirhash.c
+++ b/sys/ufs/ufs/ufs_dirhash.c
@@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$");
static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
-static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
-
static int ufs_mindirhashsize = DIRBLKSIZ * 5;
SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
&ufs_mindirhashsize,
diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h
index b2e4a9757305..6658b663fb14 100644
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *);
int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *,
struct buf *, int *, int *);
int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **);
-int ufs_checkpath(ino_t, struct inode *, struct ucred *);
+int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *);
void ufs_dirbad(struct inode *, doff_t, char *);
int ufs_dirbadentry(struct vnode *, struct direct *, int);
int ufs_dirempty(struct inode *, ino_t, struct ucred *);
@@ -66,9 +66,11 @@ int ufs_extwrite(struct vop_write_args *);
void ufs_makedirentry(struct inode *, struct componentname *,
struct direct *);
int ufs_direnter(struct vnode *, struct vnode *, struct direct *,
- struct componentname *, struct buf *);
+ struct componentname *, struct buf *, int);
int ufs_dirremove(struct vnode *, struct inode *, int, int);
int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int);
+int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *,
+ ino_t *);
int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *);
int ufs_inactive(struct vop_inactive_args *);
int ufs_init(struct vfsconf *);
@@ -81,19 +83,33 @@ vfs_root_t ufs_root;
int ufs_uninit(struct vfsconf *);
int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **);
+#include <sys/sysctl.h>
+SYSCTL_DECL(_vfs_ufs);
+
/*
* Soft update function prototypes.
*/
int softdep_setup_directory_add(struct buf *, struct inode *, off_t,
ino_t, struct buf *, int);
-void softdep_change_directoryentry_offset(struct inode *, caddr_t,
- caddr_t, caddr_t, int);
+void softdep_change_directoryentry_offset(struct buf *, struct inode *,
+ caddr_t, caddr_t, caddr_t, int);
void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int);
void softdep_setup_directory_change(struct buf *, struct inode *,
struct inode *, ino_t, int);
void softdep_change_linkcnt(struct inode *);
void softdep_releasefile(struct inode *);
int softdep_slowdown(struct vnode *);
+void softdep_setup_create(struct inode *, struct inode *);
+void softdep_setup_dotdot_link(struct inode *, struct inode *);
+void softdep_setup_link(struct inode *, struct inode *);
+void softdep_setup_mkdir(struct inode *, struct inode *);
+void softdep_setup_rmdir(struct inode *, struct inode *);
+void softdep_setup_unlink(struct inode *, struct inode *);
+void softdep_revert_create(struct inode *, struct inode *);
+void softdep_revert_dotdot_link(struct inode *, struct inode *);
+void softdep_revert_link(struct inode *, struct inode *);
+void softdep_revert_mkdir(struct inode *, struct inode *);
+void softdep_revert_rmdir(struct inode *, struct inode *);
/*
* Flags to low-level allocation routines. The low 16-bits are reserved
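
The new softdep_setup_*()/softdep_revert_*() pairs replace the single
softdep_change_linkcnt() notification: the setup call declares the intent
before the directory write, and the revert call undoes it if the write
fails. A sketch of the caller-side bracket, mirroring ufs_link() further
below (error handling trimmed for illustration):

	ip->i_effnlink++;
	ip->i_nlink++;
	DIP_SET(ip, i_nlink, ip->i_nlink);
	ip->i_flag |= IN_CHANGE;
	if (DOINGSOFTDEP(vp))
		softdep_setup_link(dp, ip);		/* declare the intent */
	error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
	if (error) {
		ip->i_effnlink--;
		ip->i_nlink--;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_flag |= IN_CHANGE;
		if (DOINGSOFTDEP(vp))
			softdep_revert_link(dp, ip);	/* undo the intent */
	}
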
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
index b0247e77d0d8..0030c5264bd1 100644
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
/* true if old FS format...*/
#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0)
-static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *,
- ino_t *);
-
static int
ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred,
struct thread *td)
@@ -189,11 +186,11 @@ ufs_lookup(ap)
} */ *ap;
{
- return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
+ return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
}
-static int
-ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
+int
+ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
ino_t *dd_ino)
{
struct inode *dp; /* inode for directory being searched */
@@ -524,6 +521,8 @@ notfound:
return (ENOENT);
found:
+ if (dd_ino != NULL)
+ *dd_ino = ino;
if (numdirpasses == 2)
nchstats.ncs_pass2++;
/*
@@ -546,11 +545,6 @@ found:
if ((flags & ISLASTCN) && nameiop == LOOKUP)
dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1);
- if (dd_ino != NULL) {
- *dd_ino = ino;
- return (0);
- }
-
/*
* If deleting, and at end of pathname, return
* parameters which can be used to remove file.
@@ -558,17 +552,6 @@ found:
if (nameiop == DELETE && (flags & ISLASTCN)) {
if (flags & LOCKPARENT)
ASSERT_VOP_ELOCKED(vdp, __FUNCTION__);
- if ((error = VFS_VGET(vdp->v_mount, ino,
- LK_EXCLUSIVE, &tdp)) != 0)
- return (error);
-
- error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
- if (error) {
- vput(tdp);
- return (error);
- }
-
-
/*
* Return pointer to current entry in dp->i_offset,
* and distance past previous entry (if there
@@ -585,6 +568,16 @@ found:
dp->i_count = 0;
else
dp->i_count = dp->i_offset - prevoff;
+ if (dd_ino != NULL)
+ return (0);
+ if ((error = VFS_VGET(vdp->v_mount, ino,
+ LK_EXCLUSIVE, &tdp)) != 0)
+ return (error);
+ error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
+ if (error) {
+ vput(tdp);
+ return (error);
+ }
if (dp->i_number == ino) {
VREF(vdp);
*vpp = vdp;
@@ -616,6 +609,8 @@ found:
dp->i_offset = i_offset;
if (dp->i_number == ino)
return (EISDIR);
+ if (dd_ino != NULL)
+ return (0);
if ((error = VFS_VGET(vdp->v_mount, ino,
LK_EXCLUSIVE, &tdp)) != 0)
return (error);
@@ -650,6 +645,8 @@ found:
cnp->cn_flags |= SAVENAME;
return (0);
}
+ if (dd_ino != NULL)
+ return (0);
/*
* Step through the translation in the name. We do not `vput' the
@@ -681,7 +678,7 @@ found:
* to the inode we looked up before vdp lock was
* dropped.
*/
- error = ufs_lookup_(pdp, NULL, cnp, &ino1);
+ error = ufs_lookup_ino(pdp, NULL, cnp, &ino1);
if (error) {
vput(tdp);
return (error);
@@ -704,6 +701,14 @@ found:
vn_lock(vdp, LK_UPGRADE | LK_RETRY);
else /* if (ltype == LK_SHARED) */
vn_lock(vdp, LK_DOWNGRADE | LK_RETRY);
+ /*
+ * Relocking for the "." case may have left us with
+ * a reclaimed vnode.
+ */
+ if (vdp->v_iflag & VI_DOOMED) {
+ vrele(vdp);
+ return (ENOENT);
+ }
}
*vpp = vdp;
} else {
@@ -825,12 +830,13 @@ ufs_makedirentry(ip, cnp, newdirp)
* soft dependency code).
*/
int
-ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
+ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename)
struct vnode *dvp;
struct vnode *tvp;
struct direct *dirp;
struct componentname *cnp;
struct buf *newdirbp;
+ int isrename;
{
struct ucred *cr;
struct thread *td;
@@ -903,22 +909,28 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
blkoff += DIRBLKSIZ;
}
if (softdep_setup_directory_add(bp, dp, dp->i_offset,
- dirp->d_ino, newdirbp, 1) == 0) {
- bdwrite(bp);
+ dirp->d_ino, newdirbp, 1))
+ dp->i_flag |= IN_NEEDSYNC;
+ if (newdirbp)
+ bdwrite(newdirbp);
+ bdwrite(bp);
+ if ((dp->i_flag & IN_NEEDSYNC) == 0)
return (UFS_UPDATE(dvp, 0));
- }
- /* We have just allocated a directory block in an
- * indirect block. Rather than tracking when it gets
- * claimed by the inode, we simply do a VOP_FSYNC
- * now to ensure that it is there (in case the user
- * does a future fsync). Note that we have to unlock
- * the inode for the entry that we just entered, as
- * the VOP_FSYNC may need to lock other inodes which
- * can lead to deadlock if we also hold a lock on
- * the newly entered node.
+ /*
+ * We have just allocated a directory block in an
+ * indirect block. We must prevent holes from being
+ * created in the directory if directory entries are
+ * written out of order. To accomplish this we
+ * fsync when we extend a directory into indirects.
+ * During rename it's not safe to drop the tvp lock
+ * so sync must be delayed until it is.
+ *
+ * This synchronous step could be removed if fsck and
+ * the kernel were taught to fill in sparse
+ * directories rather than panic.
*/
- if ((error = bwrite(bp)))
- return (error);
+ if (isrename)
+ return (0);
if (tvp != NULL)
VOP_UNLOCK(tvp, 0);
error = VOP_FSYNC(dvp, MNT_WAIT, td);
@@ -1007,7 +1019,7 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
dp->i_offset + ((char *)ep - dirbuf));
#endif
if (DOINGSOFTDEP(dvp))
- softdep_change_directoryentry_offset(dp, dirbuf,
+ softdep_change_directoryentry_offset(bp, dp, dirbuf,
(caddr_t)nep, (caddr_t)ep, dsize);
else
bcopy((caddr_t)nep, (caddr_t)ep, dsize);
@@ -1059,6 +1071,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
(void) softdep_setup_directory_add(bp, dp,
dp->i_offset + (caddr_t)ep - dirbuf,
dirp->d_ino, newdirbp, 0);
+ if (newdirbp != NULL)
+ bdwrite(newdirbp);
bdwrite(bp);
} else {
if (DOINGASYNC(dvp)) {
@@ -1076,7 +1090,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
* lock other inodes which can lead to deadlock if we also hold a
* lock on the newly entered node.
*/
- if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
+ if (isrename == 0 && error == 0 &&
+ dp->i_endoff && dp->i_endoff < dp->i_size) {
if (tvp != NULL)
VOP_UNLOCK(tvp, 0);
#ifdef UFS_DIRHASH
@@ -1117,6 +1132,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
dp = VTOI(dvp);
+ /*
+ * Adjust the link count early so softdep can block if necessary.
+ */
+ if (ip) {
+ ip->i_effnlink--;
+ if (DOINGSOFTDEP(dvp)) {
+ softdep_setup_unlink(dp, ip);
+ } else {
+ ip->i_nlink--;
+ DIP_SET(ip, i_nlink, ip->i_nlink);
+ ip->i_flag |= IN_CHANGE;
+ }
+ }
if (flags & DOWHITEOUT) {
/*
* Whiteout entry: set d_ino to WINO.
@@ -1146,6 +1174,9 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, rep, dp->i_offset);
#endif
+ if (ip && rep->d_ino != ip->i_number)
+ panic("ufs_dirremove: ip %d does not match dirent ino %d\n",
+ ip->i_number, rep->d_ino);
if (dp->i_count == 0) {
/*
* First entry in block: set d_ino to zero.
@@ -1164,31 +1195,20 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
dp->i_offset & ~(DIRBLKSIZ - 1));
#endif
out:
+ error = 0;
if (DOINGSOFTDEP(dvp)) {
- if (ip) {
- ip->i_effnlink--;
- softdep_change_linkcnt(ip);
+ if (ip)
softdep_setup_remove(bp, dp, ip, isrmdir);
- }
- if (softdep_slowdown(dvp)) {
+ if (softdep_slowdown(dvp))
error = bwrite(bp);
- } else {
+ else
bdwrite(bp);
- error = 0;
- }
} else {
- if (ip) {
- ip->i_effnlink--;
- ip->i_nlink--;
- DIP_SET(ip, i_nlink, ip->i_nlink);
- ip->i_flag |= IN_CHANGE;
- }
if (flags & DOWHITEOUT)
error = bwrite(bp);
- else if (DOINGASYNC(dvp) && dp->i_count != 0) {
+ else if (DOINGASYNC(dvp) && dp->i_count != 0)
bdwrite(bp);
- error = 0;
- } else
+ else
error = bwrite(bp);
}
dp->i_flag |= IN_CHANGE | IN_UPDATE;
@@ -1221,6 +1241,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
struct vnode *vdp = ITOV(dp);
int error;
+ /*
+ * Drop the link before we lock the buf so softdep can block if
+ * necessary.
+ */
+ oip->i_effnlink--;
+ if (DOINGSOFTDEP(vdp)) {
+ softdep_setup_unlink(dp, oip);
+ } else {
+ oip->i_nlink--;
+ DIP_SET(oip, i_nlink, oip->i_nlink);
+ oip->i_flag |= IN_CHANGE;
+ }
+
error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp);
if (error)
return (error);
@@ -1232,15 +1265,10 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
ep->d_ino = newinum;
if (!OFSFMT(vdp))
ep->d_type = newtype;
- oip->i_effnlink--;
if (DOINGSOFTDEP(vdp)) {
- softdep_change_linkcnt(oip);
softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
bdwrite(bp);
} else {
- oip->i_nlink--;
- DIP_SET(oip, i_nlink, oip->i_nlink);
- oip->i_flag |= IN_CHANGE;
if (DOINGASYNC(vdp)) {
bdwrite(bp);
error = 0;
@@ -1355,25 +1383,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino)
/*
* Check if source directory is in the path of the target directory.
- * Target is supplied locked, source is unlocked.
- * The target is always vput before returning.
*/
int
-ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
+ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino)
{
- struct vnode *vp, *vp1;
+ struct mount *mp;
+ struct vnode *tvp, *vp, *vp1;
int error;
ino_t dd_ino;
- vp = ITOV(target);
- if (target->i_number == source_ino) {
- error = EEXIST;
- goto out;
- }
- error = 0;
+ vp = tvp = ITOV(target);
+ mp = vp->v_mount;
+ *wait_ino = 0;
+ if (target->i_number == source_ino)
+ return (EEXIST);
+ if (target->i_number == parent_ino)
+ return (0);
if (target->i_number == ROOTINO)
- goto out;
-
+ return (0);
+ error = 0;
for (;;) {
error = ufs_dir_dd_ino(vp, cred, &dd_ino);
if (error != 0)
@@ -1384,9 +1412,13 @@ ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
}
if (dd_ino == ROOTINO)
break;
- error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1);
- if (error != 0)
+ if (dd_ino == parent_ino)
break;
+ error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1);
+ if (error != 0) {
+ *wait_ino = dd_ino;
+ break;
+ }
/* Recheck that ".." still points to vp1 after relock of vp */
error = ufs_dir_dd_ino(vp, cred, &dd_ino);
if (error != 0) {
@@ -1398,14 +1430,14 @@ ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
vput(vp1);
continue;
}
- vput(vp);
+ if (vp != tvp)
+ vput(vp);
vp = vp1;
}
-out:
if (error == ENOTDIR)
- printf("checkpath: .. not a directory\n");
- if (vp != NULL)
+ panic("checkpath: .. not a directory\n");
+ if (vp != tvp)
vput(vp);
return (error);
}
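
The reworked ufs_checkpath() no longer sleeps for vnode locks while the
rename locks are held; on contention it returns with *wait_ino set and
lets the caller back off. A sketch of the resulting caller protocol,
mirroring ufs_rename() below (unlock_rename_locks() is a hypothetical
stand-in for the individual VOP_UNLOCK calls):

	error = ufs_checkpath(ino, fdp->i_number, tdp, cred, &wait_ino);
	if (wait_ino != 0) {
		unlock_rename_locks();		/* drop everything we hold */
		if (VFS_VGET(mp, wait_ino, LK_SHARED, &nvp) == 0)
			vput(nvp);		/* acquire/release: wait, don't spin */
		goto relock;			/* restart the whole lock dance */
	}
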
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 9d4d93dbc8fe..f8d45cfceb8e 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -114,6 +114,8 @@ static vop_close_t ufsfifo_close;
static vop_kqfilter_t ufsfifo_kqfilter;
static vop_pathconf_t ufsfifo_pathconf;
+SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
+
/*
* A virgin directory (no blushing please).
*/
@@ -974,6 +976,9 @@ ufs_link(ap)
error = EXDEV;
goto out;
}
+ if (VTOI(tdvp)->i_effnlink < 2)
+ panic("ufs_link: Bad link count %d on parent",
+ VTOI(tdvp)->i_effnlink);
ip = VTOI(vp);
if ((nlink_t)ip->i_nlink >= LINK_MAX) {
error = EMLINK;
@@ -988,11 +993,11 @@ ufs_link(ap)
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vp))
- softdep_change_linkcnt(ip);
+ softdep_setup_link(VTOI(tdvp), ip);
error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp)));
if (!error) {
ufs_makedirentry(ip, cnp, &newdir);
- error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL);
+ error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
}
if (error) {
@@ -1001,7 +1006,7 @@ ufs_link(ap)
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vp))
- softdep_change_linkcnt(ip);
+ softdep_revert_link(VTOI(tdvp), ip);
}
out:
return (error);
@@ -1043,7 +1048,7 @@ ufs_whiteout(ap)
newdir.d_namlen = cnp->cn_namelen;
bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
newdir.d_type = DT_WHT;
- error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL);
+ error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0);
break;
case DELETE:
@@ -1062,6 +1067,11 @@ ufs_whiteout(ap)
return (error);
}
+static volatile int rename_restarts;
+SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD,
+ __DEVOLATILE(int *, &rename_restarts), 0,
+ "Times rename had to restart due to lock contention");
+
/*
* Rename system call.
* rename("foo", "bar");
@@ -1101,111 +1111,183 @@ ufs_rename(ap)
struct vnode *tdvp = ap->a_tdvp;
struct vnode *fvp = ap->a_fvp;
struct vnode *fdvp = ap->a_fdvp;
+ struct vnode *nvp;
struct componentname *tcnp = ap->a_tcnp;
struct componentname *fcnp = ap->a_fcnp;
struct thread *td = fcnp->cn_thread;
- struct inode *ip, *xp, *dp;
+ struct inode *fip, *tip, *tdp, *fdp;
struct direct newdir;
- int doingdirectory = 0, oldparent = 0, newparent = 0;
+ off_t endoff;
+ int doingdirectory, newparent;
int error = 0, ioflag;
- ino_t fvp_ino;
+ struct mount *mp;
+ ino_t ino;
#ifdef INVARIANTS
if ((tcnp->cn_flags & HASBUF) == 0 ||
(fcnp->cn_flags & HASBUF) == 0)
panic("ufs_rename: no name");
#endif
+ endoff = 0;
+ mp = tdvp->v_mount;
+ VOP_UNLOCK(tdvp, 0);
+ if (tvp && tvp != tdvp)
+ VOP_UNLOCK(tvp, 0);
/*
* Check for cross-device rename.
*/
if ((fvp->v_mount != tdvp->v_mount) ||
(tvp && (fvp->v_mount != tvp->v_mount))) {
error = EXDEV;
-abortit:
- if (tdvp == tvp)
- vrele(tdvp);
- else
- vput(tdvp);
- if (tvp)
- vput(tvp);
- vrele(fdvp);
+ mp = NULL;
+ goto releout;
+ }
+ error = vfs_busy(mp, 0);
+ if (error) {
+ mp = NULL;
+ goto releout;
+ }
+relock:
+ /*
+ * We need to acquire 2 to 4 locks depending on whether tvp is NULL
+ * and fdvp and tdvp are the same directory. Subsequently we need
+ * to double-check all paths and in the directory rename case we
+ * need to verify that we are not creating a directory loop. To
+ * handle this we acquire all but fdvp using non-blocking
+ * acquisitions. If we fail to acquire any lock in the path we will
+ * drop all held locks, acquire the new lock in a blocking fashion,
+ * and then release it and restart the rename. This acquire/release
+ * step ensures that we do not spin on a lock waiting for release.
+ */
+ error = vn_lock(fdvp, LK_EXCLUSIVE);
+ if (error)
+ goto releout;
+ if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+ VOP_UNLOCK(fdvp, 0);
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto releout;
+ VOP_UNLOCK(tdvp, 0);
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
+ }
+ /*
+ * Re-resolve fvp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
+ if (error) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ goto releout;
+ }
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+ if (error) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if (error != EBUSY)
+ goto releout;
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+ if (error != 0)
+ goto releout;
+ VOP_UNLOCK(nvp, 0);
vrele(fvp);
- return (error);
+ fvp = nvp;
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
}
-
+ vrele(fvp);
+ fvp = nvp;
+ /*
+ * Re-resolve tvp and acquire the vnode lock if present.
+ */
+ error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino);
+ if (error != 0 && error != EJUSTRETURN) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ VOP_UNLOCK(fvp, 0);
+ goto releout;
+ }
+ /*
+ * If tvp disappeared we just carry on.
+ */
+ if (error == EJUSTRETURN && tvp != NULL) {
+ vrele(tvp);
+ tvp = NULL;
+ }
+ /*
+ * Get the tvp ino if the lookup succeeded. We may have to restart
+ * if the non-blocking acquire fails.
+ */
+ if (error == 0) {
+ nvp = NULL;
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+ if (tvp)
+ vrele(tvp);
+ tvp = nvp;
+ if (error) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ VOP_UNLOCK(fvp, 0);
+ if (error != EBUSY)
+ goto releout;
+ error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+ if (error != 0)
+ goto releout;
+ VOP_UNLOCK(nvp, 0);
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
+ }
+ }
+ fdp = VTOI(fdvp);
+ fip = VTOI(fvp);
+ tdp = VTOI(tdvp);
+ tip = NULL;
+ if (tvp)
+ tip = VTOI(tvp);
if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
(VTOI(tdvp)->i_flags & APPEND))) {
error = EPERM;
- goto abortit;
+ goto unlockout;
}
-
/*
* Renaming a file to itself has no effect. The upper layers should
- * not call us in that case. Temporarily just warn if they do.
+ * not call us in that case. However, things could change after
+ * we drop the locks above.
*/
if (fvp == tvp) {
- printf("ufs_rename: fvp == tvp (can't happen)\n");
error = 0;
- goto abortit;
+ goto unlockout;
}
-
- if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
- goto abortit;
- dp = VTOI(fdvp);
- ip = VTOI(fvp);
- if (ip->i_nlink >= LINK_MAX) {
- VOP_UNLOCK(fvp, 0);
+ doingdirectory = 0;
+ newparent = 0;
+ ino = fip->i_number;
+ if (fip->i_nlink >= LINK_MAX) {
error = EMLINK;
- goto abortit;
+ goto unlockout;
}
- if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
- || (dp->i_flags & APPEND)) {
- VOP_UNLOCK(fvp, 0);
+ if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
+ || (fdp->i_flags & APPEND)) {
error = EPERM;
- goto abortit;
+ goto unlockout;
}
- if ((ip->i_mode & IFMT) == IFDIR) {
+ if ((fip->i_mode & IFMT) == IFDIR) {
/*
* Avoid ".", "..", and aliases of "." for obvious reasons.
*/
if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
- dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
- (ip->i_flag & IN_RENAME)) {
- VOP_UNLOCK(fvp, 0);
+ fdp == fip ||
+ (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
error = EINVAL;
- goto abortit;
+ goto unlockout;
}
- ip->i_flag |= IN_RENAME;
- oldparent = dp->i_number;
+ if (fdp->i_number != tdp->i_number)
+ newparent = tdp->i_number;
doingdirectory = 1;
}
- vrele(fdvp);
-
- /*
- * When the target exists, both the directory
- * and target vnodes are returned locked.
- */
- dp = VTOI(tdvp);
- xp = NULL;
- if (tvp)
- xp = VTOI(tvp);
-
- /*
- * 1) Bump link count while we're moving stuff
- * around. If we crash somewhere before
- * completing our work, the link count
- * may be wrong, but correctable.
- */
- ip->i_effnlink++;
- ip->i_nlink++;
- DIP_SET(ip, i_nlink, ip->i_nlink);
- ip->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(fvp))
- softdep_change_linkcnt(ip);
- if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) |
- DOINGASYNC(fvp)))) != 0) {
- VOP_UNLOCK(fvp, 0);
- goto bad;
+ if (fvp->v_mountedhere != NULL || (tvp && tvp->v_mountedhere != NULL)) {
+ error = EXDEV;
+ goto unlockout;
}
/*
@@ -1214,35 +1296,55 @@ abortit:
* directory hierarchy above the target, as this would
* orphan everything below the source directory. Also
* the user must have write permission in the source so
- * as to be able to change "..". We must repeat the call
- * to namei, as the parent directory is unlocked by the
- * call to checkpath().
+ * as to be able to change "..".
*/
- error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
- fvp_ino = ip->i_number;
- VOP_UNLOCK(fvp, 0);
- if (oldparent != dp->i_number)
- newparent = dp->i_number;
if (doingdirectory && newparent) {
- if (error) /* write access check above */
- goto bad;
- if (xp != NULL)
- vput(tvp);
- error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred);
+ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
if (error)
- goto out;
+ goto unlockout;
+ error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
+ &ino);
+ /*
+ * We encountered a lock that we have to wait for. Unlock
+ * everything else and VGET before restarting.
+ */
+ if (ino) {
+ VOP_UNLOCK(fdvp, 0);
+ VOP_UNLOCK(fvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if (tvp)
+ VOP_UNLOCK(tvp, 0);
+ error = VFS_VGET(mp, ino, LK_SHARED, &nvp);
+ if (error == 0)
+ vput(nvp);
+ atomic_add_int(&rename_restarts, 1);
+ goto relock;
+ }
+ if (error)
+ goto unlockout;
if ((tcnp->cn_flags & SAVESTART) == 0)
panic("ufs_rename: lost to startdir");
- VREF(tdvp);
- error = relookup(tdvp, &tvp, tcnp);
- if (error)
- goto out;
- vrele(tdvp);
- dp = VTOI(tdvp);
- xp = NULL;
- if (tvp)
- xp = VTOI(tvp);
}
+ if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 ||
+ tdp->i_effnlink == 0)
+ panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp);
+
+ /*
+ * 1) Bump link count while we're moving stuff
+ * around. If we crash somewhere before
+ * completing our work, the link count
+ * may be wrong, but correctable.
+ */
+ fip->i_effnlink++;
+ fip->i_nlink++;
+ DIP_SET(fip, i_nlink, fip->i_nlink);
+ fip->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(fvp))
+ softdep_setup_link(tdp, fip);
+ error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)));
+ if (error)
+ goto bad;
+
/*
* 2) If target doesn't exist, link the target
* to the source and unlink the source.
@@ -1250,52 +1352,37 @@ abortit:
* entry to reference the source inode and
* expunge the original entry's existence.
*/
- if (xp == NULL) {
- if (dp->i_dev != ip->i_dev)
+ if (tip == NULL) {
+ if (tdp->i_dev != fip->i_dev)
panic("ufs_rename: EXDEV");
- /*
- * Account for ".." in new directory.
- * When source and destination have the same
- * parent we don't fool with the link count.
- */
if (doingdirectory && newparent) {
- if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+ /*
+ * Account for ".." in new directory.
+ * When source and destination have the same
+ * parent we don't adjust the link count. The
+ * actual link modification is completed when
+ * .. is rewritten below.
+ */
+ if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
error = EMLINK;
goto bad;
}
- dp->i_effnlink++;
- dp->i_nlink++;
- DIP_SET(dp, i_nlink, dp->i_nlink);
- dp->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(tdvp))
- softdep_change_linkcnt(dp);
- error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
- DOINGASYNC(tdvp)));
- if (error)
- goto bad;
}
- ufs_makedirentry(ip, tcnp, &newdir);
- error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL);
- if (error) {
- if (doingdirectory && newparent) {
- dp->i_effnlink--;
- dp->i_nlink--;
- DIP_SET(dp, i_nlink, dp->i_nlink);
- dp->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(tdvp))
- softdep_change_linkcnt(dp);
- (void)UFS_UPDATE(tdvp, 1);
- }
+ ufs_makedirentry(fip, tcnp, &newdir);
+ error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1);
+ if (error)
goto bad;
- }
- vput(tdvp);
+ /* Setup tdvp for directory compaction if needed. */
+ if (tdp->i_count && tdp->i_endoff &&
+ tdp->i_endoff < tdp->i_size)
+ endoff = tdp->i_endoff;
} else {
- if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+ if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev)
panic("ufs_rename: EXDEV");
/*
* Short circuit rename(foo, foo).
*/
- if (xp->i_number == ip->i_number)
+ if (tip->i_number == fip->i_number)
panic("ufs_rename: same file");
/*
* If the parent directory is "sticky", then the caller
@@ -1303,7 +1390,7 @@ abortit:
* destination of the rename. This implements append-only
* directories.
*/
- if ((dp->i_mode & S_ISTXT) &&
+ if ((tdp->i_mode & S_ISTXT) &&
VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
error = EPERM;
@@ -1314,9 +1401,9 @@ abortit:
* to it. Also, ensure source and target are compatible
* (both directories, or both not directories).
*/
- if ((xp->i_mode&IFMT) == IFDIR) {
- if ((xp->i_effnlink > 2) ||
- !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
+ if ((tip->i_mode & IFMT) == IFDIR) {
+ if ((tip->i_effnlink > 2) ||
+ !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) {
error = ENOTEMPTY;
goto bad;
}
@@ -1329,20 +1416,30 @@ abortit:
error = EISDIR;
goto bad;
}
- error = ufs_dirrewrite(dp, xp, ip->i_number,
- IFTODT(ip->i_mode),
- (doingdirectory && newparent) ? newparent : doingdirectory);
- if (error)
- goto bad;
if (doingdirectory) {
if (!newparent) {
- dp->i_effnlink--;
+ tdp->i_effnlink--;
if (DOINGSOFTDEP(tdvp))
- softdep_change_linkcnt(dp);
+ softdep_change_linkcnt(tdp);
}
- xp->i_effnlink--;
+ tip->i_effnlink--;
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(xp);
+ softdep_change_linkcnt(tip);
+ }
+ error = ufs_dirrewrite(tdp, tip, fip->i_number,
+ IFTODT(fip->i_mode),
+ (doingdirectory && newparent) ? newparent : doingdirectory);
+ if (error) {
+ if (doingdirectory) {
+ if (!newparent) {
+ tdp->i_effnlink++;
+ if (DOINGSOFTDEP(tdvp))
+ softdep_change_linkcnt(tdp);
+ }
+ tip->i_effnlink++;
+ if (DOINGSOFTDEP(tvp))
+ softdep_change_linkcnt(tip);
+ }
}
if (doingdirectory && !DOINGSOFTDEP(tvp)) {
/*
@@ -1357,115 +1454,107 @@ abortit:
* them now.
*/
if (!newparent) {
- dp->i_nlink--;
- DIP_SET(dp, i_nlink, dp->i_nlink);
- dp->i_flag |= IN_CHANGE;
+ tdp->i_nlink--;
+ DIP_SET(tdp, i_nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
}
- xp->i_nlink--;
- DIP_SET(xp, i_nlink, xp->i_nlink);
- xp->i_flag |= IN_CHANGE;
+ tip->i_nlink--;
+ DIP_SET(tip, i_nlink, tip->i_nlink);
+ tip->i_flag |= IN_CHANGE;
ioflag = IO_NORMAL;
if (!DOINGASYNC(tvp))
ioflag |= IO_SYNC;
+ /* Don't go to bad here as the new link exists. */
if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag,
tcnp->cn_cred, tcnp->cn_thread)) != 0)
- goto bad;
+ goto unlockout;
}
- vput(tdvp);
- vput(tvp);
- xp = NULL;
}
/*
- * 3) Unlink the source.
+ * 3) Unlink the source. We have to resolve the path again to
+ * fixup the directory offset and count for ufs_dirremove.
*/
- fcnp->cn_flags &= ~MODMASK;
- fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
- if ((fcnp->cn_flags & SAVESTART) == 0)
- panic("ufs_rename: lost from startdir");
- VREF(fdvp);
- error = relookup(fdvp, &fvp, fcnp);
- if (error == 0)
- vrele(fdvp);
- if (fvp != NULL) {
- xp = VTOI(fvp);
- dp = VTOI(fdvp);
- } else {
- /*
- * From name has disappeared. IN_RENAME is not sufficient
- * to protect against directory races due to timing windows,
- * so we have to remove the panic. XXX the only real way
- * to solve this issue is at a much higher level. By the
- * time we hit ufs_rename() it's too late.
- */
-#if 0
- if (doingdirectory)
- panic("ufs_rename: lost dir entry");
-#endif
- vrele(ap->a_fvp);
- return (0);
+ if (fdvp == tdvp) {
+ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
+ if (error)
+ panic("ufs_rename: from entry went away!");
+ if (ino != fip->i_number)
+ panic("ufs_rename: ino mismatch %d != %d\n", ino,
+ fip->i_number);
}
/*
- * Ensure that the directory entry still exists and has not
- * changed while the new name has been entered. If the source is
- * a file then the entry may have been unlinked or renamed. In
- * either case there is no further work to be done. If the source
- * is a directory then it cannot have been rmdir'ed; the IN_RENAME
- * flag ensures that it cannot be moved by another rename or removed
- * by a rmdir.
+ * If the source is a directory with a
+ * new parent, the link count of the old
+ * parent directory must be decremented
+ * and ".." set to point to the new parent.
*/
- if (xp != ip) {
- /*
- * From name resolves to a different inode. IN_RENAME is
- * not sufficient protection against timing window races
- * so we can't panic here. XXX the only real way
- * to solve this issue is at a much higher level. By the
- * time we hit ufs_rename() it's too late.
- */
-#if 0
- if (doingdirectory)
- panic("ufs_rename: lost dir entry");
-#endif
- } else {
+ if (doingdirectory && newparent) {
/*
- * If the source is a directory with a
- * new parent, the link count of the old
- * parent directory must be decremented
- * and ".." set to point to the new parent.
+ * If tip exists we simply use its link, otherwise we must
+ * add a new one.
*/
- if (doingdirectory && newparent) {
- xp->i_offset = mastertemplate.dot_reclen;
- ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0);
- cache_purge(fdvp);
+ if (tip == NULL) {
+ tdp->i_effnlink++;
+ tdp->i_nlink++;
+ DIP_SET(tdp, i_nlink, tdp->i_nlink);
+ tdp->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(tdvp))
+ softdep_setup_dotdot_link(tdp, fip);
+ error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
+ DOINGASYNC(tdvp)));
+ /* Don't go to bad here as the new link exists. */
+ if (error)
+ goto unlockout;
}
- error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
- xp->i_flag &= ~IN_RENAME;
- }
- if (dp)
- vput(fdvp);
- if (xp)
- vput(fvp);
- vrele(ap->a_fvp);
+ fip->i_offset = mastertemplate.dot_reclen;
+ ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0);
+ cache_purge(fdvp);
+ }
+ error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0);
+
+unlockout:
+ vput(fdvp);
+ vput(fvp);
+ if (tvp)
+ vput(tvp);
+ /*
+ * If compaction or fsync was requested do it now that other locks
+ * are no longer needed.
+ */
+ if (error == 0 && endoff != 0) {
+#ifdef UFS_DIRHASH
+ if (tdp->i_dirhash != NULL)
+ ufsdirhash_dirtrunc(tdp, endoff);
+#endif
+ UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred,
+ td);
+ }
+ if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
+ error = VOP_FSYNC(tdvp, MNT_WAIT, td);
+ vput(tdvp);
+ if (mp)
+ vfs_unbusy(mp);
return (error);
bad:
- if (xp)
- vput(ITOV(xp));
- vput(ITOV(dp));
-out:
- if (doingdirectory)
- ip->i_flag &= ~IN_RENAME;
- if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
- ip->i_effnlink--;
- ip->i_nlink--;
- DIP_SET(ip, i_nlink, ip->i_nlink);
- ip->i_flag |= IN_CHANGE;
- ip->i_flag &= ~IN_RENAME;
- if (DOINGSOFTDEP(fvp))
- softdep_change_linkcnt(ip);
- vput(fvp);
- } else
- vrele(fvp);
+ fip->i_effnlink--;
+ fip->i_nlink--;
+ DIP_SET(fip, i_nlink, fip->i_nlink);
+ fip->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(fvp))
+ softdep_revert_link(tdp, fip);
+ goto unlockout;
+
+releout:
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp)
+ vrele(tvp);
+ if (mp)
+ vfs_unbusy(mp);
+
return (error);
}
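
For readers tracking the new control flow: ufs_rename() now unwinds through three labels. unlockout is reached with the vnodes still locked, so vput() drops both the lock and the reference; releout is reached holding only references, so vrele() is used; and bad rolls back the optimistic link-count bump on the from-inode before falling into unlockout. Below is a minimal user-space sketch of the same goto-unwind idiom; struct res and the acquire()/lock_res() helpers are hypothetical stand-ins, not kernel interfaces.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical resource standing in for a referenced/locked vnode. */
struct res {
	int locked;
};

static struct res *acquire(void) { return (calloc(1, sizeof(struct res))); }
static int lock_res(struct res *r) { r->locked = 1; return (0); }
static void unlock_release(struct res *r) { r->locked = 0; free(r); } /* vput */
static void release(struct res *r) { free(r); }                      /* vrele */

static int
do_work(struct res *a, struct res *b)
{
	int error = 0;

	/* Failure before locking: only references held, so "releout". */
	if (lock_res(a) != 0 || lock_res(b) != 0) {
		error = -1;
		goto releout;
	}
	/* Fallible work; on failure undo side effects, then unlock. */
	if (0 /* some failure */) {
		error = -1;
		goto bad;
	}

unlockout:
	unlock_release(a);
	unlock_release(b);
	return (error);

bad:
	/* Roll back any optimistic state changes here, as above. */
	goto unlockout;

releout:
	release(a);
	release(b);
	return (error);
}

int
main(void)
{
	return (do_work(acquire(), acquire()));
}
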
@@ -1767,8 +1856,7 @@ ufs_mkdir(ap)
ip->i_effnlink = 2;
ip->i_nlink = 2;
DIP_SET(ip, i_nlink, 2);
- if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+
if (cnp->cn_flags & ISWHITEOUT) {
ip->i_flags |= UF_OPAQUE;
DIP_SET(ip, i_flags, ip->i_flags);
@@ -1784,8 +1872,8 @@ ufs_mkdir(ap)
DIP_SET(dp, i_nlink, dp->i_nlink);
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(dvp))
- softdep_change_linkcnt(dp);
- error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
+ softdep_setup_mkdir(dp, ip);
+ error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
if (error)
goto bad;
#ifdef MAC
@@ -1863,7 +1951,7 @@ ufs_mkdir(ap)
else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp))))
goto bad;
ufs_makedirentry(ip, cnp, &newdir);
- error = ufs_direnter(dvp, tvp, &newdir, cnp, bp);
+ error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0);
bad:
if (error == 0) {
@@ -1873,8 +1961,6 @@ bad:
dp->i_nlink--;
DIP_SET(dp, i_nlink, dp->i_nlink);
dp->i_flag |= IN_CHANGE;
- if (DOINGSOFTDEP(dvp))
- softdep_change_linkcnt(dp);
/*
* No need to do an explicit VOP_TRUNCATE here, vrele will
* do this for us because we set the link count to 0.
@@ -1884,7 +1970,8 @@ bad:
DIP_SET(ip, i_nlink, 0);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+ softdep_revert_mkdir(dp, ip);
+
vput(tvp);
}
out:
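
The ufs_mkdir() hunks above rely on the classic UFS directory link-count rule: a fresh directory starts at nlink 2 (its own "." plus the name in the parent), and the parent gains one link because the child's ".." points back at it. A toy user-space model of that accounting, assuming nothing beyond the rule itself:

#include <assert.h>
#include <stdio.h>

/* Toy model, not kernel code: nlink bookkeeping for directories. */
struct toy_dir {
	int nlink;
};

static void
toy_mkdir(struct toy_dir *parent, struct toy_dir *child)
{
	child->nlink = 2;	/* "." plus the parent's entry for it */
	parent->nlink++;	/* the child's ".." points at the parent */
}

int
main(void)
{
	struct toy_dir root = { .nlink = 2 };	/* empty directory */
	struct toy_dir a, b;

	toy_mkdir(&root, &a);
	toy_mkdir(&root, &b);
	assert(root.nlink == 4);		/* 2 + two subdirectories */
	assert(a.nlink == 2 && b.nlink == 2);
	printf("root nlink = %d\n", root.nlink);
	return (0);
}

This is also why the error path above decrements the parent only once and zeroes the child's count before vput() reclaims it.
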
@@ -1920,10 +2007,13 @@ ufs_rmdir(ap)
* tries to remove a locally mounted on directory).
*/
error = 0;
- if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) {
+ if (ip->i_effnlink < 2) {
error = EINVAL;
goto out;
}
+ if (dp->i_effnlink < 3)
+ panic("ufs_dirrem: Bad link count %d on parent",
+ dp->i_effnlink);
if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
error = ENOTEMPTY;
goto out;
@@ -1947,18 +2037,14 @@ ufs_rmdir(ap)
*/
dp->i_effnlink--;
ip->i_effnlink--;
- if (DOINGSOFTDEP(vp)) {
- softdep_change_linkcnt(dp);
- softdep_change_linkcnt(ip);
- }
+ if (DOINGSOFTDEP(vp))
+ softdep_setup_rmdir(dp, ip);
error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
if (error) {
dp->i_effnlink++;
ip->i_effnlink++;
- if (DOINGSOFTDEP(vp)) {
- softdep_change_linkcnt(dp);
- softdep_change_linkcnt(ip);
- }
+ if (DOINGSOFTDEP(vp))
+ softdep_revert_rmdir(dp, ip);
goto out;
}
cache_purge(dvp);
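
ufs_rmdir() now decrements both effective link counts before attempting the removal and restores them, alongside a softdep_revert_rmdir() notification, when ufs_dirremove() fails. Below is a hedged user-space sketch of that optimistic-update-with-rollback pattern; setup() and revert() are placeholders for the softdep_setup_rmdir()/softdep_revert_rmdir() hooks:

#include <stdio.h>

struct node {
	int effnlink;	/* effective link count, as in struct inode */
};

/* Placeholder notification hooks. */
static void setup(struct node *dp, struct node *ip) { (void)dp; (void)ip; }
static void revert(struct node *dp, struct node *ip) { (void)dp; (void)ip; }

/* Stand-in for ufs_dirremove(); fails when asked to. */
static int
remove_entry(int fail)
{
	return (fail ? -1 : 0);
}

static int
toy_rmdir(struct node *dp, struct node *ip, int fail)
{
	int error;

	dp->effnlink--;		/* parent loses the child's ".." link */
	ip->effnlink--;		/* child loses its name in the parent */
	setup(dp, ip);
	error = remove_entry(fail);
	if (error) {
		dp->effnlink++;	/* undo both optimistic decrements */
		ip->effnlink++;
		revert(dp, ip);
	}
	return (error);
}

int
main(void)
{
	struct node dp = { 3 }, ip = { 2 };

	toy_rmdir(&dp, &ip, 1);	/* a failing removal leaves counts intact */
	printf("dp %d ip %d\n", dp.effnlink, ip.effnlink);
	return (0);
}
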
@@ -2464,6 +2550,9 @@ ufs_makeinode(mode, dvp, vpp, cnp)
if ((mode & IFMT) == 0)
mode |= IFREG;
+ if (VTOI(dvp)->i_effnlink < 2)
+ panic("ufs_makeinode: Bad link count %d on parent",
+ VTOI(dvp)->i_effnlink);
error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
if (error)
return (error);
@@ -2539,7 +2628,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
ip->i_nlink = 1;
DIP_SET(ip, i_nlink, 1);
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+ softdep_setup_create(VTOI(dvp), ip);
if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) {
ip->i_mode &= ~ISGID;
@@ -2579,7 +2668,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
}
#endif /* !UFS_ACL */
ufs_makedirentry(ip, cnp, &newdir);
- error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL);
+ error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0);
if (error)
goto bad;
*vpp = tvp;
@@ -2595,7 +2684,7 @@ bad:
DIP_SET(ip, i_nlink, 0);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tvp))
- softdep_change_linkcnt(ip);
+ softdep_revert_create(VTOI(dvp), ip);
vput(tvp);
return (error);
}
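
Several hunks above add sanity panics on the parent's effective link count: any live directory must have effnlink >= 2 ("." plus its own name), and a parent about to lose a subdirectory must still have effnlink >= 3, the extra link being the child's "..". The same invariants expressed as plain assertions in a standalone sketch:

#include <assert.h>

struct toy_dir {
	int effnlink;
};

/* A parent receiving a new entry must itself still be linked. */
static void
check_parent_for_create(const struct toy_dir *dp)
{
	assert(dp->effnlink >= 2);
}

/* A parent losing a subdirectory still holds that child's ".." link. */
static void
check_parent_for_rmdir(const struct toy_dir *dp)
{
	assert(dp->effnlink >= 3);
}

int
main(void)
{
	struct toy_dir dp = { .effnlink = 3 };

	check_parent_for_create(&dp);
	check_parent_for_rmdir(&dp);
	return (0);
}
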
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
index 83f9af06b59d..d5669179dac2 100644
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@@ -57,6 +57,10 @@ struct ucred;
struct uio;
struct vnode;
struct ufs_extattr_per_mount;
+struct jblocks;
+struct inodedep;
+
+TAILQ_HEAD(inodedeplst, inodedep);
/* This structure describes the UFS specific mount structure data. */
struct ufsmount {
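
For readers unfamiliar with <sys/queue.h>: TAILQ_HEAD(inodedeplst, inodedep) above declares the list-head type used by the new softdep_unlinked field, while struct inodedep itself is only forward-declared here and defined in the softdep code. A self-contained user-space example of the same macro pairing, with struct item as a stand-in element type:

#include <sys/queue.h>
#include <stdio.h>

struct item {
	int ino;
	TAILQ_ENTRY(item) link;		/* embedded linkage fields */
};
TAILQ_HEAD(itemlst, item);		/* declares "struct itemlst" */

int
main(void)
{
	struct itemlst head = TAILQ_HEAD_INITIALIZER(head);
	struct item a = { .ino = 3 }, b = { .ino = 7 };
	struct item *ip;

	TAILQ_INSERT_TAIL(&head, &a, link);
	TAILQ_INSERT_TAIL(&head, &b, link);
	TAILQ_FOREACH(ip, &head, link)
		printf("inode %d\n", ip->ino);
	return (0);
}
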
@@ -75,6 +79,11 @@ struct ufsmount {
long um_numindirdeps; /* outstanding indirdeps */
struct workhead softdep_workitem_pending; /* softdep work queue */
struct worklist *softdep_worklist_tail; /* Tail pointer for above */
+ struct workhead softdep_journal_pending; /* journal work queue */
+ struct worklist *softdep_journal_tail; /* Tail pointer for above */
+ struct jblocks *softdep_jblocks; /* Journal block information */
+ struct inodedeplst softdep_unlinked; /* Unlinked inodes */
+ int softdep_on_journal; /* Items on the journal list */
int softdep_on_worklist; /* Items on the worklist */
int softdep_on_worklist_inprogress; /* Busy items on worklist */
int softdep_deps; /* Total dependency count */
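
The new journal fields mirror the existing worklist trio: a workhead queue, an explicit tail pointer so new items can be appended in constant time, and a count of queued items. Below is a sketch of that shape, assuming (as with the existing pair of fields) a LIST-style head that only supports O(1) insertion at the front; struct work and struct toy_queue are illustrative types, not the kernel's definitions:

#include <sys/queue.h>
#include <stdio.h>

struct work {
	int id;
	LIST_ENTRY(work) list;
};
LIST_HEAD(toy_workhead, work);

struct toy_queue {
	struct toy_workhead head;	/* cf. softdep_journal_pending */
	struct work *tail;		/* cf. softdep_journal_tail */
	int count;			/* cf. softdep_on_journal */
};

/* Append in FIFO order despite LIST only inserting at the head. */
static void
append(struct toy_queue *q, struct work *w)
{
	if (q->tail == NULL)
		LIST_INSERT_HEAD(&q->head, w, list);
	else
		LIST_INSERT_AFTER(q->tail, w, list);
	q->tail = w;
	q->count++;
}

int
main(void)
{
	struct toy_queue q = { LIST_HEAD_INITIALIZER(q.head), NULL, 0 };
	struct work a = { .id = 1 }, b = { .id = 2 }, *w;

	append(&q, &a);
	append(&q, &b);
	LIST_FOREACH(w, &q.head, list)
		printf("work %d\n", w->id);
	printf("queued: %d\n", q.count);
	return (0);
}
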