author     Kirk McKusick <mckusick@FreeBSD.org>  2000-01-10 00:24:24 +0000
committer  Kirk McKusick <mckusick@FreeBSD.org>  2000-01-10 00:24:24 +0000
commit     cf60e8e4bf442e10aeb65803cfcbdb1cd3a875e3 (patch)
tree       126ab6feda3d7d9183f59410c61d778b2e490d46 /sys
parent     bd5f5da94da66c03392e82dcb9631879023c437e (diff)
download   src-cf60e8e4bf442e10aeb65803cfcbdb1cd3a875e3.tar.gz
           src-cf60e8e4bf442e10aeb65803cfcbdb1cd3a875e3.zip
Several performance improvements for soft updates have been added:
1) Fastpath deletions. When a file is being deleted, check to see if it was
   so recently created that its inode has not yet been written to disk. If
   so, the delete can proceed to immediately free the inode.
2) Background writes. No file or block allocations can be done while the
   bitmap is being written to disk. To avoid these stalls, the bitmap is
   copied to another buffer which is written instead, thus leaving the
   original available for further allocations.
3) Link count tracking. Constantly track the difference between i_effnlink
   and i_nlink so that inodes that have had no change other than i_effnlink
   need not be written.
4) Identify buffers with rollback dependencies so that the buffer flushing
   daemon can choose to skip over them.
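
To make item 2 concrete, here is a minimal userland sketch (not kernel code) of
the background-write idea: snapshot the dirty buffer and hand the snapshot to
the writer, so the original buffer stays available for further allocations
while the write is in flight. The fake_buf structure and the
background_write/start_async_write names are illustrative stand-ins invented
for this example, not the kernel's struct buf or bwrite() interfaces.

/*
 * Illustrative sketch only: copy a dirty buffer and "write" the copy in the
 * background so the original can keep changing.  All names are made up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_buf {
	size_t		size;
	unsigned char	*data;
	int		bkgrd_in_progress;	/* loosely analogous to BX_BKGRDINPROG */
};

/* Stand-in for queueing an asynchronous write; here it just "writes" and frees. */
static void
start_async_write(struct fake_buf *snapshot)
{
	printf("background: writing %zu-byte snapshot\n", snapshot->size);
	free(snapshot->data);
	free(snapshot);
	/*
	 * In the kernel, write completion would also clear the original
	 * buffer's in-progress flag and wake any waiters.
	 */
}

static void
background_write(struct fake_buf *bp)
{
	struct fake_buf *newbp;

	/* Copy the current contents; the original may change right after this. */
	newbp = malloc(sizeof(*newbp));
	newbp->size = bp->size;
	newbp->data = malloc(bp->size);
	memcpy(newbp->data, bp->data, bp->size);

	bp->bkgrd_in_progress = 1;	/* remember a copy is being written */
	start_async_write(newbp);
}

int
main(void)
{
	unsigned char bitmap[64] = { 0 };
	struct fake_buf bp = { sizeof(bitmap), bitmap, 0 };

	background_write(&bp);
	bitmap[0] = 0xff;	/* the original stays usable for new allocations */
	printf("original still writable, in-progress=%d\n", bp.bkgrd_in_progress);
	return (0);
}

The real bwrite() change in sys/kern/vfs_bio.c below additionally keeps the
original buffer on the locked queue (B_LOCKED with BX_BKGRDINPROG set) until
the copy's write completes, and hands the soft-update dependency list to the
copy via the new io_movedeps hook, so the two buffers cannot race.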
Notes:
    svn path=/head/; revision=55697
Diffstat (limited to 'sys')
-rw-r--r--   sys/contrib/softupdates/ffs_softdep.c   242
-rw-r--r--   sys/kern/vfs_bio.c                      149
-rw-r--r--   sys/sys/bio.h                             7
-rw-r--r--   sys/sys/buf.h                             7
-rw-r--r--   sys/ufs/ffs/ffs_alloc.c                   7
-rw-r--r--   sys/ufs/ffs/ffs_softdep.c               242
-rw-r--r--   sys/ufs/ffs/ffs_softdep_stub.c            4
-rw-r--r--   sys/ufs/ffs/ffs_vfsops.c                 13
-rw-r--r--   sys/ufs/ffs/ffs_vnops.c                  49
-rw-r--r--   sys/ufs/ufs/ufs_extern.h                  2
-rw-r--r--   sys/ufs/ufs/ufs_lookup.c                 17
-rw-r--r--   sys/ufs/ufs/ufs_vnops.c                  66
12 files changed, 626 insertions(+), 179 deletions(-)
diff --git a/sys/contrib/softupdates/ffs_softdep.c b/sys/contrib/softupdates/ffs_softdep.c
index 14e1bb244153..dee1891d7199 100644
--- a/sys/contrib/softupdates/ffs_softdep.c
+++ b/sys/contrib/softupdates/ffs_softdep.c
@@ -52,7 +52,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: @(#)ffs_softdep.c 9.45 (McKusick) 1/9/00
+ * from: @(#)ffs_softdep.c 9.46 (McKusick) 1/9/00
* $FreeBSD$
*/
@@ -212,6 +212,8 @@ static void softdep_disk_write_complete __P((struct buf *));
static void softdep_deallocate_dependencies __P((struct buf *));
static int softdep_fsync __P((struct vnode *));
static int softdep_process_worklist __P((struct mount *));
+static void softdep_move_dependencies __P((struct buf *, struct buf *));
+static int softdep_count_dependencies __P((struct buf *bp, int));
struct bio_ops bioops = {
softdep_disk_io_initiation, /* io_start */
@@ -219,6 +221,8 @@ struct bio_ops bioops = {
softdep_deallocate_dependencies, /* io_deallocate */
softdep_fsync, /* io_fsync */
softdep_process_worklist, /* io_sync */
+ softdep_move_dependencies, /* io_movedeps */
+ softdep_count_dependencies, /* io_countdeps */
};
/*
@@ -472,7 +476,6 @@ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
-#if defined(__FreeBSD__)
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
@@ -483,19 +486,6 @@ SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
-#else /* !__FreeBSD__ */
-struct ctldebug debug20 = { "max_softdeps", &max_softdeps };
-struct ctldebug debug21 = { "tickdelay", &tickdelay };
-struct ctldebug debug23 = { "blk_limit_push", &stat_blk_limit_push };
-struct ctldebug debug24 = { "ino_limit_push", &stat_ino_limit_push };
-struct ctldebug debug25 = { "blk_limit_hit", &stat_blk_limit_hit };
-struct ctldebug debug26 = { "ino_limit_hit", &stat_ino_limit_hit };
-struct ctldebug debug27 = { "indir_blk_ptrs", &stat_indir_blk_ptrs };
-struct ctldebug debug28 = { "inode_bitmap", &stat_inode_bitmap };
-struct ctldebug debug29 = { "direct_blk_ptrs", &stat_direct_blk_ptrs };
-struct ctldebug debug30 = { "dir_entry", &stat_dir_entry };
-#endif /* !__FreeBSD__ */
-
#endif /* DEBUG */
/*
@@ -637,6 +627,31 @@ softdep_process_worklist(matchmnt)
}
/*
+ * Move dependencies from one buffer to another.
+ */
+static void
+softdep_move_dependencies(oldbp, newbp)
+ struct buf *oldbp;
+ struct buf *newbp;
+{
+ struct worklist *wk, *wktail;
+
+ if (LIST_FIRST(&newbp->b_dep) != NULL)
+ panic("softdep_move_dependencies: need merge code");
+ wktail = 0;
+ ACQUIRE_LOCK(&lk);
+ while (wk = LIST_FIRST(&oldbp->b_dep)) {
+ LIST_REMOVE(wk, wk_list);
+ if (wktail == 0)
+ LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
+ else
+ LIST_INSERT_AFTER(wktail, wk, wk_list);
+ wktail = wk;
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
* Purge the work list of all items associated with a particular mount point.
*/
int
@@ -1633,11 +1648,6 @@ softdep_setup_freeblocks(ip, length)
if ((inodedep->id_state & IOSTARTED) != 0)
panic("softdep_setup_freeblocks: inode busy");
/*
- * Add the freeblks structure to the list of operations that
- * must await the zero'ed inode being written to disk.
- */
- WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
- /*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
* with this inode are obsolete and can simply be de-allocated.
@@ -1647,6 +1657,16 @@ softdep_setup_freeblocks(ip, length)
merge_inode_lists(inodedep);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
free_allocdirect(&inodedep->id_inoupdt, adp, 1);
+ /*
+ * Add the freeblks structure to the list of operations that
+ * must await the zero'ed inode being written to disk. If we
+ * still have a bitmap dependency, then the inode has never been
+ * written to disk, so we can process the freeblks immediately.
+ */
+ if ((inodedep->id_state & DEPCOMPLETE) == 0)
+ handle_workitem_freeblocks(freeblks);
+ else
+ WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
FREE_LOCK(&lk);
bdwrite(bp);
/*
@@ -1841,36 +1861,35 @@ softdep_freefile(pvp, ino, mode)
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
- add_to_worklist(&freefile->fx_list);
FREE_LOCK(&lk);
+ handle_workitem_freefile(freefile);
return;
}
/*
* If we still have a bitmap dependency, then the inode has never
* been written to disk. Drop the dependency as it is no longer
- * necessary since the inode is being deallocated. We could process
- * the freefile immediately, but then we would have to clear the
- * id_inowait dependencies here and it is easier just to let the
- * zero'ed inode be written and let them be cleaned up in the
- * normal followup actions that follow the inode write.
+ * necessary since the inode is being deallocated. We set the
+ * ALLCOMPLETE flags since the bitmap now properly shows that the
+ * inode is not allocated. Even if the inode is actively being
+ * written, it has been rolled back to its zero'ed state, so we
+ * are ensured that a zero inode is what is on the disk. For short
+ * lived files, this change will usually result in removing all the
+ * depedencies from the inode so that it can be freed immediately.
*/
- if ((inodedep->id_state & DEPCOMPLETE) == 0) {
- inodedep->id_state |= DEPCOMPLETE;
+ if ((inodedep->id_state & DEPCOMPLETE) == 0) {
+ inodedep->id_state |= ALLCOMPLETE;
LIST_REMOVE(inodedep, id_deps);
inodedep->id_buf = NULL;
+ WORKLIST_REMOVE(&inodedep->id_list);
}
- /*
- * If the inodedep has no dependencies associated with it,
- * then we must free it here and free the file immediately.
- * This case arises when an early allocation fails (for
- * example, the user is over their file quota).
- */
- if (free_inodedep(inodedep) == 0)
+ if (free_inodedep(inodedep) == 0) {
WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
- else
- add_to_worklist(&freefile->fx_list);
- FREE_LOCK(&lk);
+ FREE_LOCK(&lk);
+ } else {
+ FREE_LOCK(&lk);
+ handle_workitem_freefile(freefile);
+ }
}
/*
@@ -2318,11 +2337,12 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
+ FREE_LOCK(&lk);
} else {
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ FREE_LOCK(&lk);
+ handle_workitem_remove(dirrem);
}
- FREE_LOCK(&lk);
}
/*
@@ -2515,19 +2535,22 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
}
/*
- * Called whenever the link count on an inode is increased.
+ * Called whenever the link count on an inode is changed.
* It creates an inode dependency so that the new reference(s)
* to the inode cannot be committed to disk until the updated
* inode has been written.
*/
void
-softdep_increase_linkcnt(ip)
+softdep_change_linkcnt(ip)
struct inode *ip; /* the inode with the increased link count */
{
struct inodedep *inodedep;
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
+ if (ip->i_nlink < ip->i_effnlink)
+ panic("softdep_change_linkcnt: bad delta");
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
}
@@ -2550,14 +2573,19 @@ handle_workitem_remove(dirrem)
return;
}
ip = VTOI(vp);
+ ACQUIRE_LOCK(&lk);
+ if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
+ panic("handle_workitem_remove: lost inodedep 1");
/*
* Normal file deletion.
*/
if ((dirrem->dm_state & RMDIR) == 0) {
ip->i_nlink--;
+ ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
- ip->i_flag |= IN_CHANGE;
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+ FREE_LOCK(&lk);
vput(vp);
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
@@ -2571,9 +2599,11 @@ handle_workitem_remove(dirrem)
* the parent decremented to account for the loss of "..".
*/
ip->i_nlink -= 2;
+ ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad dir delta");
- ip->i_flag |= IN_CHANGE;
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+ FREE_LOCK(&lk);
if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
softdep_error("handle_workitem_remove: truncate", error);
/*
@@ -2587,14 +2617,37 @@ handle_workitem_remove(dirrem)
WORKITEM_FREE(dirrem, D_DIRREM);
return;
}
+ /*
+ * If we still have a bitmap dependency, then the inode has never
+ * been written to disk. Drop the dependency as it is no longer
+ * necessary since the inode is being deallocated. We set the
+ * ALLCOMPLETE flags since the bitmap now properly shows that the
+ * inode is not allocated. Even if the inode is actively being
+ * written, it has been rolled back to its zero'ed state, so we
+ * are ensured that a zero inode is what is on the disk. For short
+ * lived files, this change will usually result in removing all the
+ * depedencies from the inode so that it can be freed immediately.
+ */
ACQUIRE_LOCK(&lk);
- (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC,
- &inodedep);
+ if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
+ panic("handle_workitem_remove: lost inodedep 2");
+ if ((inodedep->id_state & DEPCOMPLETE) == 0) {
+ inodedep->id_state |= ALLCOMPLETE;
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_buf = NULL;
+ WORKLIST_REMOVE(&inodedep->id_list);
+ }
dirrem->dm_state = 0;
dirrem->dm_oldinum = dirrem->dm_dirinum;
- WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
- FREE_LOCK(&lk);
- vput(vp);
+ if (free_inodedep(inodedep) == 0) {
+ WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
+ FREE_LOCK(&lk);
+ vput(vp);
+ } else {
+ FREE_LOCK(&lk);
+ vput(vp);
+ handle_workitem_remove(dirrem);
+ }
}
/*
@@ -3456,12 +3509,7 @@ softdep_load_inodeblock(ip)
FREE_LOCK(&lk);
return;
}
- if (inodedep->id_nlinkdelta != 0) {
- ip->i_effnlink -= inodedep->id_nlinkdelta;
- ip->i_flag |= IN_MODIFIED;
- inodedep->id_nlinkdelta = 0;
- (void) free_inodedep(inodedep);
- }
+ ip->i_effnlink -= inodedep->id_nlinkdelta;
FREE_LOCK(&lk);
}
@@ -3500,9 +3548,8 @@ softdep_update_inodeblock(ip, bp, waitfor)
FREE_LOCK(&lk);
return;
}
- if (ip->i_nlink < ip->i_effnlink)
+ if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
- inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
/*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
@@ -4405,6 +4452,87 @@ clear_inodedeps(p)
}
/*
+ * Function to determine if the buffer has outstanding dependencies
+ * that will cause a roll-back if the buffer is written. If wantcount
+ * is set, return number of dependencies, otherwise just yes or no.
+ */
+static int
+softdep_count_dependencies(bp, wantcount)
+ struct buf *bp;
+ int wantcount;
+{
+ struct worklist *wk;
+ struct inodedep *inodedep;
+ struct indirdep *indirdep;
+ struct allocindir *aip;
+ struct pagedep *pagedep;
+ struct diradd *dap;
+ int i, retval;
+
+ retval = 0;
+ ACQUIRE_LOCK(&lk);
+ for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) {
+ switch (wk->wk_type) {
+
+ case D_INODEDEP:
+ inodedep = WK_INODEDEP(wk);
+ if ((inodedep->id_state & DEPCOMPLETE) == 0) {
+ /* bitmap allocation dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
+ /* direct block pointer dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_INDIRDEP:
+ indirdep = WK_INDIRDEP(wk);
+ for (aip = LIST_FIRST(&indirdep->ir_deplisthd);
+ aip; aip = LIST_NEXT(aip, ai_next)) {
+ /* indirect block pointer dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_PAGEDEP:
+ pagedep = WK_PAGEDEP(wk);
+ for (i = 0; i < DAHASHSZ; i++) {
+ for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]);
+ dap; dap = LIST_NEXT(dap, da_pdlist)) {
+ /* directory entry dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ }
+ continue;
+
+ case D_BMSAFEMAP:
+ case D_ALLOCDIRECT:
+ case D_ALLOCINDIR:
+ case D_MKDIR:
+ /* never a dependency on these blocks */
+ continue;
+
+ default:
+ panic("softdep_check_for_rollback: Unexpected type %s",
+ TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ }
+out:
+ FREE_LOCK(&lk);
+ return retval;
+}
+
+/*
* Acquire exclusive access to a buffer.
* Must be called with splbio blocked.
* Return 1 if buffer was acquired.
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 9d2b5c27f978..f12316ba6a8f 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -68,6 +68,7 @@ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
+static void vfs_backgroundwritedone(struct buf *bp);
static int flushbufqueues(void);
static int bd_request;
@@ -349,7 +350,7 @@ bufinit(void)
* buffer cache operation.
*/
maxbufspace = (nbuf + 8) * DFLTBSIZE;
- hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 5);
+ hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
/*
* Limit the amount of malloc memory since it is wired permanently into
* the kernel space. Even though this is accounted for in the buffer
@@ -593,6 +594,7 @@ int
bwrite(struct buf * bp)
{
int oldflags, s;
+ struct buf *newbp;
if (bp->b_flags & B_INVAL) {
brelse(bp);
@@ -606,8 +608,66 @@ bwrite(struct buf * bp)
panic("bwrite: buffer is not busy???");
#endif
s = splbio();
+ /*
+ * If a background write is already in progress, delay
+ * writing this block if it is asynchronous. Otherwise
+ * wait for the background write to complete.
+ */
+ if (bp->b_xflags & BX_BKGRDINPROG) {
+ if (bp->b_flags & B_ASYNC) {
+ splx(s);
+ bdwrite(bp);
+ return (0);
+ }
+ bp->b_xflags |= BX_BKGRDWAIT;
+ tsleep(&bp->b_xflags, PRIBIO, "biord", 0);
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("bwrite: still writing");
+ }
+
+ /* Mark the buffer clean */
bundirty(bp);
+ /*
+ * If this buffer is marked for background writing and we
+ * do not have to wait for it, make a copy and write the
+ * copy so as to leave this buffer ready for further use.
+ */
+ if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) {
+ if (bp->b_flags & B_CALL)
+ panic("bwrite: need chained iodone");
+
+ /* get a new block */
+ newbp = geteblk(bp->b_bufsize);
+
+ /* set it to be identical to the old block */
+ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
+ bgetvp(bp->b_vp, newbp);
+ newbp->b_lblkno = bp->b_lblkno;
+ newbp->b_blkno = bp->b_blkno;
+ newbp->b_offset = bp->b_offset;
+ newbp->b_iodone = vfs_backgroundwritedone;
+ newbp->b_flags |= B_ASYNC | B_CALL;
+ newbp->b_flags &= ~B_INVAL;
+
+ /* move over the dependencies */
+ if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
+ (*bioops.io_movedeps)(bp, newbp);
+
+ /*
+ * Initiate write on the copy, release the original to
+ * the B_LOCKED queue so that it cannot go away until
+ * the background write completes. If not locked it could go
+ * away and then be reconstituted while it was being written.
+ * If the reconstituted buffer were written, we could end up
+ * with two background copies being written at the same time.
+ */
+ bp->b_xflags |= BX_BKGRDINPROG;
+ bp->b_flags |= B_LOCKED;
+ bqrelse(bp);
+ bp = newbp;
+ }
+
bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
bp->b_flags |= B_WRITEINPROG | B_CACHE;
@@ -630,6 +690,56 @@ bwrite(struct buf * bp)
}
/*
+ * Complete a background write started from bwrite.
+ */
+static void
+vfs_backgroundwritedone(bp)
+ struct buf *bp;
+{
+ struct buf *origbp;
+
+ /*
+ * Find the original buffer that we are writing.
+ */
+ if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
+ panic("backgroundwritedone: lost buffer");
+ /*
+ * Process dependencies then return any unfinished ones.
+ */
+ if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
+ (*bioops.io_complete)(bp);
+ if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
+ (*bioops.io_movedeps)(bp, origbp);
+ /*
+ * Clear the BX_BKGRDINPROG flag in the original buffer
+ * and awaken it if it is waiting for the write to complete.
+ */
+ origbp->b_xflags &= ~BX_BKGRDINPROG;
+ if (origbp->b_xflags & BX_BKGRDWAIT) {
+ origbp->b_xflags &= ~BX_BKGRDWAIT;
+ wakeup(&origbp->b_xflags);
+ }
+ /*
+ * Clear the B_LOCKED flag and remove it from the locked
+ * queue if it currently resides there.
+ */
+ origbp->b_flags &= ~B_LOCKED;
+ if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+ bremfree(origbp);
+ bqrelse(origbp);
+ }
+ /*
+ * This buffer is marked B_NOCACHE, so when it is released
+ * by biodone, it will be tossed. We mark it with B_READ
+ * to avoid biodone doing a second vwakeup.
+ */
+ bp->b_flags |= B_NOCACHE | B_READ;
+ bp->b_flags &= ~(B_CACHE | B_CALL | B_DONE);
+ bp->b_iodone = 0;
+ biodone(bp);
+}
+
+/*
* Delayed write. (Buffer is marked dirty). Do not bother writing
* anything if the buffer is marked invalid.
*
@@ -757,6 +867,10 @@ bundirty(bp)
--numdirtybuffers;
numdirtywakeup();
}
+ /*
+ * Since it is now being written, we can clear its deferred write flag.
+ */
+ bp->b_flags &= ~B_DEFERRED;
}
/*
@@ -895,12 +1009,16 @@ brelse(struct buf * bp)
*
* Normally we can do this whether a buffer is B_DELWRI or not. If
* the buffer is an NFS buffer, it is tracking piecemeal writes or
- * the commit state and we cannot afford to lose the buffer.
+ * the commit state and we cannot afford to lose the buffer. If the
+ * buffer has a background write in progress, we need to keep it
+ * around to prevent it from being reconstituted and starting a second
+ * background write.
*/
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
!vn_isdisk(bp->b_vp) &&
- (bp->b_flags & B_DELWRI))
+ (bp->b_flags & B_DELWRI) &&
+ (bp->b_xflags & BX_BKGRDINPROG))
) {
int i, j, resid;
@@ -997,6 +1115,9 @@ brelse(struct buf * bp)
/* buffers with no memory */
if (bp->b_bufsize == 0) {
bp->b_flags |= B_INVAL;
+ bp->b_xflags &= ~BX_BKGRDWRITE;
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("losing buffer 1");
if (bp->b_kvasize) {
bp->b_qindex = QUEUE_EMPTYKVA;
kvawakeup = 1;
@@ -1011,6 +1132,9 @@ brelse(struct buf * bp)
/* buffers with junk contents */
} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
bp->b_flags |= B_INVAL;
+ bp->b_xflags &= ~BX_BKGRDWRITE;
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("losing buffer 2");
bp->b_qindex = QUEUE_CLEAN;
if (bp->b_kvasize)
kvawakeup = 1;
@@ -1501,6 +1625,8 @@ restart:
}
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
+ if (bp->b_xflags & BX_BKGRDINPROG)
+ panic("losing buffer 3");
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
@@ -1508,6 +1634,7 @@ restart:
allocbuf(bp, 0);
bp->b_flags = 0;
+ bp->b_xflags = 0;
bp->b_dev = NODEV;
bp->b_vp = NULL;
bp->b_blkno = bp->b_lblkno = 0;
@@ -1761,7 +1888,8 @@ flushbufqueues(void)
while (bp) {
KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
- if ((bp->b_flags & B_DELWRI) != 0) {
+ if ((bp->b_flags & B_DELWRI) != 0 &&
+ (bp->b_xflags & BX_BKGRDINPROG) == 0) {
if (bp->b_flags & B_INVAL) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
panic("flushbufqueues: locked buf");
@@ -1770,13 +1898,24 @@ flushbufqueues(void)
++r;
break;
}
+ if (LIST_FIRST(&bp->b_dep) != NULL &&
+ bioops.io_countdeps &&
+ (bp->b_flags & B_DEFERRED) == 0 &&
+ (*bioops.io_countdeps)(bp, 0)) {
+ TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
+ bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
+ bp, b_freelist);
+ bp->b_flags |= B_DEFERRED;
+ continue;
+ }
vfs_bio_awrite(bp);
++r;
break;
}
bp = TAILQ_NEXT(bp, b_freelist);
}
- return(r);
+ return (r);
}
/*
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 7168a894e993..f38bf4510138 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -65,6 +65,8 @@ extern struct bio_ops {
void (*io_deallocate) __P((struct buf *));
int (*io_fsync) __P((struct vnode *));
int (*io_sync) __P((struct mount *));
+ void (*io_movedeps) __P((struct buf *, struct buf *));
+ int (*io_countdeps) __P((struct buf *, int));
} bioops;
struct iodone_chain {
@@ -194,7 +196,7 @@ struct buf {
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
#define B_UNUSED0 0x00000008 /* Old B_BAD */
-#define B_UNUSED1 0x00000010 /* Old B_BUSY */
+#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_CALL 0x00000040 /* Call b_iodone from biodone. */
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
@@ -235,6 +237,9 @@ struct buf {
*/
#define BX_VNDIRTY 0x00000001 /* On vnode dirty list */
#define BX_VNCLEAN 0x00000002 /* On vnode clean list */
+#define BX_BKGRDWRITE 0x00000004 /* Do writes in background */
+#define BX_BKGRDINPROG 0x00000008 /* Background write in progress */
+#define BX_BKGRDWAIT 0x00000010 /* Background write waiting */
#define NOOFFSET (-1LL) /* No buffer offset calculated yet */
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 7168a894e993..f38bf4510138 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -65,6 +65,8 @@ extern struct bio_ops {
void (*io_deallocate) __P((struct buf *));
int (*io_fsync) __P((struct vnode *));
int (*io_sync) __P((struct mount *));
+ void (*io_movedeps) __P((struct buf *, struct buf *));
+ int (*io_countdeps) __P((struct buf *, int));
} bioops;
struct iodone_chain {
@@ -194,7 +196,7 @@ struct buf {
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
#define B_UNUSED0 0x00000008 /* Old B_BAD */
-#define B_UNUSED1 0x00000010 /* Old B_BUSY */
+#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_CALL 0x00000040 /* Call b_iodone from biodone. */
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
@@ -235,6 +237,9 @@ struct buf {
*/
#define BX_VNDIRTY 0x00000001 /* On vnode dirty list */
#define BX_VNCLEAN 0x00000002 /* On vnode clean list */
+#define BX_BKGRDWRITE 0x00000004 /* Do writes in background */
+#define BX_BKGRDINPROG 0x00000008 /* Background write in progress */
+#define BX_BKGRDWAIT 0x00000010 /* Background write waiting */
#define NOOFFSET (-1LL) /* No buffer offset calculated yet */
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index c3e1c3172743..bdd00aa5565e 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -836,6 +836,7 @@ ffs_fragextend(ip, cg, bprev, osize, nsize)
brelse(bp);
return (0);
}
+ bp->b_xflags |= BX_BKGRDWRITE;
cgp->cg_time = time_second;
bno = dtogd(fs, bprev);
for (i = numfrags(fs, osize); i < frags; i++)
@@ -903,6 +904,7 @@ ffs_alloccg(ip, cg, bpref, size)
brelse(bp);
return (0);
}
+ bp->b_xflags |= BX_BKGRDWRITE;
cgp->cg_time = time_second;
if (size == fs->fs_bsize) {
bno = ffs_alloccgblk(ip, bp, bpref);
@@ -1113,6 +1115,7 @@ ffs_clusteralloc(ip, cg, bpref, len)
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp))
goto fail;
+ bp->b_xflags |= BX_BKGRDWRITE;
/*
* Check to see if a cluster of the needed size (or bigger) is
* available in this cylinder group.
@@ -1227,6 +1230,7 @@ ffs_nodealloccg(ip, cg, ipref, mode)
brelse(bp);
return (0);
}
+ bp->b_xflags |= BX_BKGRDWRITE;
cgp->cg_time = time_second;
if (ipref) {
ipref %= fs->fs_ipg;
@@ -1322,6 +1326,7 @@ ffs_blkfree(ip, bno, size)
brelse(bp);
return;
}
+ bp->b_xflags |= BX_BKGRDWRITE;
cgp->cg_time = time_second;
bno = dtogd(fs, bno);
if (size == fs->fs_bsize) {
@@ -1419,6 +1424,7 @@ ffs_checkblk(ip, bno, size)
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp))
panic("ffs_checkblk: cg magic mismatch");
+ bp->b_xflags |= BX_BKGRDWRITE;
bno = dtogd(fs, bno);
if (size == fs->fs_bsize) {
free = ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bno));
@@ -1484,6 +1490,7 @@ ffs_vfree( pvp, ino, mode)
brelse(bp);
return (0);
}
+ bp->b_xflags |= BX_BKGRDWRITE;
cgp->cg_time = time_second;
ino %= fs->fs_ipg;
if (isclr(cg_inosused(cgp), ino)) {
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 14e1bb244153..dee1891d7199 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -52,7 +52,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: @(#)ffs_softdep.c 9.45 (McKusick) 1/9/00
+ * from: @(#)ffs_softdep.c 9.46 (McKusick) 1/9/00
* $FreeBSD$
*/
@@ -212,6 +212,8 @@ static void softdep_disk_write_complete __P((struct buf *));
static void softdep_deallocate_dependencies __P((struct buf *));
static int softdep_fsync __P((struct vnode *));
static int softdep_process_worklist __P((struct mount *));
+static void softdep_move_dependencies __P((struct buf *, struct buf *));
+static int softdep_count_dependencies __P((struct buf *bp, int));
struct bio_ops bioops = {
softdep_disk_io_initiation, /* io_start */
@@ -219,6 +221,8 @@ struct bio_ops bioops = {
softdep_deallocate_dependencies, /* io_deallocate */
softdep_fsync, /* io_fsync */
softdep_process_worklist, /* io_sync */
+ softdep_move_dependencies, /* io_movedeps */
+ softdep_count_dependencies, /* io_countdeps */
};
/*
@@ -472,7 +476,6 @@ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
-#if defined(__FreeBSD__)
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
@@ -483,19 +486,6 @@ SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
-#else /* !__FreeBSD__ */
-struct ctldebug debug20 = { "max_softdeps", &max_softdeps };
-struct ctldebug debug21 = { "tickdelay", &tickdelay };
-struct ctldebug debug23 = { "blk_limit_push", &stat_blk_limit_push };
-struct ctldebug debug24 = { "ino_limit_push", &stat_ino_limit_push };
-struct ctldebug debug25 = { "blk_limit_hit", &stat_blk_limit_hit };
-struct ctldebug debug26 = { "ino_limit_hit", &stat_ino_limit_hit };
-struct ctldebug debug27 = { "indir_blk_ptrs", &stat_indir_blk_ptrs };
-struct ctldebug debug28 = { "inode_bitmap", &stat_inode_bitmap };
-struct ctldebug debug29 = { "direct_blk_ptrs", &stat_direct_blk_ptrs };
-struct ctldebug debug30 = { "dir_entry", &stat_dir_entry };
-#endif /* !__FreeBSD__ */
-
#endif /* DEBUG */
/*
@@ -637,6 +627,31 @@ softdep_process_worklist(matchmnt)
}
/*
+ * Move dependencies from one buffer to another.
+ */
+static void
+softdep_move_dependencies(oldbp, newbp)
+ struct buf *oldbp;
+ struct buf *newbp;
+{
+ struct worklist *wk, *wktail;
+
+ if (LIST_FIRST(&newbp->b_dep) != NULL)
+ panic("softdep_move_dependencies: need merge code");
+ wktail = 0;
+ ACQUIRE_LOCK(&lk);
+ while (wk = LIST_FIRST(&oldbp->b_dep)) {
+ LIST_REMOVE(wk, wk_list);
+ if (wktail == 0)
+ LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
+ else
+ LIST_INSERT_AFTER(wktail, wk, wk_list);
+ wktail = wk;
+ }
+ FREE_LOCK(&lk);
+}
+
+/*
* Purge the work list of all items associated with a particular mount point.
*/
int
@@ -1633,11 +1648,6 @@ softdep_setup_freeblocks(ip, length)
if ((inodedep->id_state & IOSTARTED) != 0)
panic("softdep_setup_freeblocks: inode busy");
/*
- * Add the freeblks structure to the list of operations that
- * must await the zero'ed inode being written to disk.
- */
- WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
- /*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
* with this inode are obsolete and can simply be de-allocated.
@@ -1647,6 +1657,16 @@ softdep_setup_freeblocks(ip, length)
merge_inode_lists(inodedep);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
free_allocdirect(&inodedep->id_inoupdt, adp, 1);
+ /*
+ * Add the freeblks structure to the list of operations that
+ * must await the zero'ed inode being written to disk. If we
+ * still have a bitmap dependency, then the inode has never been
+ * written to disk, so we can process the freeblks immediately.
+ */
+ if ((inodedep->id_state & DEPCOMPLETE) == 0)
+ handle_workitem_freeblocks(freeblks);
+ else
+ WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
FREE_LOCK(&lk);
bdwrite(bp);
/*
@@ -1841,36 +1861,35 @@ softdep_freefile(pvp, ino, mode)
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
- add_to_worklist(&freefile->fx_list);
FREE_LOCK(&lk);
+ handle_workitem_freefile(freefile);
return;
}
/*
* If we still have a bitmap dependency, then the inode has never
* been written to disk. Drop the dependency as it is no longer
- * necessary since the inode is being deallocated. We could process
- * the freefile immediately, but then we would have to clear the
- * id_inowait dependencies here and it is easier just to let the
- * zero'ed inode be written and let them be cleaned up in the
- * normal followup actions that follow the inode write.
+ * necessary since the inode is being deallocated. We set the
+ * ALLCOMPLETE flags since the bitmap now properly shows that the
+ * inode is not allocated. Even if the inode is actively being
+ * written, it has been rolled back to its zero'ed state, so we
+ * are ensured that a zero inode is what is on the disk. For short
+ * lived files, this change will usually result in removing all the
+ * depedencies from the inode so that it can be freed immediately.
*/
- if ((inodedep->id_state & DEPCOMPLETE) == 0) {
- inodedep->id_state |= DEPCOMPLETE;
+ if ((inodedep->id_state & DEPCOMPLETE) == 0) {
+ inodedep->id_state |= ALLCOMPLETE;
LIST_REMOVE(inodedep, id_deps);
inodedep->id_buf = NULL;
+ WORKLIST_REMOVE(&inodedep->id_list);
}
- /*
- * If the inodedep has no dependencies associated with it,
- * then we must free it here and free the file immediately.
- * This case arises when an early allocation fails (for
- * example, the user is over their file quota).
- */
- if (free_inodedep(inodedep) == 0)
+ if (free_inodedep(inodedep) == 0) {
WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
- else
- add_to_worklist(&freefile->fx_list);
- FREE_LOCK(&lk);
+ FREE_LOCK(&lk);
+ } else {
+ FREE_LOCK(&lk);
+ handle_workitem_freefile(freefile);
+ }
}
/*
@@ -2318,11 +2337,12 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
+ FREE_LOCK(&lk);
} else {
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
- add_to_worklist(&dirrem->dm_list);
+ FREE_LOCK(&lk);
+ handle_workitem_remove(dirrem);
}
- FREE_LOCK(&lk);
}
/*
@@ -2515,19 +2535,22 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
}
/*
- * Called whenever the link count on an inode is increased.
+ * Called whenever the link count on an inode is changed.
* It creates an inode dependency so that the new reference(s)
* to the inode cannot be committed to disk until the updated
* inode has been written.
*/
void
-softdep_increase_linkcnt(ip)
+softdep_change_linkcnt(ip)
struct inode *ip; /* the inode with the increased link count */
{
struct inodedep *inodedep;
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
+ if (ip->i_nlink < ip->i_effnlink)
+ panic("softdep_change_linkcnt: bad delta");
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
}
@@ -2550,14 +2573,19 @@ handle_workitem_remove(dirrem)
return;
}
ip = VTOI(vp);
+ ACQUIRE_LOCK(&lk);
+ if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
+ panic("handle_workitem_remove: lost inodedep 1");
/*
* Normal file deletion.
*/
if ((dirrem->dm_state & RMDIR) == 0) {
ip->i_nlink--;
+ ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
- ip->i_flag |= IN_CHANGE;
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+ FREE_LOCK(&lk);
vput(vp);
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
@@ -2571,9 +2599,11 @@ handle_workitem_remove(dirrem)
* the parent decremented to account for the loss of "..".
*/
ip->i_nlink -= 2;
+ ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad dir delta");
- ip->i_flag |= IN_CHANGE;
+ inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+ FREE_LOCK(&lk);
if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
softdep_error("handle_workitem_remove: truncate", error);
/*
@@ -2587,14 +2617,37 @@ handle_workitem_remove(dirrem)
WORKITEM_FREE(dirrem, D_DIRREM);
return;
}
+ /*
+ * If we still have a bitmap dependency, then the inode has never
+ * been written to disk. Drop the dependency as it is no longer
+ * necessary since the inode is being deallocated. We set the
+ * ALLCOMPLETE flags since the bitmap now properly shows that the
+ * inode is not allocated. Even if the inode is actively being
+ * written, it has been rolled back to its zero'ed state, so we
+ * are ensured that a zero inode is what is on the disk. For short
+ * lived files, this change will usually result in removing all the
+ * depedencies from the inode so that it can be freed immediately.
+ */
ACQUIRE_LOCK(&lk);
- (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC,
- &inodedep);
+ if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
+ panic("handle_workitem_remove: lost inodedep 2");
+ if ((inodedep->id_state & DEPCOMPLETE) == 0) {
+ inodedep->id_state |= ALLCOMPLETE;
+ LIST_REMOVE(inodedep, id_deps);
+ inodedep->id_buf = NULL;
+ WORKLIST_REMOVE(&inodedep->id_list);
+ }
dirrem->dm_state = 0;
dirrem->dm_oldinum = dirrem->dm_dirinum;
- WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
- FREE_LOCK(&lk);
- vput(vp);
+ if (free_inodedep(inodedep) == 0) {
+ WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
+ FREE_LOCK(&lk);
+ vput(vp);
+ } else {
+ FREE_LOCK(&lk);
+ vput(vp);
+ handle_workitem_remove(dirrem);
+ }
}
/*
@@ -3456,12 +3509,7 @@ softdep_load_inodeblock(ip)
FREE_LOCK(&lk);
return;
}
- if (inodedep->id_nlinkdelta != 0) {
- ip->i_effnlink -= inodedep->id_nlinkdelta;
- ip->i_flag |= IN_MODIFIED;
- inodedep->id_nlinkdelta = 0;
- (void) free_inodedep(inodedep);
- }
+ ip->i_effnlink -= inodedep->id_nlinkdelta;
FREE_LOCK(&lk);
}
@@ -3500,9 +3548,8 @@ softdep_update_inodeblock(ip, bp, waitfor)
FREE_LOCK(&lk);
return;
}
- if (ip->i_nlink < ip->i_effnlink)
+ if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
- inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
/*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
@@ -4405,6 +4452,87 @@ clear_inodedeps(p)
}
/*
+ * Function to determine if the buffer has outstanding dependencies
+ * that will cause a roll-back if the buffer is written. If wantcount
+ * is set, return number of dependencies, otherwise just yes or no.
+ */
+static int
+softdep_count_dependencies(bp, wantcount)
+ struct buf *bp;
+ int wantcount;
+{
+ struct worklist *wk;
+ struct inodedep *inodedep;
+ struct indirdep *indirdep;
+ struct allocindir *aip;
+ struct pagedep *pagedep;
+ struct diradd *dap;
+ int i, retval;
+
+ retval = 0;
+ ACQUIRE_LOCK(&lk);
+ for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) {
+ switch (wk->wk_type) {
+
+ case D_INODEDEP:
+ inodedep = WK_INODEDEP(wk);
+ if ((inodedep->id_state & DEPCOMPLETE) == 0) {
+ /* bitmap allocation dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
+ /* direct block pointer dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_INDIRDEP:
+ indirdep = WK_INDIRDEP(wk);
+ for (aip = LIST_FIRST(&indirdep->ir_deplisthd);
+ aip; aip = LIST_NEXT(aip, ai_next)) {
+ /* indirect block pointer dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ continue;
+
+ case D_PAGEDEP:
+ pagedep = WK_PAGEDEP(wk);
+ for (i = 0; i < DAHASHSZ; i++) {
+ for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]);
+ dap; dap = LIST_NEXT(dap, da_pdlist)) {
+ /* directory entry dependency */
+ retval += 1;
+ if (!wantcount)
+ goto out;
+ }
+ }
+ continue;
+
+ case D_BMSAFEMAP:
+ case D_ALLOCDIRECT:
+ case D_ALLOCINDIR:
+ case D_MKDIR:
+ /* never a dependency on these blocks */
+ continue;
+
+ default:
+ panic("softdep_check_for_rollback: Unexpected type %s",
+ TYPENAME(wk->wk_type));
+ /* NOTREACHED */
+ }
+ }
+out:
+ FREE_LOCK(&lk);
+ return retval;
+}
+
+/*
* Acquire exclusive access to a buffer.
* Must be called with splbio blocked.
* Return 1 if buffer was acquired.
diff --git a/sys/ufs/ffs/ffs_softdep_stub.c b/sys/ufs/ffs/ffs_softdep_stub.c
index 72f819b23cde..4b8411d32353 100644
--- a/sys/ufs/ffs/ffs_softdep_stub.c
+++ b/sys/ufs/ffs/ffs_softdep_stub.c
@@ -210,11 +210,11 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
}
void
-softdep_increase_linkcnt(ip)
+softdep_change_linkcnt(ip)
struct inode *ip;
{
- panic("softdep_increase_linkcnt called");
+ panic("softdep_change_linkcnt called");
}
void
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 18fb15396b96..77e821f53a9a 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -671,10 +671,6 @@ ffs_mountfs(devvp, mp, p, malloctype)
bp = NULL;
fs = ump->um_fs;
fs->fs_ronly = ronly;
- if (ronly == 0) {
- fs->fs_fmod = 1;
- fs->fs_clean = 0;
- }
size = fs->fs_cssize;
blks = howmany(size, fs->fs_fsize);
if (fs->fs_contigsumsize > 0)
@@ -747,6 +743,7 @@ ffs_mountfs(devvp, mp, p, malloctype)
free(base, M_UFSMNT);
goto out;
}
+ fs->fs_fmod = 1;
fs->fs_clean = 0;
(void) ffs_sbupdate(ump, MNT_WAIT);
}
@@ -964,9 +961,9 @@ loop:
simple_lock(&vp->v_interlock);
nvp = vp->v_mntvnodes.le_next;
ip = VTOI(vp);
- if ((vp->v_type == VNON) || (((ip->i_flag &
- (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) &&
- (TAILQ_EMPTY(&vp->v_dirtyblkhd) || (waitfor == MNT_LAZY)))) {
+ if (vp->v_type == VNON || ((ip->i_flag &
+ (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
+ TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
simple_unlock(&vp->v_interlock);
continue;
}
@@ -1080,7 +1077,7 @@ restart:
return (error);
}
bzero((caddr_t)ip, sizeof(struct inode));
- lockinit(&ip->i_lock, PINOD, "inode", 0, 0);
+ lockinit(&ip->i_lock, PINOD, "inode", 0, LK_CANRECURSE);
vp->v_data = ip;
ip->i_vnode = vp;
ip->i_fs = fs = ump->um_fs;
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index eb99b2c7c3ae..6087d81dbdd3 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -123,10 +123,11 @@ ffs_fsync(ap)
struct vnode *vp = ap->a_vp;
struct buf *bp;
struct buf *nbp;
- int s, error, passes, skipmeta;
+ int s, error, wait, passes, skipmeta;
daddr_t lbn;
+ wait = (ap->a_waitfor == MNT_WAIT);
if (vn_isdisk(vp)) {
lbn = INT_MAX;
if (vp->v_specmountpoint != NULL &&
@@ -143,7 +144,7 @@ ffs_fsync(ap)
*/
passes = NIADDR + 1;
skipmeta = 0;
- if (ap->a_waitfor == MNT_WAIT)
+ if (wait)
skipmeta = 1;
s = splbio();
loop:
@@ -153,33 +154,43 @@ loop:
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
/*
- * First time through on a synchronous call,
- * or if it's already scheduled, skip to the next
- * buffer
+ * Reasons to skip this buffer: it has already been considered
+ * on this pass, this pass is the first time through on a
+ * synchronous flush request and the buffer being considered
+ * is metadata, the buffer has dependencies that will cause
+ * it to be redirtied and it has not already been deferred,
+ * or it is already being written.
*/
- if ((bp->b_flags & B_SCANNED) ||
- ((skipmeta == 1) && (bp->b_lblkno < 0)) ||
- BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
+ if ((bp->b_flags & B_SCANNED) != 0)
+ continue;
+ bp->b_flags |= B_SCANNED;
+ if ((skipmeta == 1 && bp->b_lblkno < 0))
+ continue;
+ if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
+ (bp->b_flags & B_DEFERRED) == 0 &&
+ bioops.io_countdeps && (*bioops.io_countdeps)(bp, 0)) {
+ bp->b_flags |= B_DEFERRED;
+ continue;
+ }
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
continue;
if ((bp->b_flags & B_DELWRI) == 0)
panic("ffs_fsync: not dirty");
+ if (vp != bp->b_vp)
+ panic("ffs_fsync: vp != vp->b_vp");
/*
- * If data is outstanding to another vnode, or we were
- * asked to wait for everything, or it's not a file or BDEV,
- * start the IO on this buffer immediatly.
+ * If this is a synchronous flush request, or it is not a
+ * file or device, start the write on this buffer immediatly.
*/
- bp->b_flags |= B_SCANNED;
- if (((bp->b_vp != vp) || (ap->a_waitfor == MNT_WAIT)) ||
- ((vp->v_type != VREG) && (vp->v_type != VBLK))) {
+ if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
/*
* On our final pass through, do all I/O synchronously
* so that we can find out if our flush is failing
* because of write errors.
*/
- if (passes > 0 || (ap->a_waitfor != MNT_WAIT)) {
- if ((bp->b_flags & B_CLUSTEROK) &&
- ap->a_waitfor != MNT_WAIT) {
+ if (passes > 0 || !wait) {
+ if ((bp->b_flags & B_CLUSTEROK) && !wait) {
BUF_UNLOCK(bp);
(void) vfs_bio_awrite(bp);
} else {
@@ -224,7 +235,7 @@ loop:
goto loop;
}
- if (ap->a_waitfor == MNT_WAIT) {
+ if (wait) {
while (vp->v_numoutput) {
vp->v_flag |= VBWAIT;
(void) tsleep((caddr_t)&vp->v_numoutput,
@@ -260,5 +271,5 @@ loop:
}
}
splx(s);
- return (UFS_UPDATE(vp, ap->a_waitfor == MNT_WAIT));
+ return (UFS_UPDATE(vp, wait));
}
diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h
index feec939d5842..d576be977abc 100644
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@@ -102,6 +102,6 @@ void softdep_setup_remove __P((struct buf *,struct inode *, struct inode *,
int));
void softdep_setup_directory_change __P((struct buf *, struct inode *,
struct inode *, long, int));
-void softdep_increase_linkcnt __P((struct inode *));
+void softdep_change_linkcnt __P((struct inode *));
#endif /* !_UFS_UFS_EXTERN_H_ */
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
index 77c0151abd49..7a0232d83f03 100644
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@@ -899,17 +899,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
ep->d_reclen += dp->i_reclen;
}
out:
- if (ip) {
- ip->i_effnlink--;
- ip->i_flag |= IN_CHANGE;
- }
if (DOINGSOFTDEP(dvp)) {
- if (ip)
+ if (ip) {
+ ip->i_effnlink--;
+ softdep_change_linkcnt(ip);
softdep_setup_remove(bp, dp, ip, isrmdir);
+ }
bdwrite(bp);
} else {
- if (ip)
+ if (ip) {
+ ip->i_effnlink--;
ip->i_nlink--;
+ ip->i_flag |= IN_CHANGE;
+ }
if (flags & DOWHITEOUT)
error = VOP_BWRITE(bp->b_vp, bp);
else if (DOINGASYNC(dvp) && dp->i_count != 0) {
@@ -946,12 +948,13 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
if (!OFSFMT(vdp))
ep->d_type = newtype;
oip->i_effnlink--;
- oip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vdp)) {
+ softdep_change_linkcnt(oip);
softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
bdwrite(bp);
} else {
oip->i_nlink--;
+ oip->i_flag |= IN_CHANGE;
if (DOINGASYNC(vdp)) {
bdwrite(bp);
error = 0;
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 9adae8ca5947..9616b3792bd8 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -754,7 +754,7 @@ ufs_link(ap)
ip->i_nlink++;
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vp))
- softdep_increase_linkcnt(ip);
+ softdep_change_linkcnt(ip);
error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp)));
if (!error) {
ufs_makedirentry(ip, cnp, &newdir);
@@ -765,6 +765,8 @@ ufs_link(ap)
ip->i_effnlink--;
ip->i_nlink--;
ip->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(vp))
+ softdep_change_linkcnt(ip);
}
out1:
if (tdvp != vp)
@@ -1014,7 +1016,7 @@ abortit:
ip->i_nlink++;
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(fvp))
- softdep_increase_linkcnt(ip);
+ softdep_change_linkcnt(ip);
if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) |
DOINGASYNC(fvp)))) != 0) {
VOP_UNLOCK(fvp, 0, p);
@@ -1079,7 +1081,7 @@ abortit:
dp->i_nlink++;
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tdvp))
- softdep_increase_linkcnt(dp);
+ softdep_change_linkcnt(dp);
error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
DOINGASYNC(tdvp)));
if (error)
@@ -1092,6 +1094,8 @@ abortit:
dp->i_effnlink--;
dp->i_nlink--;
dp->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(tdvp))
+ softdep_change_linkcnt(dp);
(void)UFS_UPDATE(tdvp, 1);
}
goto bad;
@@ -1146,10 +1150,12 @@ abortit:
if (doingdirectory) {
if (!newparent) {
dp->i_effnlink--;
- dp->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(tdvp))
+ softdep_change_linkcnt(dp);
}
xp->i_effnlink--;
- xp->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(tvp))
+ softdep_change_linkcnt(xp);
}
VN_POLLEVENT(tdvp, POLLWRITE);
if (doingdirectory && !DOINGSOFTDEP(tvp)) {
@@ -1164,9 +1170,12 @@ abortit:
* disk, so when running with that code we avoid doing
* them now.
*/
- if (!newparent)
+ if (!newparent) {
dp->i_nlink--;
+ dp->i_flag |= IN_CHANGE;
+ }
xp->i_nlink--;
+ xp->i_flag |= IN_CHANGE;
ioflag = DOINGASYNC(tvp) ? 0 : IO_SYNC;
if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag,
tcnp->cn_cred, tcnp->cn_proc)) != 0)
@@ -1247,6 +1256,8 @@ out:
ip->i_nlink--;
ip->i_flag |= IN_CHANGE;
ip->i_flag &= ~IN_RENAME;
+ if (DOINGSOFTDEP(fvp))
+ softdep_change_linkcnt(ip);
vput(fvp);
} else
vrele(fvp);
@@ -1359,7 +1370,7 @@ ufs_mkdir(ap)
ip->i_effnlink = 2;
ip->i_nlink = 2;
if (DOINGSOFTDEP(tvp))
- softdep_increase_linkcnt(ip);
+ softdep_change_linkcnt(ip);
if (cnp->cn_flags & ISWHITEOUT)
ip->i_flags |= UF_OPAQUE;
@@ -1372,7 +1383,7 @@ ufs_mkdir(ap)
dp->i_nlink++;
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(dvp))
- softdep_increase_linkcnt(dp);
+ softdep_change_linkcnt(dp);
error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
if (error)
goto bad;
@@ -1440,6 +1451,8 @@ bad:
dp->i_effnlink--;
dp->i_nlink--;
dp->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(dvp))
+ softdep_change_linkcnt(dp);
/*
* No need to do an explicit VOP_TRUNCATE here, vrele will
* do this for us because we set the link count to 0.
@@ -1447,6 +1460,8 @@ bad:
ip->i_effnlink = 0;
ip->i_nlink = 0;
ip->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(tvp))
+ softdep_change_linkcnt(ip);
vput(tvp);
}
out:
@@ -1505,29 +1520,36 @@ ufs_rmdir(ap)
* inode. If we crash in between, the directory
* will be reattached to lost+found,
*/
+ dp->i_effnlink--;
+ ip->i_effnlink--;
+ if (DOINGSOFTDEP(vp)) {
+ softdep_change_linkcnt(dp);
+ softdep_change_linkcnt(ip);
+ }
error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
- if (error)
+ if (error) {
+ dp->i_effnlink++;
+ ip->i_effnlink++;
+ if (DOINGSOFTDEP(vp)) {
+ softdep_change_linkcnt(dp);
+ softdep_change_linkcnt(ip);
+ }
goto out;
+ }
VN_POLLEVENT(dvp, POLLWRITE|POLLNLINK);
cache_purge(dvp);
/*
* Truncate inode. The only stuff left in the directory is "." and
* "..". The "." reference is inconsequential since we are quashing
- * it. We have removed the "." reference and the reference in the
- * parent directory, but there may be other hard links. So,
- * ufs_dirremove will set the UF_IMMUTABLE flag to ensure that no
- * new entries are made. The soft dependency code will arrange to
- * do these operations after the parent directory entry has been
- * deleted on disk, so when running with that code we avoid doing
- * them now.
+ * it. The soft dependency code will arrange to do these operations
+ * after the parent directory entry has been deleted on disk, so
+ * when running with that code we avoid doing them now.
*/
- dp->i_effnlink--;
- dp->i_flag |= IN_CHANGE;
- ip->i_effnlink--;
- ip->i_flag |= IN_CHANGE;
if (!DOINGSOFTDEP(vp)) {
dp->i_nlink--;
+ dp->i_flag |= IN_CHANGE;
ip->i_nlink--;
+ ip->i_flag |= IN_CHANGE;
ioflag = DOINGASYNC(vp) ? 0 : IO_SYNC;
error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred,
cnp->cn_proc);
@@ -2119,7 +2141,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
ip->i_effnlink = 1;
ip->i_nlink = 1;
if (DOINGSOFTDEP(tvp))
- softdep_increase_linkcnt(ip);
+ softdep_change_linkcnt(ip);
if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
suser_xxx(cnp->cn_cred, 0, 0))
ip->i_mode &= ~ISGID;
@@ -2148,6 +2170,8 @@ bad:
ip->i_effnlink = 0;
ip->i_nlink = 0;
ip->i_flag |= IN_CHANGE;
+ if (DOINGSOFTDEP(tvp))
+ softdep_change_linkcnt(ip);
vput(tvp);
return (error);
}