/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright 1998, 2000 Marshall Kirk McKusick.
* Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
* All rights reserved.
*
* The soft updates code is derived from the appendix of a University
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
* "Soft Updates: A Solution to the Metadata Update Problem in File
* Systems", CSE-TR-254-95, August 1995).
*
* Further information about soft updates can be obtained from:
*
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
* 1614 Oxford Street mckusick@mckusick.com
* Berkeley, CA 94709-1608 +1-510-843-9542
* USA
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_ddb.h"
/*
* For now we want the safety net that the DEBUG flag provides.
*/
#ifndef DEBUG
#define DEBUG
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kdb.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <geom/geom.h>
#include <ddb/ddb.h>
#define KTR_SUJ 0 /* Define to KTR_SPARE. */
#ifndef SOFTUPDATES
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Being called at all is treated as fatal: the body only panics.
 */
int
softdep_flushfiles(struct mount *oldmnt, int flags, struct thread *td)
{

	panic("softdep_flushfiles called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * No argument is examined; the call always reports success (0).
 */
int
softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs,
    struct ucred *cred)
{

	return (0);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * There is nothing to set up, so this is a no-op.
 */
void
softdep_initialize(void)
{

}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * There is nothing to tear down, so this is a no-op.
 */
void
softdep_uninitialize(void)
{

}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_unmount(struct mount *mp)
{

	panic("softdep_unmount called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_sbupdate(struct ufsmount *ump, struct fs *fs, struct buf *bp)
{

	panic("softdep_setup_sbupdate called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ino_t newinum,
    int mode)
{

	panic("softdep_setup_inomapdep called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_blkmapdep(struct buf *bp, struct mount *mp,
    ufs2_daddr_t newblkno, int frags, int oldfrags)
{

	panic("softdep_setup_blkmapdep called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_allocdirect(struct inode *ip, ufs_lbn_t lbn,
    ufs2_daddr_t newblkno, ufs2_daddr_t oldblkno, long newsize, long oldsize,
    struct buf *bp)
{

	panic("softdep_setup_allocdirect called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_allocext(struct inode *ip, ufs_lbn_t lbn,
    ufs2_daddr_t newblkno, ufs2_daddr_t oldblkno, long newsize, long oldsize,
    struct buf *bp)
{

	panic("softdep_setup_allocext called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_allocindir_page(struct inode *ip, ufs_lbn_t lbn, struct buf *bp,
    int ptrno, ufs2_daddr_t newblkno, ufs2_daddr_t oldblkno, struct buf *nbp)
{

	panic("softdep_setup_allocindir_page called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_allocindir_meta(struct buf *nbp, struct inode *ip,
    struct buf *bp, int ptrno, ufs2_daddr_t newblkno)
{

	panic("softdep_setup_allocindir_meta called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_journal_freeblocks(struct inode *ip, struct ucred *cred, off_t length,
    int flags)
{

	panic("softdep_journal_freeblocks called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_journal_fsync(struct inode *ip)
{

	panic("softdep_journal_fsync called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_freeblocks(struct inode *ip, off_t length, int flags)
{

	panic("softdep_setup_freeblocks called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_freefile(struct vnode *pvp, ino_t ino, int mode)
{

	panic("softdep_freefile called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal: the body panics and has no return statement.
 */
int
softdep_setup_directory_add(struct buf *bp, struct inode *dp, off_t diroffset,
    ino_t newinum, struct buf *newdirbp, int isnewblk)
{

	panic("softdep_setup_directory_add called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_change_directoryentry_offset(struct buf *bp, struct inode *dp,
    caddr_t base, caddr_t oldloc, caddr_t newloc, int entrysize)
{

	panic("softdep_change_directoryentry_offset called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
    int isrmdir)
{

	panic("softdep_setup_remove called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_setup_directory_change(struct buf *bp, struct inode *dp,
    struct inode *ip, ino_t newinum, int isrmdir)
{

	panic("softdep_setup_directory_change called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_blkfree(struct mount *mp, struct buf *bp, ufs2_daddr_t blkno,
    int frags, struct workhead *wkhd)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_inofree(struct mount *mp, struct buf *bp, ino_t ino,
    struct workhead *wkhd)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_unlink(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_link(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_revert_link(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_rmdir(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_revert_rmdir(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_create(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_revert_create(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_mkdir(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_revert_mkdir(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal; the panic message carries this function's name.
 */
void
softdep_setup_dotdot_link(struct inode *dp, struct inode *ip)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal: the body panics (message carries this
 * function's name) and has no return statement.
 */
int
softdep_prealloc(struct vnode *vp, int waitok)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Neither argument is touched (*vpp is left unmodified); the lookup
 * always fails with ENOENT.
 */
int
softdep_journal_lookup(struct mount *mp, struct vnode **vpp)
{

	return (ENOENT);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_change_linkcnt(struct inode *ip)
{

	panic("softdep_change_linkcnt called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_load_inodeblock(struct inode *ip)
{

	panic("softdep_load_inodeblock called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_update_inodeblock(struct inode *ip, struct buf *bp, int waitfor)
{

	panic("softdep_update_inodeblock called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * vp is the "in_core" copy of the inode; it is not examined and the
 * call always reports success (0).
 */
int
softdep_fsync(struct vnode *vp)
{

	return (0);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * The vnode is ignored; this is a no-op.
 */
void
softdep_fsync_mountdev(struct vnode *vp)
{

}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Stores 0 through countp (which must therefore be a valid pointer) and
 * returns 0; the mount and thread arguments are ignored.
 */
int
softdep_flushworklist(struct mount *oldmnt, int *countp, struct thread *td)
{

	*countp = 0;
	return (0);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal: the body panics and has no return statement.
 */
int
softdep_sync_metadata(struct vnode *vp)
{

	panic("softdep_sync_metadata called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal: the body panics and has no return statement.
 */
int
softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
{

	panic("softdep_sync_buf called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal: the body panics and has no return statement.
 */
int
softdep_slowdown(struct vnode *vp)
{

	panic("softdep_slowdown called");
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * No argument is examined; the call always returns 0.
 */
int
softdep_request_cleanup(struct fs *fs, struct vnode *vp, struct ucred *cred,
    int resource)
{

	return (0);
}
int
softdep_check_suspend(struct mount *mp,
struct vnode *devvp,
int softdep_depcnt,
int softdep_accdepcnt,
int secondary_writes,
int secondary_accwrites)
{
struct bufobj *bo;
int error;
(void) softdep_depcnt,
(void) softdep_accdepcnt;
bo = &devvp->v_bufobj;
ASSERT_BO_WLOCKED(bo);
MNT_ILOCK(mp);
while (mp->mnt_secondary_writes != 0) {
BO_UNLOCK(bo);
msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
(PUSER - 1) | PDROP, "secwr", 0);
BO_LOCK(bo);
MNT_ILOCK(mp);
}
/*
* Reasons for needing more work before suspend:
* - Dirty buffers on devvp.
* - Secondary writes occurred after start of vnode sync loop
*/
error = 0;
if (bo->bo_numoutput > 0 ||
bo->bo_dirty.bv_cnt > 0 ||
secondary_writes != 0 ||
mp->mnt_secondary_writes != 0 ||
secondary_accwrites != mp->mnt_secondary_accwrites)
error = EAGAIN;
BO_UNLOCK(bo);
return (error);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Both output counters are set to zero; mp is ignored.  Both pointers
 * are written through and so must be valid.
 */
void
softdep_get_depcounts(struct mount *mp, int *softdepactivep,
    int *softdepactiveaccp)
{

	(void)mp;
	*softdepactivep = 0;
	*softdepactiveaccp = 0;
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal.  The panic message is derived from
 * __func__ so that it names this function (the previous literal named a
 * non-existent "softdep_buf_appendwork").
 */
void
softdep_buf_append(struct buf *bp, struct workhead *wkhd)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal.  The panic message is derived from
 * __func__ so that it names this function (the previous literal named a
 * non-existent "softdep_inode_appendwork").
 */
void
softdep_inode_append(struct inode *ip, struct ucred *cred,
    struct workhead *wkhd)
{

	panic("%s called", __func__);
}
/*
 * Stub compiled when the kernel is built without SOFTUPDATES.
 * Any call is treated as fatal and panics.
 */
void
softdep_freework(struct workhead *wkhd)
{

	panic("softdep_freework called");
}
#else
/* Advertise soft-updates support as a kernel feature. */
FEATURE(softupdates, "FFS soft-updates support");
/*
 * sysctl tree for dependency statistics: a debug.softdep parent node with
 * one child node per counter array declared below.
 */
static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
"soft updates stats");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
"total dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
"high use dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
"current dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
"current dependencies written");
/*
 * System-wide statistics, indexed by workitem type (D_*), hence the
 * D_LAST + 1 sizing.
 */
/* Items currently allocated; bumped in workitem_alloc, dropped in workitem_free. */
unsigned long dep_current[D_LAST + 1];
/* Largest value dep_current has reached for each type. */
unsigned long dep_highuse[D_LAST + 1];
/* Running count of allocations (and reassignments) per type. */
unsigned long dep_total[D_LAST + 1];
/* Exported under debug.softdep.write; updated outside this part of the file. */
unsigned long dep_write[D_LAST + 1];
/*
 * SOFTDEP_TYPE(type, str, longdesc)
 *
 * Declare everything needed for one workitem type:
 *  - a static malloc type M_<type> with short name "str" and the given
 *    long description, and
 *  - read-only sysctl leaves named <str> under debug.softdep.{total,
 *    current,highuse,write} that export the D_<type> slot of the matching
 *    dep_* array.
 */
#define	SOFTDEP_TYPE(type, str, longdesc)				\
	static MALLOC_DEFINE(M_ ## type, #str, longdesc);		\
	SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
	    &dep_total[D_ ## type], 0, "");				\
	SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,	\
	    &dep_current[D_ ## type], 0, "");				\
	SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,	\
	    &dep_highuse[D_ ## type], 0, "");				\
	SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,	\
	    &dep_write[D_ ## type], 0, "");
/*
 * One SOFTDEP_TYPE() instantiation per workitem type.  Each defines the
 * M_<TYPE> malloc type and the per-type statistics sysctls.
 */
SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
"Block or frag allocated from cyl group map");
SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
/*
 * Malloc types that are defined directly and therefore get no
 * debug.softdep statistics sysctls.
 */
static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
/* Allocation flags for soft-updates allocations: M_WAITOK only. */
#define M_SOFTDEP_FLAGS (M_WAITOK)
/*
 * Translate from workitem type to memory type.
 *
 * The table is indexed by the D_XXX workitem type value, so the entries
 * MUST stay in the same order as the D_XXX definitions:
 * memtype[D_XXX] == M_XXX.  Slot 0 is NULL; TYPENAME() only indexes the
 * table for values in the D_FIRST..D_LAST range.
 */
static struct malloc_type *memtype[] = {
NULL,
M_PAGEDEP,
M_INODEDEP,
M_BMSAFEMAP,
M_NEWBLK,
M_ALLOCDIRECT,
M_INDIRDEP,
M_ALLOCINDIR,
M_FREEFRAG,
M_FREEBLKS,
M_FREEFILE,
M_DIRADD,
M_MKDIR,
M_DIRREM,
M_NEWDIRBLK,
M_FREEWORK,
M_FREEDEP,
M_JADDREF,
M_JREMREF,
M_JMVREF,
M_JNEWBLK,
M_JFREEBLK,
M_JFREEFRAG,
M_JSEG,
M_JSEGDEP,
M_SBDEP,
M_JTRUNC,
M_JFSYNC,
M_SENTINEL
};
/* Unchecked lookup: the caller must pass a valid workitem type. */
#define DtoM(type) (memtype[type])
/*
* Names of malloc types.
*/
#define TYPENAME(type) \
((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \
memtype[type]->ks_shortdesc : "???")
/*
* End system adaptation definitions.
*/
#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino)
#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino)
/*
* Internal function prototypes.
*/
static void check_clear_deps(struct mount *);
static void softdep_error(char *, int);
static int softdep_process_worklist(struct mount *, int);
static int softdep_waitidle(struct mount *, int);
static void drain_output(struct vnode *);
static struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
static int check_inodedep_free(struct inodedep *);
static void clear_remove(struct mount *);
static void clear_inodedeps(struct mount *);
static void unlinked_inodedep(struct mount *, struct inodedep *);
static void clear_unlinked_inodedep(struct inodedep *);
static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
struct diraddhd *);
static int free_pagedep(struct pagedep *);
static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
static int flush_deplist(struct allocdirectlst *, int, int *);
static int sync_cgs(struct mount *, int);
static int handle_written_filepage(struct pagedep *, struct buf *, int);
static int handle_written_sbdep(struct sbdep *, struct buf *);
static void initiate_write_sbdep(struct sbdep *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
static int handle_written_indirdep(struct indirdep *, struct buf *,
struct buf**, int);
static int handle_written_inodeblock(struct inodedep *, struct buf *, int);
static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
uint8_t *);
static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
static void handle_written_jaddref(struct jaddref *);
static void handle_written_jremref(struct jremref *);
static void handle_written_jseg(struct jseg *, struct buf *);
static void handle_written_jnewblk(struct jnewblk *);
static void handle_written_jblkdep(struct jblkdep *);
static void handle_written_jfreefrag(struct jfreefrag *);
static void complete_jseg(struct jseg *);
static void complete_jsegs(struct jseg *);
static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
static inline void inoref_write(struct inoref *, struct jseg *,
struct jrefrec *);
static void handle_allocdirect_partdone(struct allocdirect *,
struct workhead *);
static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
struct workhead *);
static void indirdep_complete(struct indirdep *);
static int indirblk_lookup(struct mount *, ufs2_daddr_t);
static void indirblk_insert(struct freework *);
static void indirblk_remove(struct freework *);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
static void initiate_write_indirdep(struct indirdep*, struct buf *);
static void handle_written_mkdir(struct mkdir *, int);
static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
uint8_t *);
static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static void handle_workitem_freefile(struct freefile *);
static int handle_workitem_remove(struct dirrem *, int);
static struct dirrem *newdirrem(struct buf *, struct inode *,
struct inode *, int, struct dirrem **);
static struct indirdep *indirdep_lookup(struct mount *, struct inode *,
struct buf *);
static void cancel_indirdep(struct indirdep *, struct buf *,
struct freeblks *);
static void free_indirdep(struct indirdep *);
static void free_diradd(struct diradd *, struct workhead *);
static void merge_diradd(struct inodedep *, struct diradd *);
static void complete_diradd(struct diradd *);
static struct diradd *diradd_lookup(struct pagedep *, int);
static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
struct jremref *);
static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
struct jremref *);
static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
struct jremref *, struct jremref *);
static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
struct jremref *);
static void cancel_allocindir(struct allocindir *, struct buf *bp,
struct freeblks *, int);
static int setup_trunc_indir(struct freeblks *, struct inode *,
ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
static void complete_trunc_indir(struct freework *);
static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
int);
static void complete_mkdir(struct mkdir *);
static void free_newdirblk(struct newdirblk *);
static void free_jremref(struct jremref *);
static void free_jaddref(struct jaddref *);
static void free_jsegdep(struct jsegdep *);
static void free_jsegs(struct jblocks *);
static void rele_jseg(struct jseg *);
static void free_jseg(struct jseg *, struct jblocks *);
static void free_jnewblk(struct jnewblk *);
static void free_jblkdep(struct jblkdep *);
static void free_jfreefrag(struct jfreefrag *);
static void free_freedep(struct freedep *);
static void journal_jremref(struct dirrem *, struct jremref *,
struct inodedep *);
static void cancel_jnewblk(struct jnewblk *, struct workhead *);
static int cancel_jaddref(struct jaddref *, struct inodedep *,
struct workhead *);
static void cancel_jfreefrag(struct jfreefrag *);
static inline void setup_freedirect(struct freeblks *, struct inode *,
int, int);
static inline void setup_freeext(struct freeblks *, struct inode *, int, int);
static inline void setup_freeindir(struct freeblks *, struct inode *, int,
ufs_lbn_t, int);
static inline struct freeblks *newfreeblks(struct mount *, struct inode *);
static void freeblks_free(struct ufsmount *, struct freeblks *, int);
static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
int, int);
static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
static int cancel_pagedep(struct pagedep *, struct freeblks *, int);
static int deallocate_dependencies(struct buf *, struct freeblks *, int);
static void newblk_freefrag(struct newblk*);
static void free_newblk(struct newblk *);
static void cancel_allocdirect(struct allocdirectlst *,
struct allocdirect *, struct freeblks *);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
static void freework_freeblock(struct freework *, u_long);
static void freework_enqueue(struct freework *);
static int handle_workitem_freeblocks(struct freeblks *, int);
static int handle_complete_freeblocks(struct freeblks *, int);
static void handle_workitem_indirblk(struct freework *);
static void handle_written_freework(struct freework *);
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
struct workhead *);
static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
struct inodedep *, struct allocindir *, ufs_lbn_t);
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
ufs2_daddr_t, ufs_lbn_t);
static void handle_workitem_freefrag(struct freefrag *);
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
ufs_lbn_t, u_long);
static void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
static struct freefrag *allocindir_merge(struct allocindir *,
struct allocindir *);
static int bmsafemap_find(struct bmsafemap_hashhead *, int,
struct bmsafemap **);
static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
int cg, struct bmsafemap *);
static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
struct newblk **);
static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
static int inodedep_find(struct inodedep_hashhead *, ino_t,
struct inodedep **);
static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
int, struct pagedep **);
static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
struct pagedep **);
static void pause_timer(void *);
static int request_cleanup(struct mount *, int);
static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
static void schedule_cleanup(struct mount *);
static void softdep_ast_cleanup_proc(struct thread *);
static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
static int process_worklist_item(struct mount *, int, int);
static void process_removes(struct vnode *);
static void process_truncates(struct vnode *);
static void jwork_move(struct workhead *, struct workhead *);
static void jwork_insert(struct workhead *, struct jsegdep *);
static void add_to_worklist(struct worklist *, int);
static void wake_worklist(struct worklist *);
static void wait_worklist(struct worklist *, char *);
static void remove_from_worklist(struct worklist *);
static void softdep_flush(void *);
static void softdep_flushjournal(struct mount *);
static int softdep_speedup(struct ufsmount *);
static void worklist_speedup(struct mount *);
static int journal_mount(struct mount *, struct fs *, struct ucred *);
static void journal_unmount(struct ufsmount *);
static int journal_space(struct ufsmount *, int);
static void journal_suspend(struct ufsmount *);
static int journal_unsuspend(struct ufsmount *ump);
static void softdep_prelink(struct vnode *, struct vnode *);
static void add_to_journal(struct worklist *);
static void remove_from_journal(struct worklist *);
static bool softdep_excess_items(struct ufsmount *, int);
static void softdep_process_journal(struct mount *, struct worklist *, int);
static struct jremref *newjremref(struct dirrem *, struct inode *,
struct inode *ip, off_t, nlink_t);
static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
uint16_t);
static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
uint16_t);
static inline struct jsegdep *inoref_jseg(struct inoref *);
static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
ufs2_daddr_t, int);
static void adjust_newfreework(struct freeblks *, int);
static struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
static void move_newblock_dep(struct jaddref *, struct inodedep *);
static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
ufs2_daddr_t, long, ufs_lbn_t);
static struct freework *newfreework(struct ufsmount *, struct freeblks *,
struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
static int jwait(struct worklist *, int);
static struct inodedep *inodedep_lookup_ip(struct inode *);
static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
static struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
static void handle_jwork(struct workhead *);
static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
struct mkdir **);
static struct jblocks *jblocks_create(void);
static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
static void jblocks_free(struct jblocks *, struct mount *, int);
static void jblocks_destroy(struct jblocks *);
static void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
/*
* Exported softdep operations.
*/
static void softdep_disk_io_initiation(struct buf *);
static void softdep_disk_write_complete(struct buf *);
static void softdep_deallocate_dependencies(struct buf *);
static int softdep_count_dependencies(struct buf *bp, int);
/*
 * Global lock over all of soft updates.
 *
 * A default (MTX_DEF) mutex initialised at boot through MTX_SYSINIT.
 * Each macro takes a pointer to the mutex; GBLLOCK_OWNED asserts that the
 * current thread holds it.
 */
static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
#define ACQUIRE_GBLLOCK(lk) mtx_lock(lk)
#define FREE_GBLLOCK(lk) mtx_unlock(lk)
#define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED)
/*
 * Per-filesystem soft-updates locking.
 *
 * Every macro takes a struct ufsmount pointer and operates on the rwlock
 * um_softdep->sd_fslock, always in write (exclusive) mode.  LOCK_OWNED
 * asserts that the lock is write-held.
 */
#define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock)
#define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock)
#define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock)
#define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock)
#define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \
RA_WLOCKED)
/* Allow / disallow recursive acquisition of a buffer's b_lock. */
#define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock)
#define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock)
/*
 * Worklist queue management.
 * These routines require that the lock be held.
 *
 * Two implementations are provided.  Without DEBUG the macros expand
 * inline to a flag update plus the list operation, and the _UNLOCKED
 * variants are plain aliases.  With DEBUG they call worklist_insert() /
 * worklist_remove(), which also check the ONWORKLIST flag and, for the
 * non-_UNLOCKED variants (third argument 1), assert lock ownership.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do { \
(item)->wk_state |= ONWORKLIST; \
LIST_INSERT_HEAD(head, item, wk_list); \
} while (0)
#define WORKLIST_REMOVE(item) do { \
(item)->wk_state &= ~ONWORKLIST; \
LIST_REMOVE(item, wk_list); \
} while (0)
#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE
#else /* DEBUG */
static void worklist_insert(struct workhead *, struct worklist *, int);
static void worklist_remove(struct worklist *, int);
/* The last argument tells the helper whether to assert lock ownership. */
#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
/*
 * DEBUG implementation behind WORKLIST_INSERT and WORKLIST_INSERT_UNLOCKED.
 *
 * Puts item at the head of the given worklist and marks it ONWORKLIST.
 * When locked is non-zero the per-filesystem lock of the item's mount
 * must be held (asserted).  Panics if the item is already flagged as
 * being on a list.
 */
static void
worklist_insert(struct workhead *head, struct worklist *item, int locked)
{

	if (locked != 0) {
		LOCK_OWNED(VFSTOUFS(item->wk_mp));
	}
	if ((item->wk_state & ONWORKLIST) != 0) {
		panic("worklist_insert: %p %s(0x%X) already on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	}
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}
/*
 * DEBUG implementation behind WORKLIST_REMOVE and WORKLIST_REMOVE_UNLOCKED.
 *
 * Unlinks item from whatever worklist it is on and clears ONWORKLIST.
 * When locked is non-zero the per-filesystem lock of the item's mount
 * must be held (asserted).  Panics if the item is not flagged as being
 * on a list.
 */
static void
worklist_remove(struct worklist *item, int locked)
{

	if (locked != 0) {
		LOCK_OWNED(VFSTOUFS(item->wk_mp));
	}
	if ((item->wk_state & ONWORKLIST) == 0) {
		panic("worklist_remove: %p %s(0x%X) not on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	}
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */
/*
 * Collapse two jsegdeps into one.  The reference to the older journal
 * segment (lower js_seq) survives, because newer references cannot be
 * discarded until after older ones; on a tie "one" is kept.
 *
 * "one" must be non-NULL.  "two" may be NULL, in which case "one" is
 * returned untouched.  The discarded jsegdep is removed from its
 * worklist and released with free_jsegdep().
 */
static inline struct jsegdep *
jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
{
	struct jsegdep *keep, *drop;

	if (two == NULL)
		return (one);

	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
		keep = two;
		drop = one;
	} else {
		keep = one;
		drop = two;
	}
	WORKLIST_REMOVE(&drop->jd_list);
	free_jsegdep(drop);
	return (keep);
}
/*
 * Shrink a list by dropping a redundant freedep.  When both freedeps
 * refer to the same freework, "two" is removed from its worklist and
 * freed.  "one" is always the return value; "two" may be NULL.
 */
static inline struct freedep *
freedep_merge(struct freedep *one, struct freedep *two)
{

	/* Nothing to coalesce: no second entry, or different freework. */
	if (two == NULL || one->fd_freework != two->fd_freework)
		return (one);

	WORKLIST_REMOVE(&two->fd_list);
	free_freedep(two);
	return (one);
}
/*
 * Transfer every journal work item from src onto dst, leaving src empty.
 * To keep dst short, jsegdeps are coalesced down to the oldest one and
 * freedeps that share a freework are folded together, both among the
 * items already on dst and among those arriving from src.
 */
static void
jwork_move(struct workhead *dst, struct workhead *src)
{
	struct freedep *freedep;
	struct jsegdep *jsegdep;
	struct worklist *wk, *wkn;

	KASSERT(dst != src,
	    ("jwork_move: dst == src"));
	freedep = NULL;
	jsegdep = NULL;

	/* First pass: coalesce what is already queued on dst. */
	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
		switch (wk->wk_type) {
		case D_JSEGDEP:
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
			break;
		case D_FREEDEP:
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
			break;
		default:
			break;
		}
	}

	/*
	 * Second pass: drain src.  Each item is moved to dst before it is
	 * merged, since the merge helpers unlink whichever entry they drop.
	 */
	while ((wk = LIST_FIRST(src)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(dst, wk);
		switch (wk->wk_type) {
		case D_JSEGDEP:
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
			break;
		case D_FREEDEP:
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
			break;
		default:
			break;
		}
	}
}
static void
jwork_insert(dst, jsegdep)
struct workhead *dst;
struct jsegdep *jsegdep;
{
struct jsegdep *jsegdepn;
struct worklist *wk;
LIST_FOREACH(wk, dst, wk_list)
if (wk->wk_type == D_JSEGDEP)
break;
if (wk == NULL) {
WORKLIST_INSERT(dst, &jsegdep->jd_list);
return;
}
jsegdepn = WK_JSEGDEP(wk);
if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
WORKLIST_REMOVE(wk);
free_jsegdep(jsegdepn);
WORKLIST_INSERT(dst, &jsegdep->jd_list);
} else
free_jsegdep(jsegdep);
}
/*
 * Routines for tracking and managing workitems.
 *
 * workitem_alloc initialises an item's header and accounts for it,
 * workitem_free undoes the accounting and releases the memory, and
 * workitem_reassign moves the accounting of a live item to a new type.
 */
static void workitem_free(struct worklist *, int);
static void workitem_alloc(struct worklist *, int, struct mount *);
static void workitem_reassign(struct worklist *, int);
/*
 * Convenience wrappers that cast a concrete dependency structure to its
 * struct worklist header before calling the routine.
 */
#define WORKITEM_FREE(item, type) \
workitem_free((struct worklist *)(item), (type))
#define WORKITEM_REASSIGN(item, type) \
workitem_reassign((struct worklist *)(item), (type))
/*
 * Release a workitem and undo the accounting done when it was allocated.
 *
 * item - the workitem; it must no longer be on any worklist.
 * type - the D_* type the caller believes the item has.  Under DEBUG a
 *        mismatch with item->wk_type panics unless type is D_NEWBLK.
 *        The memory is returned to the malloc type selected by "type",
 *        not by item->wk_type.
 *
 * The per-filesystem lock of the item's mount must be held.  A thread
 * sleeping on the item (IOWAITING) is woken, as is a waiter on
 * softdep_deps when the last dependency of the mount goes away and
 * softdep_req is set.
 *
 * NOTE(review): dep_current is decremented atomically here without the
 * global lock, whereas the visible increment sites update it
 * non-atomically under that lock - confirm this mix is intended.
 */
static void
workitem_free(struct worklist *item, int type)
{
	struct ufsmount *ump;

#ifdef DEBUG
	if ((item->wk_state & ONWORKLIST) != 0)
		panic("workitem_free: %s(0x%X) still on list",
		    TYPENAME(item->wk_type), item->wk_state);
	if (type != D_NEWBLK && item->wk_type != type)
		panic("workitem_free: type mismatch %s != %s",
		    TYPENAME(item->wk_type), TYPENAME(type));
#endif
	if ((item->wk_state & IOWAITING) != 0)
		wakeup(item);

	ump = VFSTOUFS(item->wk_mp);
	LOCK_OWNED(ump);

	/* Per-mount total of outstanding dependencies. */
	KASSERT(ump->softdep_deps > 0,
	    ("workitem_free: %s: softdep_deps going negative",
	    ump->um_fs->fs_fsmnt));
	ump->softdep_deps--;
	if (ump->softdep_deps == 0 && ump->softdep_req)
		wakeup(&ump->softdep_deps);

	/* Per-type counters, global and per-mount. */
	KASSERT(dep_current[item->wk_type] > 0,
	    ("workitem_free: %s: dep_current[%s] going negative",
	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
	atomic_subtract_long(&dep_current[item->wk_type], 1);
	ump->softdep_curdeps[item->wk_type]--;

	free(item, DtoM(type));
}
/*
 * Initialize the worklist header of a newly allocated dependency
 * structure and account for it, first in the global per-type
 * statistics (under the global lock) and then in the per-mount
 * counters (under the mount's softdep lock).  Both locks are acquired
 * and released here, one after the other.
 */
static void
workitem_alloc(item, type, mp)
	struct worklist *item;
	int type;
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);

	/* Stamp the header: owning mount, dependency type, no state bits. */
	item->wk_mp = mp;
	item->wk_type = type;
	item->wk_state = 0;

	/* System-wide accounting, including the high-water mark. */
	ACQUIRE_GBLLOCK(&lk);
	dep_total[type] += 1;
	dep_current[type] += 1;
	if (dep_highuse[type] < dep_current[type])
		dep_highuse[type] = dep_current[type];
	FREE_GBLLOCK(&lk);

	/* Per-filesystem accounting. */
	ACQUIRE_LOCK(ump);
	ump->softdep_deps += 1;
	ump->softdep_accdeps += 1;
	ump->softdep_curdeps[type] += 1;
	FREE_LOCK(ump);
}
/*
* Change the dependency type of an existing work item to newtype,
* moving its accounting from the old type's counters to the new type's
* in both the per-mount and the global statistics. LOCK_OWNED(ump) is
* invoked, so the caller is expected to hold the per-mount softdep
* lock; the global lock is taken here around the global counters.
*/
static void
workitem_reassign(item, newtype)
struct worklist *item;
int newtype;
{
struct ufsmount *ump;
ump = VFSTOUFS(item->wk_mp);
LOCK_OWNED(ump);
KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
("workitem_reassign: %s: softdep_curdeps[%s] going negative",
VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
/* Per-mount counters: one fewer of the old type, one more of the new. */
ump->softdep_curdeps[item->wk_type] -= 1;
ump->softdep_curdeps[newtype] += 1;
KASSERT(dep_current[item->wk_type] > 0,
("workitem_reassign: %s: dep_current[%s] going negative",
VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
/*
* Global counters. The new type also gets its high-water mark and
* running total updated, as if it had just been allocated.
*/
ACQUIRE_GBLLOCK(&lk);
dep_current[newtype]++;
dep_current[item->wk_type]--;
if (dep_current[newtype] > dep_highuse[newtype])
dep_highuse[newtype] = dep_current[newtype];
dep_total[newtype]++;
FREE_GBLLOCK(&lk);
/* Only now does the item itself take on the new type. */
item->wk_type = newtype;
}
/*
* Workitem queue management
*
* max_softdeps, tickdelay and softdep_flushcache are exported
* read-write through the _debug_softdep sysctl node declared below.
*/
static int max_softdeps; /* maximum number of structs before slowdown */
static int tickdelay = 2; /* number of ticks to pause during slowdown */
static int proc_waiting; /* tracks whether we have a timeout posted */
static int *stat_countp; /* statistic to count in proc_waiting timeout */
/* Initialized by softdep_initialize() with callout_init_mtx() on lk. */
static struct callout softdep_callout;
static int req_clear_inodedeps; /* syncer process flush some inodedeps */
static int req_clear_remove; /* syncer process flush some freeblks */
static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
/*
* runtime statistics
*
* Each counter below is published through a SYSCTL_INT() entry under
* the _debug_softdep node.
*/
static int stat_flush_threads; /* number of softdep flushing threads */
static int stat_worklist_push; /* number of worklist cleanups */
static int stat_blk_limit_push; /* number of times block limit neared */
static int stat_ino_limit_push; /* number of times inode limit neared */
static int stat_blk_limit_hit; /* number of times block slowdown imposed */
static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */
static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */
static int stat_journal_min; /* Times hit journal min threshold */
static int stat_journal_low; /* Times hit journal low threshold */
static int stat_journal_wait; /* Times blocked in jwait(). */
static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */
static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */
static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
static int stat_cleanup_failures; /* Number of cleanup requests that failed */
static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
/*
 * Export the soft updates tunables and runtime statistics under the
 * _debug_softdep sysctl node.  Every OID carries a description so that
 * "sysctl -d" output is meaningful; the text mirrors the comment on the
 * variable each OID exposes.
 */
SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
    &max_softdeps, 0, "Maximum number of structs before slowdown");
SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
    &tickdelay, 0, "Number of ticks to pause during slowdown");
SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
    &stat_flush_threads, 0, "Number of softdep flushing threads");
SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
    &stat_worklist_push, 0, "Number of worklist cleanups");
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
    &stat_blk_limit_push, 0, "Number of times block limit neared");
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
    &stat_ino_limit_push, 0, "Number of times inode limit neared");
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
    &stat_blk_limit_hit, 0, "Number of times block slowdown imposed");
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
    &stat_ino_limit_hit, 0, "Number of times inode slowdown imposed");
SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
    &stat_sync_limit_hit, 0, "Number of synchronous slowdowns imposed");
SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
    &stat_indir_blk_ptrs, 0, "Bufs redirtied as indir ptrs not written");
SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
    &stat_inode_bitmap, 0, "Bufs redirtied as inode bitmap not written");
SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
    &stat_direct_blk_ptrs, 0, "Bufs redirtied as direct ptrs not written");
SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
    &stat_dir_entry, 0, "Bufs redirtied as dir entry cannot write");
SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
    &stat_jaddref, 0, "Bufs redirtied as ino bitmap can not write");
SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
    &stat_jnewblk, 0, "Bufs redirtied as blk bitmap can not write");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
    &stat_journal_low, 0, "Times hit journal low threshold");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
    &stat_journal_min, 0, "Times hit journal min threshold");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
    &stat_journal_wait, 0, "Times blocked in jwait()");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
    &stat_jwait_filepage, 0, "Times blocked in jwait() for filepage");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
    &stat_jwait_freeblks, 0, "Times blocked in jwait() for freeblks");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
    &stat_jwait_inode, 0, "Times blocked in jwait() for inodes");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
    &stat_jwait_newblk, 0, "Times blocked in jwait() for newblks");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
    &stat_cleanup_blkrequests, 0, "Number of block cleanup requests");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
    &stat_cleanup_inorequests, 0, "Number of inode cleanup requests");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
    &stat_cleanup_high_delay, 0, "Maximum cleanup delay (in ticks)");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
    &stat_cleanup_retries, 0, "Number of cleanups that needed to flush");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
    &stat_cleanup_failures, 0, "Number of cleanup requests that failed");
SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
    &softdep_flushcache, 0, "Should we do BIO_FLUSH?");
SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
    &stat_emptyjblocks, 0, "Number of potentially empty journal blocks");
/* The _vfs_ffs sysctl node is defined elsewhere; declare it for use here. */
SYSCTL_DECL(_vfs_ffs);
/* Whether to recompute the summary at mount time */
static int compute_summary_at_mount = 0;
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
&compute_summary_at_mount, 0, "Recompute summary at mount");
/* When non-zero, softdep_flush() prints a line as each thread starts/stops. */
static int print_threads = 0;
SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
&print_threads, 0, "Notify flusher thread start/stop");
/*
* List of all filesystems mounted with soft updates.
* softdep_speedup() walks and reorders it while holding the global lock.
*/
static TAILQ_HEAD(, mount_softdeps) softdepmounts;
/*
* This function cleans the worklist for a filesystem.
* Each filesystem running with soft dependencies gets its own
* thread to run in this function. The thread is started up in
* softdep_mount and shutdown in softdep_unmount. They show up
* as part of the kernel "bufdaemon" process whose process
* entry is available in bufdaemonproc.
*
* addr is the struct mount pointer of the filesystem to service.
* The thread loops forever: it drains the worklist, then sleeps until
* it is woken or hz / 2 ticks pass, and exits once it finds FLUSH_EXIT
* set in ump->softdep_flags.
*/
/* Count of softdep_speedup() searches that found no filesystem to wake. */
static int searchfailed;
extern struct proc *bufdaemonproc;
static void
softdep_flush(addr)
void *addr;
{
struct mount *mp;
struct thread *td;
struct ufsmount *ump;
td = curthread;
td->td_pflags |= TDP_NORUNNINGBUF;
mp = (struct mount *)addr;
ump = VFSTOUFS(mp);
atomic_add_int(&stat_flush_threads, 1);
/*
* Announce that the thread is running: clear FLUSH_STARTING and wake
* whoever sleeps on softdep_flushtd (presumably the code that created
* this thread -- confirm against softdep_mount).
*/
ACQUIRE_LOCK(ump);
ump->softdep_flags &= ~FLUSH_STARTING;
wakeup(&ump->softdep_flushtd);
FREE_LOCK(ump);
if (print_threads) {
if (stat_flush_threads == 1)
printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
bufdaemonproc->p_pid);
printf("Start thread %s\n", td->td_name);
}
for (;;) {
/*
* Keep going while the last pass completed work items or,
* on a journaled (SUJ) mount, while the journal is marked
* suspended. kthread_suspend_check() runs between passes.
*/
while (softdep_process_worklist(mp, 0) > 0 ||
(MOUNTEDSUJ(mp) &&
VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
kthread_suspend_check();
ACQUIRE_LOCK(ump);
/* Sleep only if no cleanup or exit request arrived meanwhile. */
if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
"sdflush", hz / 2);
ump->softdep_flags &= ~FLUSH_CLEANUP;
/*
* Check to see if we are done and need to exit.
*/
if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
FREE_LOCK(ump);
continue;
}
/*
* Acknowledge the exit request by clearing the flag and waking
* sleepers on softdep_flags (presumably the requester).
*/
ump->softdep_flags &= ~FLUSH_EXIT;
FREE_LOCK(ump);
wakeup(&ump->softdep_flags);
if (print_threads)
printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
atomic_subtract_int(&stat_flush_threads, 1);
kthread_exit();
/* kthread_exit() is not expected to return. */
panic("kthread_exit failed\n");
}
}
/*
 * Ask the flusher thread of the given mount to run now.  Unless a
 * cleanup or an exit is already pending, FLUSH_CLEANUP is set; the
 * thread sleeping on softdep_flushtd is woken in either case.
 * LOCK_OWNED(ump) is invoked on entry.
 */
static void
worklist_speedup(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);

	LOCK_OWNED(ump);
	if (!(ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)))
		ump->softdep_flags |= FLUSH_CLEANUP;
	wakeup(&ump->softdep_flushtd);
}
/*
* Speed up dependency processing for ump: wake its own flusher thread,
* call bd_speedup(), and, when a global shortage of inodedeps or
* removals has been flagged, also wake the flusher of one other
* filesystem that holds more than its share. Returns the value of
* speedup_syncer(). LOCK_OWNED(ump) is invoked on entry.
*/
static int
softdep_speedup(ump)
struct ufsmount *ump;
{
struct ufsmount *altump;
struct mount_softdeps *sdp;
LOCK_OWNED(ump);
worklist_speedup(ump->um_mountp);
bd_speedup();
/*
* If we have global shortages, then we need other
* filesystems to help with the cleanup. Here we wakeup a
* flusher thread for a filesystem that is over its fair
* share of resources.
*/
if (req_clear_inodedeps || req_clear_remove) {
ACQUIRE_GBLLOCK(&lk);
/*
* A candidate must exceed its share (max_softdeps, or half of
* it for removals, divided by the number of flusher threads)
* and its lock must be obtainable without blocking; the loop
* exits with altump locked when one is found.
* NOTE(review): stat_flush_threads is used as a divisor here;
* confirm it cannot be zero on this path.
*/
TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
if ((altump = sdp->sd_ump) == ump)
continue;
if (((req_clear_inodedeps &&
altump->softdep_curdeps[D_INODEDEP] >
max_softdeps / stat_flush_threads) ||
(req_clear_remove &&
altump->softdep_curdeps[D_DIRREM] >
(max_softdeps / 2) / stat_flush_threads)) &&
TRY_ACQUIRE_LOCK(altump))
break;
}
if (sdp == NULL) {
/* No suitable filesystem; just record the failed search. */
searchfailed++;
FREE_GBLLOCK(&lk);
} else {
/*
* Move to the end of the list so we pick a
* different one on our next try.
*/
TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
FREE_GBLLOCK(&lk);
/* Same request worklist_speedup() makes, applied to altump. */
if ((altump->softdep_flags &
(FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
altump->softdep_flags |= FLUSH_CLEANUP;
altump->um_softdep->sd_cleanups++;
wakeup(&altump->softdep_flushtd);
FREE_LOCK(altump);
}
}
return (speedup_syncer());
}
/*
 * Queue a work item on the mount's pending work list.
 * LOCK_OWNED(ump) is invoked on entry, so the lock must be held.
 * Items are normally appended after the current tail; WK_HEAD places
 * the item at the front instead, and WK_NODELAY additionally prods the
 * flusher thread through worklist_speedup().  It is a panic to queue
 * an item that is already marked ONWORKLIST.
 */
#define WK_HEAD 0x0001 /* Add to HEAD. */
#define WK_NODELAY 0x0002 /* Process immediately. */
static void
add_to_worklist(wk, flags)
	struct worklist *wk;
	int flags;
{
	struct ufsmount *ump = VFSTOUFS(wk->wk_mp);

	LOCK_OWNED(ump);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST;

	if (ump->softdep_on_worklist != 0 && (flags & WK_HEAD) == 0) {
		/* Non-empty queue, ordinary request: append at the tail. */
		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
		ump->softdep_worklist_tail = wk;
	} else {
		/*
		 * Front insertion.  On an empty queue the new item is
		 * also the tail; otherwise the tail is left alone.
		 */
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
		if (ump->softdep_on_worklist == 0)
			ump->softdep_worklist_tail = wk;
	}
	ump->softdep_on_worklist++;

	if (flags & WK_NODELAY)
		worklist_speedup(wk->wk_mp);
}
/*
 * Take a work item off the mount's pending work list and decrement the
 * queue length.  When the item is the recorded tail, the tail pointer
 * is first stepped back to whatever wk_list.le_prev refers to,
 * reinterpreted as a struct worklist pointer.
 * NOTE(review): that cast presumably relies on wk_list being the first
 * member of struct worklist -- confirm against the structure definition.
 */
static void
remove_from_worklist(wk)
	struct worklist *wk;
{
	struct ufsmount *ump = VFSTOUFS(wk->wk_mp);
	struct worklist *newtail;

	if (wk == ump->softdep_worklist_tail) {
		newtail = (struct worklist *)wk->wk_list.le_prev;
		ump->softdep_worklist_tail = newtail;
	}
	WORKLIST_REMOVE(wk);
	ump->softdep_on_worklist--;
}
/*
 * If a thread has flagged itself as waiting on this work item
 * (IOWAITING), clear the flag and wake it; otherwise do nothing.
 */
static void
wake_worklist(wk)
	struct worklist *wk;
{

	if ((wk->wk_state & IOWAITING) == 0)
		return;
	wk->wk_state &= ~IOWAITING;
	wakeup(wk);
}
/*
 * Sleep on a work item until someone calls wakeup() on it, after
 * marking it IOWAITING so that wake_worklist() and workitem_free()
 * know a sleeper exists.  The mount's softdep lock is handed to
 * msleep() as the interlock; wmesg is the wait message.
 */
static void
wait_worklist(wk, wmesg)
	struct worklist *wk;
	char *wmesg;
{
	struct ufsmount *ump = VFSTOUFS(wk->wk_mp);

	wk->wk_state |= IOWAITING;
	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
}
/*
* Process that runs once per second to handle items in the background queue.
*
* Note that we ensure that everything is done in the order in which they
* appear in the queue. The code below depends on this property to ensure
* that blocks of a file are freed before the inode itself is freed. This
* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
* until all the old ones have been purged from the dependency lists.
*
* mp is the mount to service. A non-zero "full" removes the one second
* time limit and makes the journal pass use MNT_WAIT. The return value
* is the sum of the counts reported by process_worklist_item(), or 0
* if the mount is not using soft updates. The softdep lock is acquired
* and released here.
*/
static int
softdep_process_worklist(mp, full)
struct mount *mp;
int full;
{
int cnt, matchcnt;
struct ufsmount *ump;
long starttime;
KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
if (MOUNTEDSOFTDEP(mp) == 0)
return (0);
matchcnt = 0;
ump = VFSTOUFS(mp);
ACQUIRE_LOCK(ump);
starttime = time_second;
softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
check_clear_deps(mp);
while (ump->softdep_on_worklist > 0) {
/*
* Handle up to 10 items per call; a return of 0 ends the pass.
* NOTE(review): process_worklist_item() can also return -1,
* which is added to matchcnt and does not end the loop --
* confirm this is intended.
*/
if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
break;
else
matchcnt += cnt;
check_clear_deps(mp);
/*
* We do not generally want to stop for buffer space, but if
* we are really being a buffer hog, we will stop and wait.
*/
if (should_yield()) {
FREE_LOCK(ump);
kern_yield(PRI_USER);
bwillwrite();
ACQUIRE_LOCK(ump);
}
/*
* Never allow processing to run for more than one
* second. This gives the syncer thread the opportunity
* to pause if appropriate.
*/
if (!full && starttime != time_second)
break;
}
/* Only the non-full (background) path calls journal_unsuspend(). */
if (full == 0)
journal_unsuspend(ump);
FREE_LOCK(ump);
return (matchcnt);
}
/*
* Process all removes associated with a vnode if we are running out of
* journal space. Any other process which attempts to flush these will
* be unable as we have the vnodes locked.
*
* LOCK_OWNED(ump) is invoked on entry. The lock is released around each
* handle_workitem_remove() call and reacquired afterwards, so it is
* held again on every return path.
*/
static void
process_removes(vp)
struct vnode *vp;
{
struct inodedep *inodedep;
struct dirrem *dirrem;
struct ufsmount *ump;
struct mount *mp;
ino_t inum;
mp = vp->v_mount;
ump = VFSTOUFS(mp);
LOCK_OWNED(ump);
inum = VTOI(vp)->i_number;
for (;;) {
top:
/* Look the inodedep up afresh on every pass; stop when it is gone. */
if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
return;
LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
/*
* If another thread is trying to lock this vnode
* it will fail but we must wait for it to do so
* before we can proceed.
*/
if (dirrem->dm_state & INPROGRESS) {
wait_worklist(&dirrem->dm_list, "pwrwait");
goto top;
}
/* Pick the first remove that is complete and still queued. */
if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
(COMPLETE | ONWORKLIST))
break;
}
/* The scan ran off the end: nothing left that we can process. */
if (dirrem == NULL)
return;
remove_from_worklist(&dirrem->dm_list);
FREE_LOCK(ump);
if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
panic("process_removes: suspended filesystem");
handle_workitem_remove(dirrem, 0);
vn_finished_secondary_write(mp);
ACQUIRE_LOCK(ump);
}
}
/*
* Process all truncations associated with a vnode if we are running out
* of journal space. This is called when the vnode lock is already held
* and no other process can clear the truncation. The function returns
* nothing; it loops until the inode has no inodedep or a scan of its
* freeblks list completes without finding anything to act on.
*
* LOCK_OWNED(ump) is invoked on entry. The lock is released and
* reacquired around the blocking calls made below.
*/
static void
process_truncates(vp)
struct vnode *vp;
{
struct inodedep *inodedep;
struct freeblks *freeblks;
struct ufsmount *ump;
struct mount *mp;
ino_t inum;
int cgwait;
mp = vp->v_mount;
ump = VFSTOUFS(mp);
LOCK_OWNED(ump);
inum = VTOI(vp)->i_number;
for (;;) {
if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
return;
cgwait = 0;
/*
* Each "break" below leaves freeblks non-NULL, which makes the
* outer loop start over with a fresh lookup.
*/
TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
/* Journal entries not yet written. */
if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
jwait(&LIST_FIRST(
&freeblks->fb_jblkdephd)->jb_list,
MNT_WAIT);
break;
}
/* Another thread is executing this item. */
if (freeblks->fb_state & INPROGRESS) {
wait_worklist(&freeblks->fb_list, "ptrwait");
break;
}
/* Freeblks is waiting on a inode write. */
if ((freeblks->fb_state & COMPLETE) == 0) {
FREE_LOCK(ump);
ffs_update(vp, 1);
ACQUIRE_LOCK(ump);
break;
}
/* Ready and still queued: claim it and run it ourselves. */
if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
(ALLCOMPLETE | ONWORKLIST)) {
remove_from_worklist(&freeblks->fb_list);
freeblks->fb_state |= INPROGRESS;
FREE_LOCK(ump);
if (vn_start_secondary_write(NULL, &mp,
V_NOWAIT))
panic("process_truncates: "
"suspended filesystem");
handle_workitem_freeblocks(freeblks, 0);
vn_finished_secondary_write(mp);
ACQUIRE_LOCK(ump);
break;
}
/* Count the entries that have fb_cgwait set. */
if (freeblks->fb_cgwait)
cgwait++;
}
/* Some entry had fb_cgwait set: sync the cgs and snapshots, retry. */
if (cgwait) {
FREE_LOCK(ump);
sync_cgs(mp, MNT_WAIT);
ffs_sync_snap(mp, MNT_WAIT);
ACQUIRE_LOCK(ump);
continue;
}
/* The scan reached the end of the list without acting: done. */
if (freeblks == NULL)
break;
}
return;
}
/*
* Process items on the worklist until "target" of them have been
* handled successfully or the list is exhausted.
*
* mp is the mount whose pending list is processed and flags is passed
* to the handlers that take one. Returns the number of items handled
* without error, or -1 if the current thread has TDP_COWINPROGRESS set.
* LOCK_OWNED(ump) is invoked; the lock is released while each handler
* runs and is held again on return.
*
* A sentinel entry of type D_SENTINEL, living on this thread's stack,
* marks our position in the list so that it can be found again after
* the lock has been dropped.
*/
static int
process_worklist_item(mp, target, flags)
struct mount *mp;
int target;
int flags;
{
struct worklist sentinel;
struct worklist *wk;
struct ufsmount *ump;
int matchcnt;
int error;
KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
/*
* If we are being called because of a process doing a
* copy-on-write, then it is not safe to write as we may
* recurse into the copy-on-write routine.
*/
if (curthread->td_pflags & TDP_COWINPROGRESS)
return (-1);
PHOLD(curproc); /* Don't let the stack go away. */
ump = VFSTOUFS(mp);
LOCK_OWNED(ump);
matchcnt = 0;
sentinel.wk_mp = NULL;
sentinel.wk_type = D_SENTINEL;
LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
/* Always take the entry that currently follows our sentinel. */
for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
wk = LIST_NEXT(&sentinel, wk_list)) {
/* Another sentinel: step our own sentinel past it. */
if (wk->wk_type == D_SENTINEL) {
LIST_REMOVE(&sentinel, wk_list);
LIST_INSERT_AFTER(wk, &sentinel, wk_list);
continue;
}
if (wk->wk_state & INPROGRESS)
panic("process_worklist_item: %p already in progress.",
wk);
/* Claim the item and run its handler without the lock. */
wk->wk_state |= INPROGRESS;
remove_from_worklist(wk);
FREE_LOCK(ump);
if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
panic("process_worklist_item: suspended filesystem");
switch (wk->wk_type) {
case D_DIRREM:
/* removal of a directory entry */
error = handle_workitem_remove(WK_DIRREM(wk), flags);
break;
case D_FREEBLKS:
/* releasing blocks and/or fragments from a file */
error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
flags);
break;
case D_FREEFRAG:
/* releasing a fragment when replaced as a file grows */
handle_workitem_freefrag(WK_FREEFRAG(wk));
error = 0;
break;
case D_FREEFILE:
/* releasing an inode when its link count drops to 0 */
handle_workitem_freefile(WK_FREEFILE(wk));
error = 0;
break;
default:
panic("%s_process_worklist: Unknown type %s",
"softdep", TYPENAME(wk->wk_type));
/* NOTREACHED */
}
vn_finished_secondary_write(mp);
ACQUIRE_LOCK(ump);
/* Success: count it and stop once the target has been reached. */
if (error == 0) {
if (++matchcnt == target)
break;
continue;
}
/*
* We have to retry the worklist item later. Wake up any
* waiters who may be able to complete it immediately and
* add the item back to the head so we don't try to execute
* it again.
*/
wk->wk_state &= ~INPROGRESS;
wake_worklist(wk);
add_to_worklist(wk, WK_HEAD);
}
/* Sentinel could've become the tail from remove_from_worklist. */
if (ump->softdep_worklist_tail == &sentinel)
ump->softdep_worklist_tail =
(struct worklist *)sentinel.wk_list.le_prev;
LIST_REMOVE(&sentinel, wk_list);
PRELE(curproc);
return (matchcnt);
}
/*
 * Transfer every dependency attached to oldbp onto newbp, keeping the
 * items in their original order.  Returns 0 when oldbp carries no
 * dependencies; otherwise returns 1 if bmsafemap_backgroundwrite()
 * reported non-zero for any D_BMSAFEMAP item moved, else 0.  The
 * softdep lock of the owning mount is taken for the duration.
 */
int
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *item, *last;
	struct ufsmount *ump;
	int dirty;

	item = LIST_FIRST(&oldbp->b_dep);
	if (item == NULL)
		return (0);
	KASSERT(MOUNTEDSOFTDEP(item->wk_mp) != 0,
	    ("softdep_move_dependencies called on non-softdep filesystem"));
	ump = VFSTOUFS(item->wk_mp);
	last = NULL;
	dirty = 0;

	ACQUIRE_LOCK(ump);
	for (;;) {
		item = LIST_FIRST(&oldbp->b_dep);
		if (item == NULL)
			break;
		LIST_REMOVE(item, wk_list);
		if (item->wk_type == D_BMSAFEMAP) {
			if (bmsafemap_backgroundwrite(WK_BMSAFEMAP(item),
			    newbp))
				dirty = 1;
		}
		/* Append after the previously moved item to keep order. */
		if (last != NULL)
			LIST_INSERT_AFTER(last, item, wk_list);
		else
			LIST_INSERT_HEAD(&newbp->b_dep, item, wk_list);
		last = item;
	}
	FREE_LOCK(ump);

	return (dirty);
}
/*
* Purge the work list of all items associated with a particular mount point.
*/
int
softdep_flushworklist(oldmnt, countp, td)
struct mount *oldmnt;
int *countp;
struct thread *td;
{
struct vnode *devvp;
struct ufsmount *ump;
int count, error;
/*
* Alternately flush the block device associated with the mount
* point and process any dependencies that the flushing
* creates. We continue until no more worklist dependencies
* are found.
*/
*countp = 0;
error = 0;
ump = VFSTOUFS(oldmnt);
devvp = ump->um_devvp;
while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
*countp += count;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(devvp, MNT_WAIT, td);
VOP_UNLOCK(devvp, 0);
if (error != 0)
break;
}
return (error);
}
/* Maximum number of sleep-and-fsync rounds softdep_waitidle() attempts. */
#define SU_WAITIDLE_RETRIES 20
/*
* Wait for the mount's dependency count (softdep_deps) to reach zero.
*
* Each round sets softdep_req, sleeps on softdep_deps for up to
* 10 * hz ticks, and then fsyncs the device vnode. Returns 0 on
* success, the VOP_FSYNC() error if one occurs, or EBUSY when
* dependencies remain after SU_WAITIDLE_RETRIES rounds.
*
* flags is only examined inside the KASSERT below, which is presumably
* why it is marked __unused (KASSERT may compile to nothing).
*/
static int
softdep_waitidle(struct mount *mp, int flags __unused)
{
struct ufsmount *ump;
struct vnode *devvp;
struct thread *td;
int error, i;
ump = VFSTOUFS(mp);
devvp = ump->um_devvp;
td = curthread;
error = 0;
ACQUIRE_LOCK(ump);
for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
/* Ask workitem_free() to wake us when the count reaches zero. */
ump->softdep_req = 1;
KASSERT((flags & FORCECLOSE) == 0 ||
ump->softdep_on_worklist == 0,
("softdep_waitidle: work added after flush"));
/* PDROP: the lock is not held when msleep() returns. */
msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
"softdeps", 10 * hz);
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(devvp, MNT_WAIT, td);
VOP_UNLOCK(devvp, 0);
/* Retake the lock before testing error so it is held at loop exit. */
ACQUIRE_LOCK(ump);
if (error != 0)
break;
}
ump->softdep_req = 0;
if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
error = EBUSY;
printf("softdep_waitidle: Failed to flush worklist for %p\n",
mp);
}
FREE_LOCK(ump);
return (error);
}
/*
* Flush all vnodes and worklist items associated with a specified mount point.
*
* flags is passed on to ffs_flushfiles() and softdep_waitidle().
* Returns 0 on success, an error from one of the flush steps, or EBUSY
* when the flushing does not settle. When unmounting, the whole
* sequence is retried (up to retry_flush_count times in total) while
* vnodes or, with QUOTA, quota files remain.
*/
int
softdep_flushfiles(oldmnt, flags, td)
struct mount *oldmnt;
int flags;
struct thread *td;
{
#ifdef QUOTA
struct ufsmount *ump;
int i;
#endif
int error, early, depcount, loopcnt, retry_flush_count, retry;
int morework;
KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
("softdep_flushfiles called on non-softdep filesystem"));
loopcnt = 10;
retry_flush_count = 3;
retry_flush:
error = 0;
/*
* Alternately flush the vnodes associated with the mount
* point and process any dependencies that the flushing
* creates. In theory, this loop can happen at most twice,
* but we give it a few extra just to be sure.
*/
for (; loopcnt > 0; loopcnt--) {
/*
* Do another flush in case any vnodes were brought in
* as part of the cleanup operations.
*/
/*
* EARLYFLUSH is added only when unmounting and this is not
* the final retry (the || binds tighter than the ?:).
*/
early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
break;
if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
depcount == 0)
break;
}
/*
* If we are unmounting then it is an error to fail. If we
* are simply trying to downgrade to read-only, then filesystem
* activity can keep us busy forever, so we just fail with EBUSY.
*/
if (loopcnt == 0) {
if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
panic("softdep_flushfiles: looping");
error = EBUSY;
}
if (!error)
error = softdep_waitidle(oldmnt, flags);
if (!error) {
if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
retry = 0;
MNT_ILOCK(oldmnt);
KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
("softdep_flushfiles: !MNTK_NOINSMNTQ"));
/* Vnodes still on the mount's list mean more flushing is needed. */
morework = oldmnt->mnt_nvnodelistsize > 0;
#ifdef QUOTA
/* So does any quota file vnode that is still set. */
ump = VFSTOUFS(oldmnt);
UFS_LOCK(ump);
for (i = 0; i < MAXQUOTAS; i++) {
if (ump->um_quotas[i] != NULLVP)
morework = 1;
}
UFS_UNLOCK(ump);
#endif
if (morework) {
/* Retry with a smaller loop budget, or give up with EBUSY. */
if (--retry_flush_count > 0) {
retry = 1;
loopcnt = 3;
} else
error = EBUSY;
}
MNT_IUNLOCK(oldmnt);
if (retry)
goto retry_flush;
}
}
return (error);
}
/*
* Structure hashing.
*
* There are four types of structures that can be looked up:
* 1) pagedep structures identified by mount point, inode number,
* and logical block.
* 2) inodedep structures identified by mount point and inode number.
* 3) newblk structures identified by mount point and
* physical block number.
* 4) bmsafemap structures identified by mount point and
* cylinder group number.
*
* The "pagedep" and "inodedep" dependency structures are hashed
* separately from the file blocks and inodes to which they correspond.
* This separation helps when the in-memory copy of an inode or
* file block must be replaced. It also obviates the need to access
* an inode or file page when simply updating (or de-allocating)
* dependency structures. Lookup of newblk structures is needed to
* find newly allocated blocks when trying to associate them with
* their allocdirect or allocindir structure.
*
* The lookup routines optionally create and hash a new instance when
* an existing entry is not found. The bmsafemap lookup routine always
* allocates a new structure if an existing one is not found.
*/
#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
/*
* Structures and routines associated with pagedep caching.
*/
#define PAGEDEP_HASH(ump, inum, lbn) \
(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
static int
pagedep_find(pagedephd, ino, lbn, pagedeppp)
struct pagedep_hashhead *pagedephd;
ino_t ino;
ufs_lbn_t lbn;
struct pagedep **pagedeppp;
{
struct pagedep *pagedep;
LIST_FOREACH(pagedep, pagedephd, pd_hash) {
if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
*pagedeppp = pagedep;
return (1);
}
}
*pagedeppp = NULL;
return (0);
}
/*
* Look up a pagedep. Return 1 if found, 0 otherwise.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in pagedeppp.
* LOCK_OWNED(ump) is invoked on entry, so the caller is expected to
* hold the per-mount softdep lock; it is dropped and retaken around
* the allocation. (An earlier version of this comment required
* "splbio interrupts blocked".)
*
* bp may be NULL for a pure lookup, but it is dereferenced
* unconditionally when a new pagedep is created.
*/
static int
pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
struct mount *mp;
struct buf *bp;
ino_t ino;
ufs_lbn_t lbn;
int flags;
struct pagedep **pagedeppp;
{
struct pagedep *pagedep;
struct pagedep_hashhead *pagedephd;
struct worklist *wk;
struct ufsmount *ump;
int ret;
int i;
ump = VFSTOUFS(mp);
LOCK_OWNED(ump);
/* Fast path: a pagedep already attached to the buffer itself. */
if (bp) {
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
if (wk->wk_type == D_PAGEDEP) {
*pagedeppp = WK_PAGEDEP(wk);
return (1);
}
}
}
pagedephd = PAGEDEP_HASH(ump, ino, lbn);
ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
if (ret) {
/* Found in the hash but on no worklist: attach it to bp. */
if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
return (1);
}
if ((flags & DEPALLOC) == 0)
return (0);
/* Allocate without the lock, then recheck the hash chain. */
FREE_LOCK(ump);
pagedep = malloc(sizeof(struct pagedep),
M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
ACQUIRE_LOCK(ump);
ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
if (*pagedeppp) {
/*
* This should never happen since we only create pagedeps
* with the vnode lock held. Could be an assert.
*/
WORKITEM_FREE(pagedep, D_PAGEDEP);
return (ret);
}
pagedep->pd_ino = ino;
pagedep->pd_lbn = lbn;
LIST_INIT(&pagedep->pd_dirremhd);
LIST_INIT(&pagedep->pd_pendinghd);
for (i = 0; i < DAHASHSZ; i++)
LIST_INIT(&pagedep->pd_diraddhd[i]);
LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
*pagedeppp = pagedep;
/* A newly created entry is reported as "not found". */
return (0);
}
/*
* Structures and routines associated with inodedep caching.
*/
#define INODEDEP_HASH(ump, inum) \
(&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
static int
inodedep_find(inodedephd, inum, inodedeppp)
struct inodedep_hashhead *inodedephd;
ino_t inum;
struct inodedep **inodedeppp;
{
struct inodedep *inodedep;
LIST_FOREACH(inodedep, inodedephd, id_hash)
if (inum == inodedep->id_ino)
break;
if (inodedep) {
*inodedeppp = inodedep;
return (1);
}
*inodedeppp = NULL;
return (0);
}
/*
* Look up an inodedep. Return 1 if found, 0 if not found.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in inodedeppp.
* LOCK_OWNED(ump) is invoked on entry, so the caller is expected to
* hold the per-mount softdep lock; it is released around the
* allocation and held again on return. (An earlier version of this
* comment required "splbio interrupts blocked".)
*/
static int
inodedep_lookup(mp, inum, flags, inodedeppp)
struct mount *mp;
ino_t inum;
int flags;
struct inodedep **inodedeppp;
{
struct inodedep *inodedep;
struct inodedep_hashhead *inodedephd;
struct ufsmount *ump;
struct fs *fs;
ump = VFSTOUFS(mp);
LOCK_OWNED(ump);
fs = ump->um_fs;
inodedephd = INODEDEP_HASH(ump, inum);
if (inodedep_find(inodedephd, inum, inodedeppp))
return (1);
if ((flags & DEPALLOC) == 0)
return (0);
/*
* If the system is over its limit and our filesystem is
* responsible for more than our share of that usage and
* we are not in a rush, request some inodedep cleanup.
*/
/*
* NOTE(review): schedule_cleanup() is presumably expected to release
* the softdep lock, since the other branch does so explicitly and
* the lock is reacquired below -- confirm.
*/
if (softdep_excess_items(ump, D_INODEDEP))
schedule_cleanup(mp);
else
FREE_LOCK(ump);
inodedep = malloc(sizeof(struct inodedep),
M_INODEDEP, M_SOFTDEP_FLAGS);
workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
ACQUIRE_LOCK(ump);
/* Another thread may have created the entry while we were unlocked. */
if (inodedep_find(inodedephd, inum, inodedeppp)) {
WORKITEM_FREE(inodedep, D_INODEDEP);
return (1);
}
/* M_ZERO is not passed to malloc() above; set the fields explicitly. */
inodedep->id_fs = fs;
inodedep->id_ino = inum;
inodedep->id_state = ALLCOMPLETE;
inodedep->id_nlinkdelta = 0;
inodedep->id_savedino1 = NULL;
inodedep->id_savedsize = -1;
inodedep->id_savedextsize = -1;
inodedep->id_savednlink = -1;
inodedep->id_bmsafemap = NULL;
inodedep->id_mkdiradd = NULL;
LIST_INIT(&inodedep->id_dirremhd);
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
TAILQ_INIT(&inodedep->id_inoreflst);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
TAILQ_INIT(&inodedep->id_extupdt);
TAILQ_INIT(&inodedep->id_newextupdt);
TAILQ_INIT(&inodedep->id_freeblklst);
LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
*inodedeppp = inodedep;
/* A newly created entry is reported as "not found". */
return (0);
}
/*
* Structures and routines associated with newblk caching.
*/
#define NEWBLK_HASH(ump, inum) \
(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
static int
newblk_find(newblkhd, newblkno, flags, newblkpp)
struct newblk_hashhead *newblkhd;
ufs2_daddr_t newblkno;
int flags;
struct newblk **newblkpp;
{
struct newblk *newblk;
LIST_FOREACH(newblk, newblkhd, nb_hash) {
if (newblkno != newblk->nb_newblkno)
continue;
/*
* If we're creating a new dependency don't match those that
* have already been converted to allocdirects. This is for
* a frag extend.
*/
if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
continue;
break;
}
if (newblk) {
*newblkpp = newblk;
return (1);
}
*newblkpp = NULL;
return (0);
}
/*
* Look up a newblk. Return 1 if found, 0 if not found.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in newblkpp.
* LOCK_OWNED(ump) is invoked on entry; the lock is released around the
* allocation and held again on return.
*/
static int
newblk_lookup(mp, newblkno, flags, newblkpp)
struct mount *mp;
ufs2_daddr_t newblkno;
int flags;
struct newblk **newblkpp;
{
struct newblk *newblk;
struct newblk_hashhead *newblkhd;
struct ufsmount *ump;
ump = VFSTOUFS(mp);
LOCK_OWNED(ump);
newblkhd = NEWBLK_HASH(ump, newblkno);
if (newblk_find(newblkhd, newblkno, flags, newblkpp))
return (1);
if ((flags & DEPALLOC) == 0)
return (0);
/*
* Request cleanup if any of the three related dependency types is in
* excess. NOTE(review): schedule_cleanup() is presumably expected to
* release the softdep lock, as the other branch does -- confirm.
*/
if (softdep_excess_items(ump, D_NEWBLK) ||
softdep_excess_items(ump, D_ALLOCDIRECT) ||
softdep_excess_items(ump, D_ALLOCINDIR))
schedule_cleanup(mp);
else
FREE_LOCK(ump);
/* Sized as union allblk rather than struct newblk. */
newblk = malloc(sizeof(union allblk), M_NEWBLK,
M_SOFTDEP_FLAGS | M_ZERO);
workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
ACQUIRE_LOCK(ump);
/* Another thread may have created the entry while we were unlocked. */
if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
WORKITEM_FREE(newblk, D_NEWBLK);
return (1);
}
newblk->nb_freefrag = NULL;
LIST_INIT(&newblk->nb_indirdeps);
LIST_INIT(&newblk->nb_newdirblk);
LIST_INIT(&newblk->nb_jwork);
newblk->nb_state = ATTACHED;
newblk->nb_newblkno = newblkno;
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
*newblkpp = newblk;
/* A newly created entry is reported as "not found". */
return (0);
}
/*
* Structures and routines associated with freed indirect block caching.
*/
#define INDIR_HASH(ump, blkno) \
(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
/*
 * Search the indir hash table for a freework describing blkno.
 *
 * On a hit the freework is unhooked with indirblk_remove(), which may
 * also free it, and 1 is returned.  Returns 0 when no entry matches.
 * After a hit the caller must do a blocking journal write before
 * writing to blkno.
 */
static int
indirblk_lookup(struct mount *mp, ufs2_daddr_t blkno)
{
	struct indir_hashhead *bucket;
	struct freework *fw;
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	bucket = INDIR_HASH(ump, blkno);
	TAILQ_FOREACH(fw, bucket, fw_next) {
		if (fw->fw_blkno == blkno) {
			indirblk_remove(fw);
			return (1);
		}
	}
	return (0);
}
/*
 * Record the indirect block described by freework in the indir hash
 * table, and on the newest journal segment, so that the block is not
 * re-used before the journal has been written.  DEPCOMPLETE is cleared
 * on the freework while it is hashed.  Nothing is done when the journal
 * has no segments at all.
 */
static void
indirblk_insert(struct freework *freework)
{
	struct ufsmount *ump;
	struct jseg *newest;

	ump = VFSTOUFS(freework->fw_list.wk_mp);
	newest = TAILQ_LAST(&ump->softdep_jblocks->jb_segs, jseglst);
	if (newest == NULL)
		return;

	LIST_INSERT_HEAD(&newest->js_indirs, freework, fw_segs);
	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
	    fw_next);
	freework->fw_state &= ~DEPCOMPLETE;
}
/*
 * Unhook a freework from the indir hash table and from the journal
 * segment it was attached to.  The item is marked DEPCOMPLETE and is
 * freed as soon as every bit of ALLCOMPLETE is set in its state.
 */
static void
indirblk_remove(struct freework *freework)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(freework->fw_list.wk_mp);
	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
	LIST_REMOVE(freework, fw_segs);
	freework->fw_state |= DEPCOMPLETE;
	if ((freework->fw_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	WORKITEM_FREE(freework, D_FREEWORK);
}
/*
 * One-time setup, run during filesystem initialization before any
 * filesystem is mounted: empties the list of softdep mounts, sizes
 * max_softdeps from desiredvnodes, installs the buffer I/O hooks and
 * prepares the softdep callout.
 */
void
softdep_initialize(void)
{

	TAILQ_INIT(&softdepmounts);

	/* Allow twice as many dependencies per vnode on LP64 platforms. */
#ifdef __LP64__
	max_softdeps = desiredvnodes * 4;
#else
	max_softdeps = desiredvnodes * 2;
#endif

	/* Install the bioops hooks (the "bioops hack"). */
	bioops.io_countdeps = softdep_count_dependencies;
	bioops.io_deallocate = softdep_deallocate_dependencies;
	bioops.io_complete = softdep_disk_write_complete;
	bioops.io_start = softdep_disk_io_initiation;
	softdep_ast_cleanup = softdep_ast_cleanup_proc;

	/* The callout is protected by the global softdep mutex. */
	callout_init_mtx(&softdep_callout, &lk, 0);
}
/*
 * Undo softdep_initialize().  Run at filesystem module unload, after
 * every filesystem has been unmounted: removes the buffer I/O hooks
 * and then drains the softdep callout.
 */
void
softdep_uninitialize(void)
{

	/* Remove the bioops hooks. */
	bioops.io_countdeps = NULL;
	bioops.io_deallocate = NULL;
	bioops.io_complete = NULL;
	bioops.io_start = NULL;
	softdep_ast_cleanup = NULL;

	callout_drain(&softdep_callout);
}
/*
* Called at mount time to notify the dependency code that a
* filesystem wishes to use it.
*
* Allocates the per-mount softdep state, initializes its lock, work
* lists and hash tables, links the mount onto softdepmounts, starts the
* journal when the superblock has FS_SUJ set, and starts the flush
* thread. When compute_summary_at_mount is set and the filesystem is
* not clean, the superblock summary totals are recomputed from the
* cylinder groups.
*
* Returns 0 on success. If journal_mount() or a cylinder group read
* fails, softdep_unmount() is called and the error is returned.
*/
int
softdep_mount(devvp, mp, fs, cred)
struct vnode *devvp;
struct mount *mp;
struct fs *fs;
struct ucred *cred;
{
struct csum_total cstotal;
struct mount_softdeps *sdp;
struct ufsmount *ump;
struct cg *cgp;
struct buf *bp;
u_int cyl, i;
int error;
sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
M_WAITOK | M_ZERO);
/*
* Soft updates take the place of async mounts: clear the async
* flags and mark the mount as using soft dependencies.
*/
MNT_ILOCK(mp);
mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
MNTK_SOFTDEP | MNTK_NOASYNC;
}
ump = VFSTOUFS(mp);
ump->um_softdep = sdp;
MNT_IUNLOCK(mp);
rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
sdp->sd_ump = ump;
/* Start with empty work lists and zeroed counters. */
LIST_INIT(&ump->softdep_workitem_pending);
LIST_INIT(&ump->softdep_journal_pending);
TAILQ_INIT(&ump->softdep_unlinked);
LIST_INIT(&ump->softdep_dirtycg);
ump->softdep_worklist_tail = NULL;
ump->softdep_on_worklist = 0;
ump->softdep_deps = 0;
LIST_INIT(&ump->softdep_mkdirlisthd);
/* Per-mount hash tables, sized from desiredvnodes and max_softdeps. */
ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
&ump->pagedep_hash_size);
ump->pagedep_nextclean = 0;
ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
&ump->inodedep_hash_size);
ump->inodedep_nextclean = 0;
ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK,
&ump->newblk_hash_size);
ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
&ump->bmsafemap_hash_size);
/*
* The indir hash table holds a power-of-two number of chains so that
* indir_hash_size (the chain count minus one) can serve as the mask
* used by INDIR_HASH().
* NOTE(review): ffs() returns the lowest set bit, so the table size
* follows the low-order bit of desiredvnodes / 10 rather than its
* magnitude, and a quotient of zero would shift by -1 -- confirm
* that fls() was not intended.
*/
i = 1 << (ffs(desiredvnodes / 10) - 1);
ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
M_FREEWORK, M_WAITOK);
ump->indir_hash_size = i - 1;
for (i = 0; i <= ump->indir_hash_size; i++)
TAILQ_INIT(&ump->indir_hashtbl[i]);
/* Publish this mount on the global list of softdep mounts. */
ACQUIRE_GBLLOCK(&lk);
TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
FREE_GBLLOCK(&lk);
if ((fs->fs_flags & FS_SUJ) &&
(error = journal_mount(mp, fs, cred)) != 0) {
printf("Failed to start journal: %d\n", error);
softdep_unmount(mp);
return (error);
}
/*
* Start our flushing thread in the bufdaemon process.
*/
ACQUIRE_LOCK(ump);
ump->softdep_flags |= FLUSH_STARTING;
FREE_LOCK(ump);
kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
&ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
mp->mnt_stat.f_mntonname);
/*
* Wait for FLUSH_STARTING to be cleared (presumably by the new
* thread once it is running), rechecking at least every hz / 2 ticks.
*/
ACQUIRE_LOCK(ump);
while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
hz / 2);
}
FREE_LOCK(ump);
/*
* When doing soft updates, the counters in the
* superblock may have gotten out of sync. Recomputation
* can take a long time and can be deferred for background
* fsck. However, the old behavior of scanning the cylinder
* groups and recalculating them at mount time is available
* by setting vfs.ffs.compute_summary_at_mount to one.
*/
if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
return (0);
bzero(&cstotal, sizeof cstotal);
/* Sum the per-cylinder-group counters and refresh fs_cs() as we go. */
for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
fs->fs_cgsize, cred, &bp)) != 0) {
/*
* NOTE(review): bp is released here even though bread()
* failed; confirm this matches the bread() contract of
* this tree.
*/
brelse(bp);
softdep_unmount(mp);
return (error);
}
cgp = (struct cg *)bp->b_data;
cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
fs->fs_cs(fs, cyl) = cgp->cg_cs;
brelse(bp);
}
#ifdef DEBUG
if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
return (0);
}
/*
* Tear down the soft updates state of a mount: clear the softdep and
* journal mount flags, release the journal block map, stop the flush
* thread, unlink the mount from softdepmounts and free the lock, hash
* tables and per-mount structure. Also used by softdep_mount() to
* back out of a partially completed mount.
*/
void
softdep_unmount(mp)
struct mount *mp;
{
struct ufsmount *ump;
#ifdef INVARIANTS
int i;
#endif
KASSERT(MOUNTEDSOFTDEP(mp) != 0,
("softdep_unmount called on non-softdep filesystem"));
ump = VFSTOUFS(mp);
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_SOFTDEP;
/* The journal is torn down only after the mount interlock is dropped. */
if (MOUNTEDSUJ(mp) == 0) {
MNT_IUNLOCK(mp);
} else {
mp->mnt_flag &= ~MNT_SUJ;
MNT_IUNLOCK(mp);
journal_unmount(ump);
}
/*
* Shut down our flushing thread. The NULL check covers the case
* where softdep_mount errors out before the thread has been created.
*/
if (ump->softdep_flushtd != NULL) {
ACQUIRE_LOCK(ump);
ump->softdep_flags |= FLUSH_EXIT;
wakeup(&ump->softdep_flushtd);
/*
* Sleep until woken on softdep_flags (presumably by the exiting
* flush thread). PDROP is passed, so the softdep lock is not
* expected to be held once msleep() returns.
*/
msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
"sdwait", 0);
KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
("Thread shutdown failed"));
}
/*
* Free up our resources.
*/
ACQUIRE_GBLLOCK(&lk);
TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
FREE_GBLLOCK(&lk);
rw_destroy(LOCK_PTR(ump));
hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
ump->bmsafemap_hash_size);
free(ump->indir_hashtbl, M_FREEWORK);
#ifdef INVARIANTS
/* Every dependency type must have been fully released by now. */
for (i = 0; i <= D_LAST; i++)
KASSERT(ump->softdep_curdeps[i] == 0,
("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
TYPENAME(i), ump->softdep_curdeps[i]));
#endif
free(ump->um_softdep, M_MOUNTDATA);
}
/*
 * Allocate a zeroed jblocks structure with an empty segment list and
 * room for an initial ten extents.  Both allocations use M_WAITOK.
 */
static struct jblocks *
jblocks_create(void)
{
	struct jblocks *jb;

	jb = malloc(sizeof(*jb), M_JBLOCKS, M_WAITOK | M_ZERO);
	jb->jb_avail = 10;
	jb->jb_extent = malloc(sizeof(struct jextent) * jb->jb_avail,
	    M_JBLOCKS, M_WAITOK | M_ZERO);
	TAILQ_INIT(&jb->jb_segs);
	return (jb);
}
/*
 * Carve up to `bytes' of contiguous journal space out of the current
 * extent, moving on to the next extent (wrapping to the first one after
 * the last) when the current extent is used up.
 *
 * Returns the address of the first block handed out and stores the
 * number of bytes actually granted in *actual; this is less than
 * requested when the extent has fewer free blocks than asked for.
 */
static ufs2_daddr_t
jblocks_alloc(struct jblocks *jblocks, int bytes, int *actual)
{
	struct jextent *cur;
	ufs2_daddr_t start;
	int room;
	int want;

	want = bytes / DEV_BSIZE;
	cur = &jblocks->jb_extent[jblocks->jb_head];
	room = cur->je_blocks - jblocks->jb_off;
	if (room == 0) {
		/* Current extent exhausted: restart at the next one. */
		jblocks->jb_off = 0;
		jblocks->jb_head++;
		if (jblocks->jb_head > jblocks->jb_used)
			jblocks->jb_head = 0;
		cur = &jblocks->jb_extent[jblocks->jb_head];
		room = cur->je_blocks;
	}
	if (want < room)
		room = want;

	start = cur->je_daddr + jblocks->jb_off;
	*actual = room * DEV_BSIZE;
	jblocks->jb_off += room;
	jblocks->jb_free -= room;
	return (start);
}
/*
 * Give `bytes' of journal space back to the free count and wake any
 * thread sleeping on jblocks.  If the journal is marked suspended the
 * worklist is sped up as well.  The softdep lock must be held.
 */
static void
jblocks_free(struct jblocks *jblocks, struct mount *mp, int bytes)
{

	LOCK_OWNED(VFSTOUFS(mp));
	jblocks->jb_free += bytes / DEV_BSIZE;
	if (jblocks->jb_suspended != 0) {
		worklist_speedup(mp);
	}
	wakeup(jblocks);
}
static void
jblocks_destroy(jblocks)
struct jblocks *jblocks;
{
if (jblocks->jb_extent)
free(jblocks->jb_extent, M_JBLOCKS);
free(jblocks, M_JBLOCKS);
}
/*
 * Account for `blocks' journal blocks starting at daddr.
 *
 * The run is merged into the last extent when it is physically
 * contiguous with it; otherwise a new extent is started, doubling the
 * extent array when it is full.  An extent whose je_daddr is still zero
 * is treated as unused and simply filled in.
 */
static void
jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks)
{
	struct jextent *last;
	struct jextent *grown;

	jblocks->jb_blocks += blocks;
	jblocks->jb_free += blocks;

	last = &jblocks->jb_extent[jblocks->jb_used];
	if (last->je_daddr == 0) {
		/* Very first run: fill in the unused extent. */
		last->je_daddr = daddr;
		last->je_blocks = blocks;
		return;
	}
	if (daddr == last->je_daddr + last->je_blocks) {
		/* Contiguous with the last extent: just lengthen it. */
		last->je_blocks += blocks;
		return;
	}

	/* A new extent is needed; grow the array first if it is full. */
	jblocks->jb_used++;
	if (jblocks->jb_used == jblocks->jb_avail) {
		jblocks->jb_avail *= 2;
		grown = malloc(sizeof(struct jextent) * jblocks->jb_avail,
		    M_JBLOCKS, M_WAITOK | M_ZERO);
		memcpy(grown, jblocks->jb_extent,
		    sizeof(struct jextent) * jblocks->jb_used);
		free(jblocks->jb_extent, M_JBLOCKS);
		jblocks->jb_extent = grown;
	}
	last = &jblocks->jb_extent[jblocks->jb_used];
	last->je_daddr = daddr;
	last->je_blocks = blocks;
}
int
softdep_journal_lookup(mp, vpp)
struct mount *mp;
struct vnode **vpp;
{
struct componentname cnp;
struct vnode *dvp;
ino_t sujournal;
int error;
error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
if (error)
return (error);
bzero(&cnp, sizeof(cnp));
cnp.cn_nameiop = LOOKUP;
cnp.cn_flags = ISLASTCN;
cnp.cn_thread = curthread;
cnp.cn_cred = curthread->td_ucred;
cnp.cn_pnbuf = SUJ_FILE;
cnp.cn_nameptr = SUJ_FILE;
cnp.cn_namelen = strlen(SUJ_FILE);
error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
vput(dvp);
if (error != 0)
return (error);
error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
return (error);
}
/*
* Open and verify the journal file.
*
* Resets the per-mount journal bookkeeping, looks up the journal vnode
* and records the disk address of every whole filesystem block of the
* journal in a newly created jblocks structure. On success the mount
* is flagged MNT_SUJ and the jblocks are attached to the ufsmount.
*
* Returns 0 on success or an errno value; ENOSPC is returned when the
* journal file is smaller than SUJ_MIN. The cred argument is not used
* by this function.
*/
static int
journal_mount(mp, fs, cred)
struct mount *mp;
struct fs *fs;
struct ucred *cred;
{
struct jblocks *jblocks;
struct ufsmount *ump;
struct vnode *vp;
struct inode *ip;
ufs2_daddr_t blkno;
int bcount;
int error;
int i;
ump = VFSTOUFS(mp);
ump->softdep_journal_tail = NULL;
ump->softdep_on_journal = 0;
ump->softdep_accdeps = 0;
ump->softdep_req = 0;
ump->softdep_jblocks = NULL;
error = softdep_journal_lookup(mp, &vp);
if (error != 0) {
printf("Failed to find journal. Use tunefs to create one\n");
return (error);
}
ip = VTOI(vp);
if (ip->i_size < SUJ_MIN) {
error = ENOSPC;
goto out;
}
bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */
jblocks = jblocks_create();
/* Map each logical block of the journal file to its disk address. */
for (i = 0; i < bcount; i++) {
error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
if (error)
break;
jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
}
if (error) {
jblocks_destroy(jblocks);
goto out;
}
jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
ump->softdep_jblocks = jblocks;
/* Reached on success and on failure; vp is always released below. */
out:
if (error == 0) {
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_SUJ;
/*
* NOTE(review): MNT_SOFTDEP is cleared as soon as MNT_SUJ is set;
* presumably journaled mounts are identified by MNT_SUJ alone --
* confirm against MOUNTEDSOFTDEP().
*/
mp->mnt_flag &= ~MNT_SOFTDEP;
MNT_IUNLOCK(mp);
/*
* Only validate the journal contents if the
* filesystem is clean, otherwise we write the logs
* but they'll never be used. If the filesystem was
* still dirty when we mounted it the journal is
* invalid and a new journal can only be valid if it
* starts from a clean mount.
*/
if (fs->fs_clean) {
DIP_SET(ip, i_modrev, fs->fs_mtime);
ip->i_flags |= IN_MODIFIED;
ffs_update(vp, 1);
}
}
vput(vp);
return (error);
}
/*
 * Release the journal block map attached to ump, if there is one, and
 * clear the pointer to it.
 */
static void
journal_unmount(struct ufsmount *ump)
{
	struct jblocks *jb;

	jb = ump->softdep_jblocks;
	if (jb != NULL) {
		jblocks_destroy(jb);
	}
	ump->softdep_jblocks = NULL;
}
/*
 * Queue a journal record that is ready to be written.  The item is
 * appended to the pending journal list; space is allocated and the
 * on-disk entry is created later, when the journal is flushed to
 * stable store.  Panics if the item is already on a worklist.
 * The softdep lock must be held.
 */
static void
add_to_journal(struct worklist *wk)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(wk->wk_mp);
	LOCK_OWNED(ump);
	if ((wk->wk_state & ONWORKLIST) != 0) {
		panic("add_to_journal: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	}
	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
	if (!LIST_EMPTY(&ump->softdep_journal_pending)) {
		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
	} else {
		/* First pending record: note when the list became busy. */
		ump->softdep_jblocks->jb_age = ticks;
		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
	}
	ump->softdep_journal_tail = wk;
	ump->softdep_on_journal++;
}
/*
 * Remove an arbitrary item from the pending journal list while keeping
 * the tail pointer valid.  This happens when a new operation obviates
 * the need to journal an older one.  The softdep lock must be held.
 */
static void
remove_from_journal(struct worklist *wk)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(wk->wk_mp);
	LOCK_OWNED(ump);
#ifdef SUJ_DEBUG
	{
		struct worklist *scan;

		/* Verify that the item really is on the pending list. */
		LIST_FOREACH(scan, &ump->softdep_journal_pending, wk_list) {
			if (scan == wk)
				break;
		}
		if (scan == NULL)
			panic("remove_from_journal: %p is not in journal", wk);
	}
#endif
	/*
	 * The list emulates a TAILQ to save space in the many structures
	 * that do not need TAILQ semantics, so the tail pointer has to be
	 * moved by hand when the tail itself is removed.  Casting le_prev
	 * to a worklist pointer only works because the worklist linkage
	 * sits at the very beginning of the structure.
	 */
	if (wk == ump->softdep_journal_tail) {
		ump->softdep_journal_tail =
		    (struct worklist *)wk->wk_list.le_prev;
	}
	WORKLIST_REMOVE(wk);
	ump->softdep_on_journal--;
}
/*
 * Report whether the filesystem may keep creating dependencies.
 * Journal space and the global dependency limit are both checked so
 * that the prelink code can throttle journaled and non-journaled
 * filesystems alike.
 *
 * thresh selects the journal threshold: 0 compares against jb_low,
 * non-zero against jb_min.  Returns 1 when there is enough room (always
 * the case when no journal is attached) and 0 otherwise.
 */
static int
journal_space(struct ufsmount *ump, int thresh)
{
	struct jblocks *jb;
	int avail;
	int limit;
	int reserve;

	jb = ump->softdep_jblocks;
	if (jb == NULL)
		return (1);

	/*
	 * The restriction used here is tighter than elsewhere so that
	 * request_cleanup() running in other threads does not run into
	 * locks we currently hold.  We must be over the limit and this
	 * filesystem must account for more than its share of the usage.
	 */
	limit = (max_softdeps / 10) * 9;
	if (dep_current[D_INODEDEP] > limit &&
	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
		return (0);

	reserve = thresh ? jb->jb_min : jb->jb_low;

	/* Blocks that the records still waiting to be written will need. */
	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
	avail = jb->jb_free - avail;
	return (avail > reserve);
}
/*
 * Mark the journal as suspended.  Under the mount interlock, the mount
 * is flagged MNTK_SUSPEND with the flush thread recorded as suspension
 * owner, unless it is already suspended.
 */
static void
journal_suspend(struct ufsmount *ump)
{
	struct jblocks *jb;
	struct mount *mp;

	jb = ump->softdep_jblocks;
	mp = UFSTOVFS(ump);
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
		mp->mnt_kern_flag |= MNTK_SUSPEND;
		mp->mnt_susp_owner = ump->softdep_flushtd;
		stat_journal_min++;
	}
	jb->jb_suspended = 1;
	MNT_IUNLOCK(mp);
}
/*
 * Resume writes on a filesystem whose journal was marked suspended,
 * once the journal is back above its minimum-space threshold.
 *
 * Called with the softdep lock held; the lock is dropped around
 * vfs_write_resume() and re-acquired before returning.
 * Returns 1 if the filesystem was resumed and 0 if nothing was done.
 */
static int
journal_unsuspend(struct ufsmount *ump)
{
	struct jblocks *jblocks;
	struct mount *mp;

	mp = UFSTOVFS(ump);
	jblocks = ump->softdep_jblocks;
	/*
	 * journal_space() takes a selector (0 for the low threshold,
	 * non-zero for the minimum), not a block count.  Ask for the
	 * minimum threshold explicitly instead of passing jb_min, which
	 * would silently select the low threshold if it were ever zero.
	 */
	if (jblocks == NULL || jblocks->jb_suspended == 0 ||
	    journal_space(ump, 1) == 0)
		return (0);

	jblocks->jb_suspended = 0;
	FREE_LOCK(ump);
	/* Take over ownership of the suspension before resuming writes. */
	mp->mnt_susp_owner = curthread;
	vfs_write_resume(mp, 0);
	ACQUIRE_LOCK(ump);
	return (1);
}
/*
 * Called before any allocation function to be certain that there is
 * sufficient space in the journal prior to creating any new records.
 *
 * Block allocation may hold several locked buffers at the time the
 * journal records are created, so we cannot block at that point: doing
 * so would deadlock if one of those buffers had to be flushed to
 * reclaim space.  Instead a generous reserve is required up front, so
 * that every thread that has passed this check can still create its
 * records.
 *
 * Returns ENOSPC when journal space is low and waitok is MNT_NOWAIT,
 * and 0 in every other case.
 */
int
softdep_prealloc(struct vnode *vp, int waitok)
{
	struct ufsmount *ump;
	int enough;

	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
	    ("softdep_prealloc called on non-softdep filesystem"));
	/*
	 * Nothing to do unless journaled soft updates are running.
	 * Snapshot vnodes are skipped because holding the snapshot lock
	 * while handling other resources could deadlock.  System vnodes
	 * such as the quota files are skipped because they are typically
	 * recursed on with other vnode locks held.
	 */
	if (DOINGSUJ(vp) == 0)
		return (0);
	if (IS_SNAPSHOT(VTOI(vp)) || (vp->v_vflag & VV_SYSTEM) != 0)
		return (0);

	ump = VFSTOUFS(vp->v_mount);
	ACQUIRE_LOCK(ump);
	enough = journal_space(ump, 0);
	if (!enough)
		stat_journal_low++;
	FREE_LOCK(ump);
	if (enough)
		return (0);
	if (waitok == MNT_NOWAIT)
		return (ENOSPC);

	/*
	 * Sync this vnode once to flush any journal work attached to it,
	 * unless a copy-on-write is in progress in this thread.
	 */
	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
		ffs_syncvnode(vp, waitok, 0);

	ACQUIRE_LOCK(ump);
	process_removes(vp);
	process_truncates(vp);
	if (journal_space(ump, 0) == 0) {
		/* Still low: push the flusher and suspend if critical. */
		softdep_speedup(ump);
		if (journal_space(ump, 1) == 0)
			journal_suspend(ump);
	}
	FREE_LOCK(ump);
	return (0);
}
/*
* Before adjusting a link count on a vnode verify that we have sufficient
* journal space. If not, process operations that depend on the currently
* locked pair of vnodes to try to flush space as the syncer, buf daemon,
* and softdep flush threads can not acquire these locks to reclaim space.
*/
static void
softdep_prelink(dvp, vp)
struct vnode *dvp;
struct vnode *vp;
{
struct ufsmount *ump;
ump = VFSTOUFS(dvp->v_mount);
LOCK_OWNED(ump);
/*
* Nothing to do if we have sufficient journal space.
* If we currently hold the snapshot lock, we must avoid
* handling other resources that could cause deadlock.
*/
if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
return;
stat_journal_low++;
FREE_LOCK(ump);
if (vp)
ffs_syncvnode(vp, MNT_NOWAIT, 0);
ffs_syncvnode(dvp, MNT_WAIT, 0);
ACQUIRE_LOCK(ump);
/* Process vp before dvp as it may create .. removes. */
if (vp) {
process_removes(vp);
process_truncates(vp);
}
process_removes(dvp);
process_truncates(dvp);
softdep_speedup(ump);
process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
if (journal_space(ump, 0) == 0) {
softdep_speedup(ump);
if (journal_space(ump, 1) == 0)
journal_suspend(ump);
}
}
/*
 * Fill in the segment header record at `data' from jseg: sequence
 * numbers, record count, segment size expressed in device blocks and
 * the superblock modification time.  The CRC field is stored as zero.
 */
static void
jseg_write(struct ufsmount *ump, struct jseg *jseg, uint8_t *data)
{
	struct jsegrec *hdr;

	hdr = (struct jsegrec *)data;
	hdr->jsr_seq = jseg->js_seq;
	hdr->jsr_oldest = jseg->js_oldseq;
	hdr->jsr_cnt = jseg->js_cnt;
	hdr->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
	hdr->jsr_time = ump->um_fs->fs_mtime;
	hdr->jsr_crc = 0;
}
/*
 * Shared part of writing an add/remove reference record: tie the
 * inoref's jsegdep to the segment being written and copy the inode
 * reference fields into rec.  The caller sets the opcode.
 */
static inline void
inoref_write(struct inoref *inoref, struct jseg *jseg, struct jrefrec *rec)
{

	rec->jr_ino = inoref->if_ino;
	rec->jr_parent = inoref->if_parent;
	rec->jr_diroff = inoref->if_diroff;
	rec->jr_nlink = inoref->if_nlink;
	rec->jr_mode = inoref->if_mode;
	inoref->if_jsegdep->jd_seg = jseg;
}
/*
 * Emit a JOP_ADDREF record for jaddref into the buffer at `data'.
 */
static void
jaddref_write(struct jaddref *jaddref, struct jseg *jseg, uint8_t *data)
{
	struct jrefrec *ref;

	ref = (struct jrefrec *)data;
	ref->jr_op = JOP_ADDREF;
	inoref_write(&jaddref->ja_ref, jseg, ref);
}
/*
 * Emit a JOP_REMREF record for jremref into the buffer at `data'.
 */
static void
jremref_write(struct jremref *jremref, struct jseg *jseg, uint8_t *data)
{
	struct jrefrec *ref;

	ref = (struct jrefrec *)data;
	ref->jr_op = JOP_REMREF;
	inoref_write(&jremref->jr_ref, jseg, ref);
}
/*
 * Emit a JOP_MVREF record describing the move of a directory entry
 * from jm_oldoff to jm_newoff.  The jseg argument is not used.
 */
static void
jmvref_write(struct jmvref *jmvref, struct jseg *jseg, uint8_t *data)
{
	struct jmvrec *mv;

	mv = (struct jmvrec *)data;
	mv->jm_op = JOP_MVREF;
	mv->jm_parent = jmvref->jm_parent;
	mv->jm_ino = jmvref->jm_ino;
	mv->jm_newoff = jmvref->jm_newoff;
	mv->jm_oldoff = jmvref->jm_oldoff;
}
/*
 * Emit a JOP_NEWBLK record for a block allocation and tie the
 * jnewblk's jsegdep to the segment being written.
 */
static void
jnewblk_write(struct jnewblk *jnewblk, struct jseg *jseg, uint8_t *data)
{
	struct jblkrec *blk;

	blk = (struct jblkrec *)data;
	jnewblk->jn_jsegdep->jd_seg = jseg;
	blk->jb_op = JOP_NEWBLK;
	blk->jb_ino = jnewblk->jn_ino;
	blk->jb_lbn = jnewblk->jn_lbn;
	blk->jb_blkno = jnewblk->jn_blkno;
	blk->jb_oldfrags = jnewblk->jn_oldfrags;
	blk->jb_frags = jnewblk->jn_frags;
}
/*
 * Emit a JOP_FREEBLK record for a block being freed and tie its
 * jsegdep to the segment being written.  jb_oldfrags is always zero.
 */
static void
jfreeblk_write(struct jfreeblk *jfreeblk, struct jseg *jseg, uint8_t *data)
{
	struct jblkrec *blk;

	blk = (struct jblkrec *)data;
	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
	blk->jb_op = JOP_FREEBLK;
	blk->jb_ino = jfreeblk->jf_ino;
	blk->jb_lbn = jfreeblk->jf_lbn;
	blk->jb_blkno = jfreeblk->jf_blkno;
	blk->jb_oldfrags = 0;
	blk->jb_frags = jfreeblk->jf_frags;
}
/*
 * Emit the journal record for a freed fragment.  It uses the same
 * JOP_FREEBLK opcode as a full block free, with jb_oldfrags zero, and
 * ties the jfreefrag's jsegdep to the segment being written.
 */
static void
jfreefrag_write(struct jfreefrag *jfreefrag, struct jseg *jseg, uint8_t *data)
{
	struct jblkrec *blk;

	blk = (struct jblkrec *)data;
	jfreefrag->fr_jsegdep->jd_seg = jseg;
	blk->jb_op = JOP_FREEBLK;
	blk->jb_ino = jfreefrag->fr_ino;
	blk->jb_lbn = jfreefrag->fr_lbn;
	blk->jb_blkno = jfreefrag->fr_blkno;
	blk->jb_oldfrags = 0;
	blk->jb_frags = jfreefrag->fr_frags;
}
/*
 * Emit a JOP_TRUNC record carrying the inode number, size and extended
 * attribute size from jtrunc, and tie its jsegdep to the segment being
 * written.
 */
static void
jtrunc_write(struct jtrunc *jtrunc, struct jseg *jseg, uint8_t *data)
{
	struct jtrncrec *trn;

	trn = (struct jtrncrec *)data;
	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
	trn->jt_op = JOP_TRUNC;
	trn->jt_ino = jtrunc->jt_ino;
	trn->jt_extsize = jtrunc->jt_extsize;
	trn->jt_size = jtrunc->jt_size;
}
/*
 * Emit a JOP_SYNC record carrying the inode number, size and extended
 * attribute size from jfsync.  The jseg argument is not used.
 */
static void
jfsync_write(struct jfsync *jfsync, struct jseg *jseg, uint8_t *data)
{
	struct jtrncrec *trn;

	trn = (struct jtrncrec *)data;
	trn->jt_op = JOP_SYNC;
	trn->jt_ino = jfsync->jfs_ino;
	trn->jt_extsize = jfsync->jfs_extsize;
	trn->jt_size = jfsync->jfs_size;
}
/*
 * Write out every pending journal record of a journaled mount: keep
 * requesting a segment write until no records remain on the journal.
 * Does nothing when the mount is not using a journal.
 */
static void
softdep_flushjournal(struct mount *mp)
{
	struct ufsmount *ump;
	struct jblocks *jb;

	if (MOUNTEDSUJ(mp) == 0)
		return;

	ump = VFSTOUFS(mp);
	jb = ump->softdep_jblocks;
	ACQUIRE_LOCK(ump);
	while (ump->softdep_on_journal != 0) {
		/* Force a segment out even if it is only partly filled. */
		jb->jb_needseg = 1;
		softdep_process_journal(mp, NULL, MNT_WAIT);
	}
	FREE_LOCK(ump);
}
/*
* Prototypes for the BIO_FLUSH helpers: the routine that issues the
* flush request and the completion handler installed as its bio_done.
*/
static void softdep_synchronize_completed(struct bio *);
static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
/*
 * Completion handler for the flush request issued by
 * softdep_synchronize().  bio_caller1 names the last journal segment
 * written before the flush was issued; that segment and every earlier
 * one not yet marked COMPLETE are marked so, and deferred journal
 * processing is restarted from the oldest of them.  The bio is always
 * destroyed.
 */
static void
softdep_synchronize_completed(struct bio *bp)
{
	struct ufsmount *ump;
	struct jseg *cur;
	struct jseg *oldest;

	cur = bp->bio_caller1;
	if (cur == NULL) {
		/* No segment was waiting on this flush. */
		g_destroy_bio(bp);
		return;
	}

	ump = VFSTOUFS(cur->js_list.wk_mp);
	ACQUIRE_LOCK(ump);
	/*
	 * Walk backwards from the marked segment, completing every
	 * segment that was waiting on the synchronize cache.
	 */
	for (oldest = NULL;
	    cur != NULL && (cur->js_state & COMPLETE) == 0;
	    cur = TAILQ_PREV(cur, jseglst, js_next)) {
		cur->js_state |= COMPLETE;
		oldest = cur;
	}
	if (oldest != NULL)
		complete_jsegs(oldest);
	FREE_LOCK(ump);
	g_destroy_bio(bp);
}
/*
 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write
 * ordering barriers.  The journal must be written prior to any blocks
 * that depend on it, and the journal cannot be released until those
 * blocks have been written.  This code handles both barriers
 * simultaneously.
 *
 * caller1 is handed to softdep_synchronize_completed() through
 * bio_caller1.
 */
static void
softdep_synchronize(struct bio *bp, struct ufsmount *ump, void *caller1)
{
	struct g_consumer *cp;

	cp = (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private;

	/* An ordered flush carrying no data, placed at the media size. */
	bp->bio_cmd = BIO_FLUSH;
	bp->bio_flags |= BIO_ORDERED;
	bp->bio_offset = ump->um_cp->provider->mediasize;
	bp->bio_length = 0;
	bp->bio_data = NULL;
	bp->bio_caller1 = caller1;
	bp->bio_done = softdep_synchronize_completed;
	g_io_request(bp, cp);
}
/*
* Flush some journal records to disk.
*/
static void
softdep_process_journal(mp, needwk, flags)
struct mount *mp;
struct worklist *needwk;
int flags;
{
struct jblocks *jblocks;
struct ufsmount *ump;
struct worklist *wk;
struct jseg *jseg;
struct buf *bp;
struct bio *bio;
uint8_t *data;
struct fs *fs;
int shouldflush;
int segwritten;
int jrecmin; /* Minimum records per block. */
int jrecmax; /* Maximum records per block. */
int size;
int cnt;
int off;
int devbsize;
if (MOUNTEDSUJ(mp) == 0)
return;
shouldflush = softdep_flushcache;
bio = NULL;
jseg = NULL;
ump = VFSTOUFS(mp);
LOCK_OWNED(ump);
fs = ump->um_fs;
jblocks = ump->softdep_jblocks;
devbsize = ump->um_devvp->v_bufobj.bo_bsize;
/*
* We write anywhere between a disk block and fs block. The upper
* bound is picked to prevent buffer cache fragmentation and limit
* processing time per I/O.
*/
jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
segwritten = 0;
for (;;) {
cnt = ump->softdep_on_journal;
/*
* Criteria for writing a segment:
* 1) We have a full block.
* 2) We're called from jwait() and haven't found the
* journal item yet.
* 3) Always write if needseg is set.
* 4) If we are called from process_worklist and have
* not yet written anything we write a partial block
* to enforce a 1 second maximum latency on journal
* entries.
*/
if (cnt < (jrecmax - 1) && needwk == NULL &&
jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
break;
cnt++;
/*
* Verify some free journal space. softdep_prealloc() should
* guarantee that we don't run out so this is indicative of
* a problem with the flow control. Try to recover
* gracefully in any event.
*/
while (jblocks->jb_free == 0) {
if (flags != MNT_WAIT)
break;
printf("softdep: Out of journal space!\n");
softdep_speedup(ump);
msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
}
FREE_LOCK(ump);
jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
workitem_alloc(&jseg->js_list, D_JSEG, mp);
LIST_INIT(&jseg->js_entries);
LIST_INIT(&jseg->js_indirs);
jseg->js_state = ATTACHED;
if (shouldflush == 0)
jseg->js_state |= COMPLETE;
else if (bio == NULL)
bio = g_alloc_bio();
jseg->js_jblocks = jblocks;
bp = geteblk(fs->fs_bsize, 0);
ACQUIRE_LOCK(ump);
/*
* If there was a race while we were allocating the block
* and jseg the entry we care about was likely written.
* We bail out in both the WAIT and NOWAIT case and assume
* the caller will loop if the entry it cares about is
* not written.
*/
cnt = ump->softdep_on_journal;
if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
bp->b_flags |= B_INVAL | B_NOCACHE;
WORKITEM_FREE(jseg, D_JSEG);
FREE_LOCK(ump);
brelse(bp);
ACQUIRE_LOCK(ump);
break;
}
/*
* Calculate the disk block size required for the available
* records rounded to the min size.
*/
if (cnt == 0)
size = devbsize;
else if (cnt < jrecmax)
size = howmany(cnt, jrecmin) * devbsize;
else
size = fs->fs_bsize;
/*
* Allocate a disk block for this journal data and account
* for truncation of the requested size if enough contiguous
* space was not available.
*/
bp->b_blkno = jblocks_alloc(jblocks, size, &size);
bp->b_lblkno = bp->b_blkno;
bp->b_offset = bp->b_blkno * DEV_BSIZE;
bp->b_bcount = size;
bp->b_flags &= ~B_INVAL;
bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
/*
* Initialize our jseg with cnt records. Assign the next
* sequence number to it and link it in-order.
*/
cnt = MIN(cnt, (size / devbsize) * jrecmin);
jseg->js_buf = bp;
jseg->js_cnt = cnt;
jseg->js_refs = cnt + 1; /* Self ref. */
jseg->js_size = size;
jseg->js_seq = jblocks->jb_nextseq++;
if (jblocks->jb_oldestseg == NULL)
jblocks->jb_oldestseg = jseg;
jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
if (jblocks->jb_writeseg == NULL)
jblocks->jb_writeseg = jseg;
/*
* Start filling in records from the pending list.
*/
data = bp->b_data;
off = 0;
/*
* Always put a header on the first block.
* XXX As with below, there might not be a chance to get
* into the loop. Ensure that something valid is written.
*/
jseg_write(ump, jseg, data);
off += JREC_SIZE;
data = bp->b_data + off;
/*
* XXX Something is wrong here. There's no work to do,
* but we need to perform an I/O and allow it to complete
* anyways.
*/
if (LIST_EMPTY(&ump->softdep_journal_pending))
stat_emptyjblocks++;
while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
!= NULL) {
if (cnt == 0)
break;
/* Place a segment header on every device block. */
if ((off % devbsize) == 0
|