These changes appear to give us benefits with both small (32MB) and
large (1G) memory machine configurations.  I was able to run 'dbench 32'
on a 32MB system without bringing the machine to a grinding halt.

    * buffer cache hash table now dynamically allocated (see the first
      sketch after this list).  This will have no effect on memory
      consumption for smaller systems and will help scale the buffer
      cache for larger systems.

    * minor enhancement to pmap_clearbit().  I noticed that
      all the calls to it used constant arguments.  Making
      it an inline allows the constants to propagate to
      deeper inlines and should produce better code.

    * removal of always-compiled vfs_ioopt support by placing the
      relevant code under #ifdef ENABLE_VFS_IOOPT, with John's
      permission.  If we do not find a use for it by the end of the
      year we will remove it entirely.

    * removal of getnewbufloops* counters & sysctl's - no longer
      necessary for debugging now that getnewbuf() is optimal.

    * buffer hash table functions removed from sys/buf.h and localized
      to vfs_bio.c

    * VFS_BIO_NEED_DIRTYFLUSH flag and support code added
      ( bwillwrite() ), allowing processes to block when too many dirty
      buffers are present in the system (see the bwillwrite() sketch
      after this list).

    * removal of a softdep test in bdwrite() that is no longer necessary
      now that bdwrite() no longer attempts to flush dirty buffers.

    * slight optimization added to bqrelse() - there is no reason
      to test for available buffer space on B_DELWRI buffers.

    * addition of reverse-scanning code to vfs_bio_awrite().
      vfs_bio_awrite() will attempt to locate clusterable areas
      in both the forward and reverse direction relative to the
      offset of the buffer passed to it (see the scan sketch after
      this list).  This will probably not make much of a difference
      now, but I believe we will start to rely on it heavily in the
      future if we decide to shift some of the burden of the
      clustering closer to the actual I/O initiation.

    * Removal of the newbufcnt and lastnewbuf counters that Kirk
      added.  They do not fix any race conditions that haven't already
      been fixed by the gbincore() test done after the only call
      to getnewbuf().  getnewbuf() is a static, so there is no chance
      of it being misused by other modules.  ( Unless Kirk can think
      of a specific thing that this code fixes.  I went through it
      very carefully and didn't see anything ).

    * removal of VOP_ISLOCKED() check in flushbufqueues().  I do not
      think this check is necessary; the buffer should flush properly
      whether the vnode is locked or not. ( yes? ).

    * removal of extra arguments passed to getnewbuf() that are not
      necessary.

    * fixed a missed cluster_wbuild() call in vfs_cluster.c that had to
      become a cluster_wbuild_wb() call.

    * vn_write() now calls bwillwrite() *PRIOR* to locking the vnode
      (see the vn_write() sketch after this list), which should greatly
      aid flushing operations in heavy load situations - both the
      pageout and update daemons will be able to operate more
      efficiently.

    * removal of b_usecount.  We may add it back in later but for now
      it is useless.  Prior implementations of the buffer cache never
      had enough buffers for it to be useful, and current implementations
      which make more buffers available might not benefit relative to
      the amount of sophistication required to implement a b_usecount.
      Straight LRU should work just as well, especially when most things
      are VMIO backed.  I expect that (even though John will not like
      this assumption) directories will become VMIO backed at some point
      soon.
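
The sketches below are condensed from the hunks in the diff that follows,
purely for illustration; they are not additional changes.  First, the
dynamically sized hash table and the bufhash() inline that replaces the
old BUFHASH() macro from sys/buf.h (the nbuf extern is assumed to be
visible, as it is in vfs_bio.c):

static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
static int bufhashmask;

/*
 * Size the table to roughly nbuf/4 heads, rounded up to a power of two
 * (minimum 8), carving the space out of the KVA handed in by machdep.c.
 * Small systems pay nothing extra; large systems get a bigger table.
 */
vm_offset_t
bufhashinit(vm_offset_t vaddr)
{
	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
		;
	bufhashtbl = (void *)vaddr;
	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
	--bufhashmask;
	return (vaddr);
}

/*
 * The hash function, now private to vfs_bio.c.  The logical block number
 * scans linearly, which gives us some L1 cache locality.
 */
static __inline struct bufhashhdr *
bufhash(struct vnode *vnp, daddr_t bn)
{
	return (&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
}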
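
Next, the dirty-buffer throttle, condensed from the vfs_bio.c hunk below;
VFS_BIO_NEED_DIRTYFLUSH reuses the previously unused 0x02 need flag:

/*
 * bwillwrite:
 *
 *	Called before a process that expects to write locks any vnodes.
 *	If far too many dirty buffers have accumulated, poke the buf
 *	daemon and sleep until numdirtybuffers drops below hidirtybuffers,
 *	so a locked vnode never stands between the daemons and the
 *	buffers they need to flush.
 */
void
bwillwrite(void)
{
	int twenty = (hidirtybuffers - lodirtybuffers) / 5;	/* 20% slop */

	if (numdirtybuffers > hidirtybuffers + twenty) {
		int s;

		s = splbio();
		while (numdirtybuffers > hidirtybuffers) {
			bd_wakeup(hidirtybuffers);
			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
		}
		splx(s);
	}
}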
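
The bidirectional scan added to vfs_bio_awrite() reduces, schematically,
to the loops below.  clusterable() is a hypothetical shorthand for the
B_DELWRI/B_CLUSTEROK/B_INVAL, lock, size, and block-contiguity tests that
the real code spells out inline for each neighboring buffer:

	/* scan forward from the buffer's logical block ... */
	for (i = 1; i < maxcl; i++)
		if (!clusterable(vp, bp, lblkno + i, size))	/* hypothetical helper */
			break;
	/* ... then backward, without exceeding maxcl buffers total */
	for (j = 1; i + j <= maxcl && j <= lblkno; j++)
		if (!clusterable(vp, bp, lblkno - j, size))
			break;
	--j;
	ncl = i + j;
	/* build the cluster starting at the lowest clusterable block found */
	if (ncl != 1)
		nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);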
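
Finally, the ordering change in vn_write(): the throttle runs before the
vnode is locked, so any wait for the dirty-buffer count to drop happens
without a vnode lock held.  The ioflag setup, locking, and the VOP_WRITE()
call are elided here and follow unchanged:

static int
vn_write(fp, uio, cred, flags)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
{
	struct vnode *vp;
	struct proc *p = uio->uio_procp;
	int error, ioflag;

	vp = (struct vnode *)fp->f_data;
	if (vp->v_type == VREG)
		bwillwrite();		/* may sleep; no vnode lock held yet */

	/* ... ioflag setup, vn_lock() and VOP_WRITE() as before ... */
}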

Submitted by:	Matthew Dillon <dillon@backplane.com>
Reviewed by:	Kirk McKusick <mckusick@mckusick.com>
Kirk McKusick 1999-07-08 06:06:00 +00:00
parent bedf427650
commit ad8ac923fa
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=48677
14 changed files with 267 additions and 232 deletions


@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: machdep.c,v 1.46 1999/07/05 08:52:40 msmith Exp $
* $Id: machdep.c,v 1.47 1999/07/06 17:48:16 peter Exp $
*/
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
@ -354,7 +354,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory


@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $
* $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $
*/
#include "apm.h"
@ -355,7 +355,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory


@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
* $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $
* $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $
*/
/*
@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem)
/*
* pmap_clearbit:
*
* Clear a bit/bits in every pte mapping a given physical page.
* Clear a bit/bits in every pte mapping a given physical page. Making
* this inline allows the pmap_changebit inline to be well optimized.
*/
static void
static __inline void
pmap_clearbit(
vm_offset_t pa,
int bit)


@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* $Id: machdep.c,v 1.352 1999/07/05 08:52:49 msmith Exp $
* $Id: machdep.c,v 1.353 1999/07/06 07:13:33 cracauer Exp $
*/
#include "apm.h"
@ -355,7 +355,7 @@ again:
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory


@ -39,7 +39,7 @@
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
* $Id: pmap.c,v 1.239 1999/06/08 17:14:22 dt Exp $
* $Id: pmap.c,v 1.240 1999/06/23 21:47:21 luoqi Exp $
*/
/*
@ -3105,9 +3105,10 @@ pmap_changebit(pa, bit, setem)
/*
* pmap_clearbit:
*
* Clear a bit/bits in every pte mapping a given physical page.
* Clear a bit/bits in every pte mapping a given physical page. Making
* this inline allows the pmap_changebit inline to be well optimized.
*/
static void
static __inline void
pmap_clearbit(
vm_offset_t pa,
int bit)


@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
* $Id: kern_subr.c,v 1.27 1999/02/22 18:39:49 bde Exp $
* $Id: kern_subr.c,v 1.28 1999/03/12 03:09:29 julian Exp $
*/
#include <sys/param.h>
@ -156,6 +156,7 @@ uiomoveco(cp, n, uio, obj)
if (ticks - switchticks >= hogticks)
uio_yield();
if (uio->uio_rw == UIO_READ) {
#ifdef ENABLE_VFS_IOOPT
if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) &&
((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
((uio->uio_offset & PAGE_MASK) == 0) &&
@ -163,7 +164,9 @@ uiomoveco(cp, n, uio, obj)
error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
uio->uio_offset, cnt,
(vm_offset_t) iov->iov_base, NULL);
} else {
} else
#endif
{
error = copyout(cp, iov->iov_base, cnt);
}
} else {
@ -192,6 +195,8 @@ uiomoveco(cp, n, uio, obj)
return (0);
}
#ifdef ENABLE_VFS_IOOPT
int
uioread(n, uio, obj, nread)
int n;
@ -258,6 +263,8 @@ uioread(n, uio, obj, nread)
return error;
}
#endif
/*
* Give next character to user as result of read.
*/


@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
* $Id: vfs_bio.c,v 1.219 1999/06/29 05:59:41 peter Exp $
* $Id: vfs_bio.c,v 1.220 1999/07/04 00:25:27 mckusick Exp $
*/
/*
@ -90,14 +90,11 @@ static int bufspace, maxbufspace, vmiospace,
#if 0
static int maxvmiobufspace;
#endif
static int maxbdrun;
static int needsbuffer;
static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
static int getnewbufloops;
static int getnewbufloops1;
static int getnewbufloops2;
static int getnewbufloops3;
static int getnewbufrestarts;
static int kvafreespace;
@ -121,6 +118,8 @@ SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
&hibufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
&maxbdrun, 0, "");
#if 0
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
&maxvmiobufspace, 0, "");
@ -135,18 +134,12 @@ SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
&kvafreespace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
&getnewbufcalls, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops, CTLFLAG_RW,
&getnewbufloops, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops1, CTLFLAG_RW,
&getnewbufloops1, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops2, CTLFLAG_RW,
&getnewbufloops2, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops3, CTLFLAG_RW,
&getnewbufloops3, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
&getnewbufrestarts, 0, "");
static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
static int bufhashmask;
static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
char *buf_wmesg = BUF_WMESG;
@ -155,11 +148,23 @@ extern int vm_swap_size;
#define BUF_MAXUSE 24
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
#define VFS_BIO_NEED_RESERVED02 0x02 /* unused */
#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
#define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */
/*
* Buffer hash table code. Note that the logical block scans linearly, which
* gives us some L1 cache locality.
*/
static __inline
struct bufhashhdr *
bufhash(struct vnode *vnp, daddr_t bn)
{
return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
}
/*
* kvaspacewakeup:
*
@ -184,6 +189,24 @@ kvaspacewakeup(void)
}
}
/*
* numdirtywakeup:
*
* If someone is blocked due to there being too many dirty buffers,
* and numdirtybuffers is now reasonable, wake them up.
*/
static __inline void
numdirtywakeup(void)
{
if (numdirtybuffers < hidirtybuffers) {
if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
wakeup(&needsbuffer);
}
}
}
/*
* bufspacewakeup:
*
@ -260,10 +283,23 @@ bd_wakeup(int dirtybuflevel)
/*
* Initialize buffer headers and related structures.
* Initialize buffer headers and related structures.
*/
vm_offset_t
bufhashinit(vm_offset_t vaddr)
{
/* first, make a null hash table */
for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
;
bufhashtbl = (void *)vaddr;
vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
--bufhashmask;
return(vaddr);
}
void
bufinit()
bufinit(void)
{
struct buf *bp;
int i;
@ -272,8 +308,7 @@ bufinit()
LIST_INIT(&invalhash);
simple_lock_init(&buftimelock);
/* first, make a null hash table */
for (i = 0; i < BUFHSZ; i++)
for (i = 0; i <= bufhashmask; i++)
LIST_INIT(&bufhashtbl[i]);
/* next, make a null set of free lists */
@ -329,8 +364,8 @@ bufinit()
* Reduce the chance of a deadlock occuring by limiting the number
* of delayed-write dirty buffers we allow to stack up.
*/
lodirtybuffers = nbuf / 6 + 10;
hidirtybuffers = nbuf / 3 + 20;
lodirtybuffers = nbuf / 7 + 10;
hidirtybuffers = nbuf / 4 + 20;
numdirtybuffers = 0;
/*
@ -341,6 +376,15 @@ bufinit()
hifreebuffers = 2 * lofreebuffers;
numfreebuffers = nbuf;
/*
* Maximum number of async ops initiated per buf_daemon loop. This is
* somewhat of a hack at the moment, we really need to limit ourselves
* based on the number of bytes of I/O in-transit that were initiated
* from buf_daemon.
*/
if ((maxbdrun = nswbuf / 4) < 4)
maxbdrun = 4;
kvafreespace = 0;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
@ -383,19 +427,14 @@ bremfree(struct buf * bp)
if (bp->b_qindex == QUEUE_EMPTYKVA) {
kvafreespace -= bp->b_kvasize;
}
if (BUF_REFCNT(bp) == 1)
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
else if (BUF_REFCNT(bp) == 0)
panic("bremfree: not locked");
else
/* Temporary panic to verify exclusive locking */
/* This panic goes away when we allow shared refs */
panic("bremfree: multiple refs");
KASSERT(BUF_REFCNT(bp) == 0, ("bremfree: bp %p not locked",bp));
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
runningbufspace += bp->b_bufsize;
} else {
#if !defined(MAX_PERF)
panic("bremfree: removing a buffer when not on a queue");
if (BUF_REFCNT(bp) <= 1)
panic("bremfree: removing a buffer not on a queue");
#endif
}
@ -599,7 +638,9 @@ bwrite(struct buf * bp)
void
bdwrite(struct buf * bp)
{
#if 0
struct vnode *vp;
#endif
#if !defined(MAX_PERF)
if (BUF_REFCNT(bp) == 0)
@ -653,6 +694,11 @@ bdwrite(struct buf * bp)
bd_wakeup(hidirtybuffers);
/*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
* due to the softdep code.
*/
#if 0
/*
* XXX The soft dependency code is not prepared to
* have I/O done when a bdwrite is requested. For
@ -664,6 +710,7 @@ bdwrite(struct buf * bp)
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
return;
#endif
}
/*
@ -722,6 +769,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
numdirtywakeup();
}
}
@ -756,6 +804,34 @@ bowrite(struct buf * bp)
return (VOP_BWRITE(bp->b_vp, bp));
}
/*
* bwillwrite:
*
* Called prior to the locking of any vnodes when we are expecting to
* write. We do not want to starve the buffer cache with too many
* dirty buffers so we block here. By blocking prior to the locking
* of any vnodes we attempt to avoid the situation where a locked vnode
* prevents the various system daemons from flushing related buffers.
*/
void
bwillwrite(void)
{
int twenty = (hidirtybuffers - lodirtybuffers) / 5;
if (numdirtybuffers > hidirtybuffers + twenty) {
int s;
s = splbio();
while (numdirtybuffers > hidirtybuffers) {
bd_wakeup(hidirtybuffers);
needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
}
splx(s);
}
}
/*
* brelse:
*
@ -799,8 +875,10 @@ brelse(struct buf * bp)
bp->b_flags |= B_INVAL;
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
if (bp->b_flags & B_DELWRI)
if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
numdirtywakeup();
}
bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
if ((bp->b_flags & B_VMIO) == 0) {
if (bp->b_bufsize)
@ -991,6 +1069,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
numdirtywakeup();
}
runningbufspace -= bp->b_bufsize;
@ -1070,7 +1149,7 @@ bqrelse(struct buf * bp)
/*
* Something we can maybe wakeup
*/
if (bp->b_bufsize)
if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
bufspacewakeup();
/* unlock */
@ -1139,7 +1218,7 @@ gbincore(struct vnode * vp, daddr_t blkno)
struct buf *bp;
struct bufhashhdr *bh;
bh = BUFHASH(vp, blkno);
bh = bufhash(vp, blkno);
bp = bh->lh_first;
/* Search hash chain */
@ -1155,14 +1234,18 @@ gbincore(struct vnode * vp, daddr_t blkno)
}
/*
* this routine implements clustered async writes for
* clearing out B_DELWRI buffers... This is much better
* than the old way of writing only one buffer at a time.
* vfs_bio_awrite:
*
* Implement clustered async writes for clearing out B_DELWRI buffers.
* This is much better then the old way of writing only one buffer at
* a time. Note that we may not be presented with the buffers in the
* correct order, so we search for the cluster in both directions.
*/
int
vfs_bio_awrite(struct buf * bp)
{
int i;
int j;
daddr_t lblkno = bp->b_lblkno;
struct vnode *vp = bp->b_vp;
int s;
@ -1174,8 +1257,9 @@ vfs_bio_awrite(struct buf * bp)
s = splbio();
/*
* right now we support clustered writing only to regular files, and
* then only if our I/O system is not saturated.
* right now we support clustered writing only to regular files. If
* we find a clusterable block we could be in the middle of a cluster
* rather then at the beginning.
*/
if ((vp->v_type == VREG) &&
(vp->v_mount != 0) && /* Only on nodes that have the size info */
@ -1191,18 +1275,34 @@ vfs_bio_awrite(struct buf * bp)
(B_DELWRI | B_CLUSTEROK)) &&
(bpa->b_bufsize == size)) {
if ((bpa->b_blkno == bpa->b_lblkno) ||
(bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
(bpa->b_blkno !=
bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
break;
} else {
break;
}
}
ncl = i;
for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
if ((bpa = gbincore(vp, lblkno - j)) &&
BUF_REFCNT(bpa) == 0 &&
((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
(B_DELWRI | B_CLUSTEROK)) &&
(bpa->b_bufsize == size)) {
if ((bpa->b_blkno == bpa->b_lblkno) ||
(bpa->b_blkno !=
bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
break;
} else {
break;
}
}
--j;
ncl = i + j;
/*
* this is a possible cluster write
*/
if (ncl != 1) {
nwritten = cluster_wbuild(vp, size, lblkno, ncl);
nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
splx(s);
return nwritten;
}
@ -1240,21 +1340,12 @@ vfs_bio_awrite(struct buf * bp)
* If we have to flush dirty buffers ( but we try to avoid this )
*
* To avoid VFS layer recursion we do not flush dirty buffers ourselves.
* Instead we ask the pageout daemon to do it for us. We attempt to
* Instead we ask the buf daemon to do it for us. We attempt to
* avoid piecemeal wakeups of the pageout daemon.
*/
/*
* We fully expect to be able to handle any fragmentation and buffer
* space issues by freeing QUEUE_CLEAN buffers. If this fails, we
* have to wakeup the pageout daemon and ask it to flush some of our
* QUEUE_DIRTY buffers. We have to be careful to prevent a deadlock.
* XXX
*/
static struct buf *
getnewbuf(struct vnode *vp, daddr_t blkno,
int slpflag, int slptimeo, int size, int maxsize)
getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
struct buf *bp;
struct buf *nbp;
@ -1262,8 +1353,6 @@ getnewbuf(struct vnode *vp, daddr_t blkno,
int outofspace;
int nqindex;
int defrag = 0;
static int newbufcnt = 0;
int lastnewbuf = newbufcnt;
++getnewbufcalls;
--getnewbufrestarts;
@ -1338,13 +1427,9 @@ restart:
* depending.
*/
if (nbp)
--getnewbufloops;
while ((bp = nbp) != NULL) {
int qindex = nqindex;
++getnewbufloops;
/*
* Calculate next bp ( we can only use it if we do not block
* or do other fancy things ).
@ -1372,7 +1457,6 @@ restart:
/*
* Sanity Checks
*/
KASSERT(BUF_REFCNT(bp) == 0, ("getnewbuf: busy buffer %p on free list", bp));
KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
/*
@ -1388,14 +1472,10 @@ restart:
* buffer isn't useful for fixing that problem we continue.
*/
if (defrag > 0 && bp->b_kvasize == 0) {
++getnewbufloops1;
if (defrag > 0 && bp->b_kvasize == 0)
continue;
}
if (outofspace > 0 && bp->b_bufsize == 0) {
++getnewbufloops2;
if (outofspace > 0 && bp->b_bufsize == 0)
continue;
}
/*
* Start freeing the bp. This is somewhat involved. nbp
@ -1433,7 +1513,6 @@ restart:
}
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
@ -1451,7 +1530,6 @@ restart:
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_usecount = 5;
LIST_INIT(&bp->b_dep);
@ -1489,19 +1567,26 @@ restart:
/*
* If we exhausted our list, sleep as appropriate. We may have to
* wakeup the pageout daemon to write out some dirty buffers.
* wakeup various daemons and write out some dirty buffers.
*
* Generally we are sleeping due to insufficient buffer space.
*/
if (bp == NULL) {
int flags;
char *waitmsg;
dosleep:
if (defrag > 0)
if (defrag > 0) {
flags = VFS_BIO_NEED_KVASPACE;
else if (outofspace > 0)
waitmsg = "nbufkv";
} else if (outofspace > 0) {
waitmsg = "nbufbs";
flags = VFS_BIO_NEED_BUFSPACE;
else
} else {
waitmsg = "newbuf";
flags = VFS_BIO_NEED_ANY;
}
/* XXX */
@ -1509,7 +1594,7 @@ dosleep:
needsbuffer |= flags;
while (needsbuffer & flags) {
if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
"newbuf", slptimeo))
waitmsg, slptimeo))
return (NULL);
}
} else {
@ -1553,42 +1638,7 @@ dosleep:
}
bp->b_data = bp->b_kvabase;
}
/*
* If we have slept at some point in this process and another
* process has managed to allocate a new buffer while we slept,
* we have to return NULL so that our caller can recheck to
* ensure that the other process did not create an identically
* identified buffer to the one we were requesting. We make this
* check by incrementing the static int newbufcnt each time we
* successfully allocate a new buffer. By saving the value of
* newbufcnt in our local lastnewbuf, we can compare newbufcnt
* with lastnewbuf to see if any other process managed to
* allocate a buffer while we were doing so ourselves.
*
* Note that bp, if valid, is locked.
*/
if (lastnewbuf == newbufcnt) {
/*
* No buffers allocated, so we can return one if we were
* successful, or continue trying if we were not successful.
*/
if (bp != NULL) {
newbufcnt += 1;
return (bp);
}
goto restart;
}
/*
* Another process allocated a buffer since we were called, so
* we have to free the one we allocated and return NULL to let
* our caller recheck to see if a new buffer is still needed.
*/
if (bp != NULL) {
bp->b_flags |= B_INVAL;
brelse(bp);
}
return (NULL);
return(bp);
}
/*
@ -1601,7 +1651,6 @@ static void
waitfreebuffers(int slpflag, int slptimeo)
{
while (numfreebuffers < hifreebuffers) {
bd_wakeup(0);
if (numfreebuffers >= hifreebuffers)
break;
needsbuffer |= VFS_BIO_NEED_FREE;
@ -1646,60 +1695,72 @@ buf_daemon()
bd_request = 0;
/*
* Do the flush.
* Do the flush. Limit the number of buffers we flush in one
* go. The failure condition occurs when processes are writing
* buffers faster then we can dispose of them. In this case
* we may be flushing so often that the previous set of flushes
* have not had time to complete, causing us to run out of
* physical buffers and block.
*/
{
while (numdirtybuffers > bd_flushto) {
int runcount = maxbdrun;
while (numdirtybuffers > bd_flushto && runcount) {
--runcount;
if (flushbufqueues() == 0)
break;
}
}
/*
* Whew. If nobody is requesting anything we sleep until the
* next event. If we sleep and the sleep times out and
* nobody is waiting for interesting things we back-off.
* Otherwise we get more aggressive.
* If nobody is requesting anything we sleep
*/
if (bd_request == 0)
tsleep(&bd_request, PVM, "psleep", bd_interval);
/*
* We calculate how much to add or subtract from bd_flushto
* and bd_interval based on how far off we are from the
* optimal number of dirty buffers, which is 20% below the
* hidirtybuffers mark. We cannot use hidirtybuffers straight
* because being right on the mark will cause getnewbuf()
* to oscillate our wakeup.
*
* The larger the error in either direction, the more we adjust
* bd_flushto and bd_interval. The time interval is adjusted
* by 2 seconds per whole-buffer-range of error. This is an
* exponential convergence algorithm, with large errors
* producing large changes and small errors producing small
* changes.
*/
if (bd_request == 0 &&
tsleep(&bd_request, PVM, "psleep", bd_interval) &&
needsbuffer == 0) {
/*
* timed out and nothing serious going on,
* increase the flushto high water mark to reduce
* the flush rate.
*/
bd_flushto += 10;
} else {
/*
* We were woken up or hit a serious wall that needs
* to be addressed.
*/
bd_flushto -= 10;
if (needsbuffer) {
int middb = (lodirtybuffers+hidirtybuffers)/2;
bd_interval >>= 1;
if (bd_flushto > middb)
bd_flushto = middb;
}
{
int brange = hidirtybuffers - lodirtybuffers;
int middb = hidirtybuffers - brange / 5;
int deltabuf = middb - numdirtybuffers;
bd_flushto += deltabuf / 20;
bd_interval += deltabuf * (2 * hz) / (brange * 1);
}
if (bd_flushto < lodirtybuffers) {
if (bd_flushto < lodirtybuffers)
bd_flushto = lodirtybuffers;
bd_interval -= hz / 10;
}
if (bd_flushto > hidirtybuffers) {
if (bd_flushto > hidirtybuffers)
bd_flushto = hidirtybuffers;
bd_interval += hz / 10;
}
if (bd_interval < hz / 10)
bd_interval = hz / 10;
if (bd_interval > 5 * hz)
bd_interval = 5 * hz;
}
}
/*
* flushbufqueues:
*
* Try to flush a buffer in the dirty queue. We must be careful to
* free up B_INVAL buffers instead of write them, which NFS is
* particularly sensitive to.
*/
static int
flushbufqueues(void)
{
@ -1709,15 +1770,6 @@ flushbufqueues(void)
bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
while (bp) {
/*
* Try to free up B_INVAL delayed-write buffers rather then
* writing them out. Note also that NFS is somewhat sensitive
* to B_INVAL buffers so it is doubly important that we do
* this.
*
* We do not try to sync buffers whos vnodes are locked, we
* cannot afford to block in this process.
*/
KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
if ((bp->b_flags & B_DELWRI) != 0) {
if (bp->b_flags & B_INVAL) {
@ -1728,11 +1780,9 @@ flushbufqueues(void)
++r;
break;
}
if (!VOP_ISLOCKED(bp->b_vp)) {
vfs_bio_awrite(bp);
++r;
break;
}
vfs_bio_awrite(bp);
++r;
break;
}
bp = TAILQ_NEXT(bp, b_freelist);
}
@ -1957,8 +2007,6 @@ loop:
*/
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
if (bp->b_usecount < BUF_MAXUSE)
++bp->b_usecount;
if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
"getblk", slpflag, slptimeo) == ENOLCK)
goto loop;
@ -2036,8 +2084,6 @@ loop:
goto loop;
}
if (bp->b_usecount < BUF_MAXUSE)
++bp->b_usecount;
splx(s);
bp->b_flags &= ~B_DONE;
} else {
@ -2063,8 +2109,7 @@ loop:
maxsize = vmio ? size + (offset & PAGE_MASK) : size;
maxsize = imax(maxsize, bsize);
if ((bp = getnewbuf(vp, blkno,
slpflag, slptimeo, size, maxsize)) == NULL) {
if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
if (slpflag || slptimeo) {
splx(s);
return NULL;
@ -2079,7 +2124,8 @@ loop:
* If the buffer is created out from under us, we have to
* throw away the one we just created. There is now window
* race because we are safely running at splbio() from the
* point of the duplicate buffer creation through to here.
* point of the duplicate buffer creation through to here,
* and we've locked the buffer.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@ -2096,7 +2142,7 @@ loop:
bgetvp(vp, bp);
LIST_REMOVE(bp, b_hash);
bh = BUFHASH(vp, blkno);
bh = bufhash(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
/*
@ -2135,7 +2181,7 @@ geteblk(int size)
int s;
s = splbio();
while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
splx(s);
allocbuf(bp, size);
bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
@ -2218,7 +2264,8 @@ allocbuf(struct buf *bp, int size)
#if !defined(NO_B_MALLOC)
/*
* We only use malloced memory on the first allocation.
* and revert to page-allocated memory when the buffer grows.
* and revert to page-allocated memory when the buffer
* grows.
*/
if ( (bufmallocspace < maxbufmallocspace) &&
(bp->b_bufsize == 0) &&


@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
* $Id: vfs_cluster.c,v 1.85 1999/06/29 05:59:43 peter Exp $
* $Id: vfs_cluster.c,v 1.86 1999/07/04 00:31:17 mckusick Exp $
*/
#include "opt_debug_cluster.h"
@ -150,21 +150,12 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
}
/*
* Set another read-ahead mark so we know to check
* again.
* Set another read-ahead mark so we know
* to check again.
*/
if (((i % racluster) == (racluster - 1)) ||
(i == (maxra - 1)))
tbp->b_flags |= B_RAM;
#if 0
if ((tbp->b_usecount < 1) &&
BUF_REFCNT(tbp) == 0 &&
(tbp->b_qindex == QUEUE_LRU)) {
TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
}
#endif
}
splx(s);
if (i >= maxra) {
@ -586,7 +577,7 @@ cluster_write(bp, filesize)
if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
if (!async)
cluster_wbuild(vp, lblocksize,
cluster_wbuild_wb(vp, lblocksize,
vp->v_cstart, cursize);
} else {
struct buf **bpp, **endbp;


@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
* $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $
* $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $
*/
/*
@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad,
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
int vfs_ioopt = 0;
#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif


@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
* $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $
* $Id: vfs_subr.c,v 1.206 1999/07/04 00:25:29 mckusick Exp $
*/
/*
@ -115,8 +115,8 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad,
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
int vfs_ioopt = 0;
#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif


@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
* $Id: vfs_vnops.c,v 1.68 1999/04/28 11:37:12 phk Exp $
* $Id: vfs_vnops.c,v 1.69 1999/07/02 16:29:15 phk Exp $
*/
#include <sys/param.h>
@ -334,10 +334,14 @@ vn_write(fp, uio, cred, flags)
struct ucred *cred;
int flags;
{
struct vnode *vp = (struct vnode *)fp->f_data;
struct vnode *vp;
struct proc *p = uio->uio_procp;
int error, ioflag;
vp = (struct vnode *)fp->f_data;
if (vp->v_type == VREG)
bwillwrite();
vp = (struct vnode *)fp->f_data; /* XXX needed? */
ioflag = IO_UNIT;
if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
ioflag |= IO_APPEND;


@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
* $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $
* $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@ -100,7 +100,7 @@ struct buf {
TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
unsigned char b_usecount; /* buffer use count */
unsigned char b_unused1; /* unused field */
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
int b_error; /* Errno value. */
@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head)
#endif /* KERNEL */
/*
* number of buffer hash entries
*/
#define BUFHSZ 512
/*
* buffer hash table calculation, originally by David Greenman
*/
#define BUFHASH(vnp, bn) \
(&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ])
/*
* Definitions for the buffer free lists.
*/
@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
struct uio;
vm_offset_t bufhashinit __P((vm_offset_t));
void bufinit __P((void));
void bwillwrite __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));


@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
* $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $
* $Id: buf.h,v 1.75 1999/07/04 00:25:32 mckusick Exp $
*/
#ifndef _SYS_BUF_H_
@ -100,7 +100,7 @@ struct buf {
TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
unsigned char b_usecount; /* buffer use count */
unsigned char b_unused1; /* unused field */
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
int b_error; /* Errno value. */
@ -410,18 +410,6 @@ bufq_first(struct buf_queue_head *head)
#endif /* KERNEL */
/*
* number of buffer hash entries
*/
#define BUFHSZ 512
/*
* buffer hash table calculation, originally by David Greenman
*/
#define BUFHASH(vnp, bn) \
(&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ])
/*
* Definitions for the buffer free lists.
*/
@ -458,7 +446,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
struct uio;
vm_offset_t bufhashinit __P((vm_offset_t));
void bufinit __P((void));
void bwillwrite __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));


@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
* $Id: ufs_readwrite.c,v 1.57 1999/01/28 00:57:56 dillon Exp $
* $Id: ufs_readwrite.c,v 1.58 1999/04/05 19:38:30 julian Exp $
*/
#define BLKSIZE(a, b, c) blksize(a, b, c)
@ -106,7 +106,8 @@ READ(ap)
if (object)
vm_object_reference(object);
#if 1
#ifdef ENABLE_VFS_IOOPT
/*
* If IO optimisation is turned on,
* and we are NOT a VM based IO request,
@ -150,7 +151,7 @@ READ(ap)
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
break;
#if 1
#ifdef ENABLE_VFS_IOOPT
if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
/*
* Obviously we didn't finish above, but we
@ -276,6 +277,7 @@ READ(ap)
xfersize = size;
}
#ifdef ENABLE_VFS_IOOPT
if (vfs_ioopt && object &&
(bp->b_flags & B_VMIO) &&
((blkoffset & PAGE_MASK) == 0) &&
@ -289,7 +291,9 @@ READ(ap)
error =
uiomoveco((char *)bp->b_data + blkoffset,
(int)xfersize, uio, object);
} else {
} else
#endif
{
/*
* otherwise use the general form
*/