/* Source to kern/vfs_bio.c */
/*
* Mach Operating System
* Copyright (c) 1992 Carnegie Mellon University
* All Rights Reserved.
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or [email protected]
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie Mellon
* the rights to redistribute these changes.
*/
/*
* vfs_bio.c,v 1.1.2.2 1993/08/07 01:56:01 cgd Exp
*
* (Mach) HISTORY
* Revision 2.3 92/07/08 16:19:50 mrt
* Added fourth arg, TRUE, to vm_allocate call in getnewbuf.
* [92/07/02 mrt]
*
* Revision 2.2 92/06/25 17:25:40 mrt
* Clear b_resid on release. No one cares once released.
* [ XXX -- this had to be squished; symlinks care -- cgd]
* [92/06/25 rwd]
* Set b_rcred before VOP_STRATEGY calls. [from Jolitz]
*
* Revision 2.1 92/04/21 17:12:36 rwd
* BSDSS
*/
/*
* Copyright (c) 1982, 1986, 1989 Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vfs_bio.c 7.40 (Berkeley) 5/8/91
*/
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
/* When nonzero, getnewbuf() and allocbuf() printf each (re)allocation. */
int debug_bio = 0;
/*
 * Bytes of buffer memory not yet handed out; bufinit() sets this to
 * bufpages * NBPG and every storage allocation subtracts from it.
 */
int freebufspace = 0;
/* Running total of bytes allocated to buffer contents. */
int allocbufspace = 0;
extern int bufpages;
/* Obtain a busy buffer with storage; returns 0 if it had to sleep. */
static struct buf *getnewbuf __P((int));
/* VM map passed to kmem_alloc()/kmem_free() for buffer storage. */
extern vm_map_t buffer_map;
struct buf *buf; /* the buffer pool itself */
char *buffers;
int nbuf; /* number of buffer headers */
int bufpages; /* number of memory pages in the buffer pool */
struct buf *swbuf; /* swap I/O headers (XXX go elsewhere?) */
int nswbuf;
struct bufhd bufhash[BUFHSZ]; /* heads of hash lists */
struct buf bfreelist[BQUEUES]; /* heads of available lists */
struct buf bswlist; /* head of free swap header list */
struct buf *bclnlist; /* head of cleaned page list */
/*
 * Set up the buffer cache bookkeeping: the amount of free buffer
 * memory, the hash chain heads, the free list heads, and every buffer
 * header in the pool.
 */
void
bufinit()
{
	struct bufhd *hashhead;
	struct buf *qhead;
	struct buf *bp;
	int i;

	/* all of the configured pages start out unallocated */
	freebufspace = bufpages * NBPG;

	/* each hash chain begins as an empty circular list */
	for (i = 0; i < BUFHSZ; i++) {
		hashhead = &bufhash[i];
		hashhead->b_flags = 0;
		hashhead->b_forw = (struct buf *)hashhead;
		hashhead->b_back = (struct buf *)hashhead;
	}

	/* each free list head points back at itself on both chains */
	for (i = 0; i < BQUEUES; i++) {
		qhead = &bfreelist[i];
		qhead->b_flags = 0;
		qhead->av_forw = qhead;
		qhead->av_back = qhead;
		qhead->b_forw = qhead;
		qhead->b_back = qhead;
	}

	/*
	 * Every header in the pool starts life as an invalid, storage-less
	 * header queued (and hashed) on the EMPTY list.
	 */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_flags = B_HEAD | B_INVAL;
		bp->b_dev = NODEV;
		bp->b_wcred = NOCRED;
		bp->b_rcred = NOCRED;
		bp->b_vp = 0;
		binstailfree(bp, bfreelist + BQ_EMPTY);
		binshash(bp, bfreelist + BQ_EMPTY);
	}
}
/*
* Find the block in the buffer pool.
* If the buffer is not present, allocate a new buffer and load
* its contents according to the filesystem fill routine.
*/
int
bread(vp, blkno, size, cred, bpp)
struct vnode *vp;
daddr_t blkno;
int size;
struct ucred *cred;
struct buf **bpp;
{
struct buf *bp;
int rv = 0;
bp = getblk (vp, blkno, size);
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0 || (bp->b_flags & B_INVAL) != 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
if (cred != NOCRED && bp->b_rcred == NOCRED) {
crhold(cred);
bp->b_rcred = cred;
}
VOP_STRATEGY(bp);
rv = biowait (bp);
}
*bpp = bp;
return (rv);
}
/*
 * Operates like bread, but also starts I/O on the specified
 * read-ahead block. [See page 55 of Bach's Book]
 *
 * vp, blkno, size	block to read and wait for
 * rablkno, rabsize	block to read ahead asynchronously
 * cred			credential recorded as b_rcred on any buffer that
 *			needs I/O and has none yet (NOCRED to skip)
 * bpp			receives the buffer for blkno
 *
 * Returns 0 when blkno was already cached, otherwise the result of
 * biowait() on its buffer.  The read-ahead buffer is never waited for:
 * if it was cached it is released immediately; otherwise it is
 * started with B_ASYNC set.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	daddr_t rablkno;
	int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rabp;
	int rv = 0, needwait = 0;

	bp = getblk(vp, blkno, size);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0 || (bp->b_flags & B_INVAL) != 0) {
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (cred != NOCRED && bp->b_rcred == NOCRED) {
			crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		needwait++;
	}

	rabp = getblk(vp, rablkno, rabsize);

	/* if not found in cache, do some I/O (overlapped with first) */
	if ((rabp->b_flags & B_CACHE) == 0 || (rabp->b_flags & B_INVAL) != 0) {
		rabp->b_flags |= B_READ | B_ASYNC;
		rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		/*
		 * The test must look at the read-ahead buffer's own
		 * credential slot.  Testing bp->b_rcred here would leave
		 * rabp without a credential whenever bp already has one,
		 * and could overwrite (and so leak the reference held by)
		 * a credential already attached to rabp.
		 */
		if (cred != NOCRED && rabp->b_rcred == NOCRED) {
			crhold(cred);
			rabp->b_rcred = cred;
		}
		VOP_STRATEGY(rabp);
	} else
		brelse(rabp);

	/* wait for original I/O */
	if (needwait)
		rv = biowait(bp);

	*bpp = bp;
	return (rv);
}
/*
* Synchronous write.
* Release buffer on completion.
*/
int
bwrite(bp)
register struct buf *bp;
{
int rv;
if(bp->b_flags & B_INVAL) {
brelse(bp);
return (0);
} else {
int wasdelayed;
if(!(bp->b_flags & B_BUSY))
panic("bwrite: not busy");
wasdelayed = bp->b_flags & B_DELWRI;
bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_ASYNC|B_DELWRI);
if(wasdelayed)
reassignbuf(bp, bp->b_vp);
bp->b_flags |= B_DIRTY;
bp->b_vp->v_numoutput++;
VOP_STRATEGY(bp);
rv = biowait(bp);
if (!rv)
bp->b_flags &= ~B_DIRTY;
brelse(bp);
return (rv);
}
}
/*
 * Delayed write.
 *
 * Mark the buffer dirty and release it without queueing any I/O.
 * Intended for buffers that are likely to be modified again soon,
 * such as a small write that only partially fills the buffer.
 *
 * NB: writes to magnetic tape must go out in the order requested,
 * so a B_TAPE buffer is written synchronously instead.
 */
void
bdwrite(bp)
	register struct buf *bp;
{
	if ((bp->b_flags & B_BUSY) == 0)
		panic("bdwrite: not busy");

	if (bp->b_flags & B_INVAL) {
		/* invalid contents: nothing to preserve */
		brelse(bp);
	} else if (bp->b_flags & B_TAPE) {
		/* tapes cannot be delayed */
		bwrite(bp);
	} else {
		bp->b_flags &= ~(B_READ|B_DONE);
		bp->b_flags |= B_DIRTY|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		brelse(bp);
	}
}
/*
 * Asynchronous write.
 *
 * Hand the buffer to the strategy routine with B_ASYNC set and return
 * without waiting.  The buffer must be busy (panic if not); an invalid
 * buffer is released without any I/O.
 */
void
bawrite(bp)
	register struct buf *bp;
{
	int delayed;

	if ((bp->b_flags & B_BUSY) == 0)
		panic("bawrite: not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	/* note whether this was a delayed write before the flag is cleared */
	delayed = bp->b_flags & B_DELWRI;
	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	if (delayed)
		reassignbuf(bp, bp->b_vp);

	/* issue the write and account for it on the vnode */
	bp->b_flags |= B_DIRTY | B_ASYNC;
	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);
}
/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 *
 * Wakes anyone sleeping for a free buffer or for this particular
 * buffer, invalidates the buffer if it is marked B_INVAL or B_ERROR,
 * places it on the AGE or LRU free list, and finally clears B_BUSY.
 * The whole operation runs between splbio() and splx().
 */
void
brelse(bp)
register struct buf *bp;
{
int x;
/* anyone need a "free" block? */
x=splbio();
/*
 * getnewbuf() sets B_WANTED on the AGE list head and sleeps on
 * bfreelist when no buffer is available.
 */
if ((bfreelist + BQ_AGE)->b_flags & B_WANTED) {
(bfreelist + BQ_AGE) ->b_flags &= ~B_WANTED;
wakeup(bfreelist);
}
/* anyone need this very block? */
if (bp->b_flags & B_WANTED) {
bp->b_flags &= ~B_WANTED;
wakeup(bp);
}
/*
 * Invalid or errored contents: force B_INVAL, forget any delayed
 * write or cached status, and detach from the vnode.
 */
if (bp->b_flags & (B_INVAL|B_ERROR)) {
bp->b_flags |= B_INVAL;
bp->b_flags &= ~(B_DELWRI|B_CACHE);
if(bp->b_vp)
brelvp(bp);
}
/* enqueue */
/*
 * NOTE(review): the binsheadfree/binstailfree invocations below have
 * no trailing semicolon; presumably the macros expand to brace-enclosed
 * blocks so that the following `else' still parses -- confirm against
 * <sys/buf.h> before touching the punctuation.
 */
/* just an empty buffer head ... */
/*if(bp->b_flags & B_HEAD)
binsheadfree(bp, bfreelist + BQ_EMPTY)*/
/* buffers with junk contents */
/*else*/ if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE))
binsheadfree(bp, bfreelist + BQ_AGE)
/* buffers with stale but valid contents */
else if(bp->b_flags & B_AGE)
binstailfree(bp, bfreelist + BQ_AGE)
/* buffers with valid and quite potentially reuseable contents */
else
binstailfree(bp, bfreelist + BQ_LRU)
/* unlock */
bp->b_flags &= ~B_BUSY;
/* bp->b_resid = 0; XXX (cgd) -- actually, we need resid for symlinks */
splx(x);
}
/*
 * Find a buffer which is available for use.
 * If free memory for buffer space and an empty header from the empty list,
 * use that. Otherwise, select something from a free list.
 * Preference is to AGE list, then LRU list.
 *
 * sz is recorded as the buffer's b_bcount; the storage attached is
 * always MAXBSIZE bytes (see the XXX below).
 *
 * Returns a busy buffer that has been taken off its free list and hash
 * chain and has its identity fields reset, or 0 if no buffer was
 * available and the routine slept; callers are expected to retry.
 */
static struct buf *
getnewbuf(sz)
int sz;
{
struct buf *bp;
int x, allocsize;
x = splbio();
allocsize = MAXBSIZE; /* XXX -- should be round_page(sz) */
start:
/* can we constitute a new buffer? */
if (freebufspace >= allocsize
&& bfreelist[BQ_EMPTY].av_forw != (struct buf *)bfreelist+BQ_EMPTY) {
caddr_t addr;
int kr;
#if 0
if ((kr = vm_allocate(buffer_map, (vm_offset_t *) &addr, (vm_size_t) allocsize, TRUE))
!= KERN_SUCCESS) goto tryfree;
#else
/* allocation failure is not fatal: fall back to recycling a buffer */
if (! (addr = (caddr_t) kmem_alloc (buffer_map, (vm_size_t) allocsize))) goto tryfree;
#endif
/* account for the storage just attached */
freebufspace -= allocsize;
allocbufspace += allocsize;
bp = bfreelist[BQ_EMPTY].av_forw;
if (debug_bio) printf(" allocate bp: %x addr: %x (alloc)size: %x\n",bp, addr, allocsize);
bp->b_flags = B_BUSY | B_INVAL;
bremfree(bp);
bp->b_un.b_addr = (caddr_t) addr;
bp->b_bufsize = allocsize;
goto fillin;
}
tryfree:
/* recycle the buffer at the front of the AGE list, else of the LRU list */
if (bfreelist[BQ_AGE].av_forw != (struct buf *)bfreelist+BQ_AGE) {
bp = bfreelist[BQ_AGE].av_forw;
bremfree(bp);
} else if (bfreelist[BQ_LRU].av_forw != (struct buf *)bfreelist+BQ_LRU) {
bp = bfreelist[BQ_LRU].av_forw;
bremfree(bp);
} else {
/* wait for a free buffer of any kind */
/* brelse() clears this flag and does wakeup(bfreelist) */
(bfreelist + BQ_AGE)->b_flags |= B_WANTED;
tsleep(bfreelist, PRIBIO, "getnewbuf", 0);
splx(x);
return (0);
}
/* if we are a delayed write, convert to an async write! */
if (bp->b_flags & B_DELWRI) {
bp->b_flags |= B_BUSY;
bawrite (bp);
/* that buffer is now in flight; look for another one */
goto start;
}
if(bp->b_vp)
brelvp(bp);
/* we are not free, nor do we contain interesting data */
/* drop the credential references held by the previous user */
if (bp->b_rcred != NOCRED)
crfree(bp->b_rcred);
if (bp->b_wcred != NOCRED)
crfree(bp->b_wcred);
bp->b_flags = B_BUSY;
fillin:
/* take the buffer off its hash chain, then reset its identity */
bremhash(bp);
splx(x);
bp->b_dev = NODEV;
bp->b_vp = NULL;
bp->b_blkno = bp->b_lblkno = 0;
bp->b_iodone = 0;
bp->b_error = 0;
bp->b_resid = 0;
bp->b_wcred = bp->b_rcred = NOCRED;
/* a recycled buffer may carry a different amount of storage */
if (bp->b_bufsize != allocsize)
allocbuf(bp, allocsize);
bp->b_bcount = sz;
bp->b_dirtyoff = bp->b_dirtyend = 0;
return (bp);
}
/*
 * Look up block blkno of vnode vp on its hash chain.
 *
 * Returns the matching buffer if one exists and is not marked
 * B_INVAL, otherwise 0.
 *
 * This routine must be called at splbio(). If it is not, and a disk
 * interrupt is taken while the chain is being walked, the walk can
 * loop forever.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *head;
	struct buf *cand;

	head = BUFHASH(vp, blkno);

	/* the chain is circular: stop on arriving back at the head */
	for (cand = head->b_forw; cand != (struct buf *) head;
	    cand = cand->b_forw) {
		if (cand->b_lblkno != blkno || cand->b_vp != vp)
			continue;
		if ((cand->b_flags & B_INVAL) != 0)
			continue;
		/* hit */
		return (cand);
	}
	return(0);
}
/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset. If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to insure that the
 * cached blocks be of the correct size.
 *
 * The returned buffer is always B_BUSY.  B_CACHE is set only when the
 * block was found by incore(); a newly assigned buffer comes back with
 * b_flags equal to exactly B_BUSY.
 */
struct buf *
getblk(vp, blkno, size)
register struct vnode *vp;
daddr_t blkno;
int size;
{
struct buf *bp, *bh;
int x;
x = splbio();
start:
if (bp = incore(vp, blkno)) {
/* someone else holds it: ask to be woken by brelse() and look again */
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
tsleep (bp, PRIBIO, "getblk", 0);
goto start;
}
bp->b_flags |= B_BUSY | B_CACHE;
bremfree(bp);
if (size > bp->b_bufsize)
panic("getblk: buffer too small"); /* XXX */
/* if (bp->b_bufsize != size) allocbuf(bp, size); */
} else {
/* not cached: getnewbuf() raises and lowers the spl itself */
splx(x);
if((bp = getnewbuf(size)) == NULL) {
/* getnewbuf() slept; the block may have appeared meanwhile */
x = splbio();
goto start;
}
bp->b_blkno = bp->b_lblkno = blkno;
bgetvp(vp, bp);
x = splbio();
/* enter the buffer on the hash chain for (vp, blkno) */
bh = BUFHASH(vp, blkno);
binshash(bp, bh);
bp->b_flags = B_BUSY;
}
splx(x);
return (bp);
}
/*
 * Return an empty buffer of the given size that belongs to no vnode.
 * Keeps calling getnewbuf() until it yields a buffer, then hashes the
 * buffer onto the AGE free list head.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;
	int s;

	/* getnewbuf() returns 0 after sleeping; just ask again */
	do {
		bp = getnewbuf(size);
	} while (bp == 0);

	s = splbio();
	binshash(bp, bfreelist + BQ_AGE);
	splx(s);

	return (bp);
}
/*
 * Exchange a buffer's underlying buffer storage for one of different
 * size, taking care to maintain contents appropriately. When buffer
 * increases in size, caller is responsible for filling out additional
 * contents. When buffer shrinks in size, data is lost, so caller must
 * first return it to backing store before shrinking the buffer, as
 * no implied I/O will be done.
 *
 * bp	buffer whose storage is to be resized
 * size	byte count stored in bp->b_bcount on return
 *
 * As currently written the storage is only ever grown, and always to
 * MAXBSIZE (see the XXX below); a buffer that already has at least
 * that much storage just has b_bcount updated.  Panics if new storage
 * cannot be allocated.
 */
void
allocbuf(bp, size)
	register struct buf *bp;
	int size;
{
	vm_size_t current_size, desired_size;
	vm_offset_t new_start;
#if 0
	int kr;		/* only the vm_allocate() variant produces a status */
#endif

	current_size = bp->b_bufsize;
	desired_size = MAXBSIZE;	/* XXX (cgd) -- round_page(size) */

	if (current_size < desired_size) {
		/*
		 * Buffer is growing.
		 * If buffer already has data, allocate new area and copy
		 * old data to it.
		 */
#if 0
		kr = vm_allocate(buffer_map,
		    &new_start,
		    desired_size,
		    TRUE);
		if (kr != KERN_SUCCESS)
			panic("allocbuf: allocate",kr);
#else
		new_start = kmem_alloc (buffer_map, desired_size);
		/*
		 * kmem_alloc() reports failure by returning 0; there is no
		 * status code on this path, so none is passed to panic().
		 */
		if (! new_start)
			panic("allocbuf: allocate");
#endif
		if (debug_bio)
			printf(" reallocate bp: %x addr: %x size: %x\n",bp, new_start, desired_size);

		if (current_size) {
			/* preserve the old contents, then give the old area back */
			bcopy(bp->b_un.b_addr,
			    (caddr_t) new_start,
			    bp->b_bufsize);
#if 0
			kr = vm_deallocate(buffer_map,
			    (vm_offset_t)bp->b_un.b_addr,
			    current_size);
			if (kr != KERN_SUCCESS)
				panic("allocbuf: deallocate",kr);
#else
			kmem_free (buffer_map, (vm_offset_t)bp->b_un.b_addr, current_size);
#endif
			if (debug_bio)
				printf(" deallocate bp: %x addr: %x size: %x\n",bp, bp->b_un.b_addr,bp->b_bufsize);
		}

		bp->b_un.b_addr = (char *)new_start;
		bp->b_bufsize = desired_size;

		/* adjust buffer cache's idea of memory allocated to buffer contents */
		freebufspace -= desired_size - current_size;
		allocbufspace += desired_size - current_size;
	}

	bp->b_bcount = size;
}
/*
 * Sleep until the I/O on this buffer is marked B_DONE, then report
 * how it went.
 *
 * Returns 0 when neither B_ERROR nor b_error is set.  Otherwise the
 * buffer is invalidated (and rehashed onto the AGE list head if it was
 * not already invalid), b_error defaults to EIO when unset, and
 * b_error is returned.
 */
int
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while (!(bp->b_flags & B_DONE))
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);

	/* clean completion */
	if (!(bp->b_flags & B_ERROR) && !bp->b_error) {
		splx(s);
		return (0);
	}

	if (!(bp->b_flags & B_INVAL)) {
		bp->b_flags |= B_INVAL;
		/*
		 * XXX
		 * brelse() already puts buffers with B_ERROR set on the
		 * age queue, and it is probably wrong to let bp->b_error
		 * override B_ERROR, but it certainly appears to work OK
		 * this way...
		 */
		bremhash(bp);
		binshash(bp, bfreelist + BQ_AGE);
	}

	/* make the error code and the error flag agree */
	if (bp->b_error)
		bp->b_flags |= B_ERROR;
	else
		bp->b_error = EIO;

	splx(s);
	return (bp->b_error);
}
/*
 * Complete the I/O on a buffer: run the optional b_iodone callback
 * (when B_CALL is set), finish write accounting, release the buffer
 * if the request was asynchronous, and finally mark it B_DONE and wake
 * anyone sleeping on it in biowait().
 */
void
biodone(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();

	/* caller-supplied completion hook, invoked while B_CALL is still set */
	if (bp->b_flags & B_CALL) {
		(*bp->b_iodone)(bp);
	}
	bp->b_flags &= ~B_CALL;

	/* a finished write: no longer dirty, notify via vwakeup() */
	if ((bp->b_flags & (B_READ|B_DIRTY)) == B_DIRTY) {
		bp->b_flags &= ~B_DIRTY;
		vwakeup(bp);
	}

	/* nobody waits for an asynchronous request, so release it here */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	}
	bp->b_flags &= ~B_ASYNC;

	bp->b_flags |= B_DONE;
	wakeup(bp);

	splx(s);
}