/* 
 * Mach Operating System
 * Copyright (c) 1992 Carnegie Mellon University
 * All Rights Reserved.
 * 
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 * 
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 * 
 * Carnegie Mellon requests users of this software to return to
 * 
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 * 
 * any improvements or extensions that they make and grant Carnegie Mellon 
 * the rights to redistribute these changes.
 */
/*
 *	vfs_bio.c,v 1.1.2.2 1993/08/07 01:56:01 cgd Exp
 *
 * (Mach) HISTORY
 * Revision 2.3  92/07/08  16:19:50  mrt
 * 	Added fourth arg, TRUE, to vm_allocate call in getnewbuf.
 * 	[92/07/02            mrt]
 * 
 * Revision 2.2  92/06/25  17:25:40  mrt
 * 	Clear b_resid on release.  No one cares once released.
 *		[ XXX -- this had to be squished; symlinks care -- cgd]
 * 	[92/06/25            rwd]
 * 	Set b_rcred before VOP_STRATEGY calls. [from Jolitz]
 * 
 * Revision 2.1  92/04/21  17:12:36  rwd
 * BSDSS
 */

/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vfs_bio.c	7.40 (Berkeley) 5/8/91
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_param.h>

int debug_bio = 0;
int freebufspace = 0;
int allocbufspace = 0;
extern int bufpages;

static struct  buf *getnewbuf  __P((int));
extern vm_map_t 	buffer_map;

struct buf	*buf;			/* the buffer pool itself */
char		*buffers;
int		nbuf;			/* number of buffer headers */
int		bufpages;	 /* number of memory pages in the buffer pool */
struct buf	*swbuf;		      /* swap I/O headers (XXX go elsewhere?) */
int		nswbuf;
struct bufhd	bufhash[BUFHSZ];	/* heads of hash lists */
struct buf	bfreelist[BQUEUES];	/* heads of available lists */
struct buf	bswlist;		/* head of free swap header list */
struct buf	*bclnlist;		/* head of cleaned page list */

/*
 * Initialize the buffer pool: hash chains, free lists, and the
 * buffer headers themselves.
 */
void
bufinit()
{
    struct bufhd *bh;
    struct buf *bp;

    freebufspace = bufpages * NBPG;

    /* first, make a null hash table */
    for(bh = bufhash; bh < bufhash + BUFHSZ; bh++) {
	bh->b_flags = 0;
	bh->b_forw = (struct buf *)bh;
	bh->b_back = (struct buf *)bh;
    }

    /* next, make a null set of free lists */
    for(bp = bfreelist; bp < bfreelist + BQUEUES; bp++) {
	bp->b_flags = 0;
	bp->av_forw = bp;
	bp->av_back = bp;
	bp->b_forw = bp;
	bp->b_back = bp;

    }

    /* finally, initialize each buffer header and stick on empty q */
    for(bp = buf; bp < buf + nbuf ; bp++) {
	bp->b_flags = B_HEAD | B_INVAL;	/* we're just an empty header */
	bp->b_dev = NODEV;
	bp->b_rcred = bp->b_wcred = NOCRED;
	bp->b_vp = 0;
	binstailfree(bp, bfreelist + BQ_EMPTY);
	binshash(bp, bfreelist + BQ_EMPTY);
    }
}


/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp;
	int rv = 0;

	bp = getblk (vp, blkno, size);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0 || (bp->b_flags & B_INVAL) != 0) {
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (cred != NOCRED && bp->b_rcred == NOCRED) {
			crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		rv = biowait (bp);
	}
	*bpp = bp;
	
	return (rv);
}
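
/*
 * Example usage (illustrative only, not part of the original source):
 * a typical read path fetches a block, checks for an error, and
 * releases the buffer when finished with it.  "vp", "lbn", "bsize",
 * and "dest" are hypothetical.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(bp->b_un.b_addr, dest, (unsigned)bsize);
 *	brelse(bp);
 */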

/*
 * Operates like bread, but also starts I/O on the specified
 * read-ahead block. [See page 55 of Bach's Book]
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; 
	int size;
	daddr_t rablkno; 
	int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rabp;
	int rv = 0, needwait = 0;

	bp = getblk (vp, blkno, size);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0 || (bp->b_flags & B_INVAL) != 0) {
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (cred != NOCRED && bp->b_rcred == NOCRED) {
			crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		needwait++;
	}
	
	rabp = getblk (vp, rablkno, rabsize);

	/* if not found in cache, do some I/O (overlapped with first) */
	if ((rabp->b_flags & B_CACHE) == 0 || (rabp->b_flags & B_INVAL) != 0) {
		rabp->b_flags |= B_READ | B_ASYNC;
		rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (cred != NOCRED && rabp->b_rcred == NOCRED) {
			crhold(cred);
			rabp->b_rcred = cred;
		}
		VOP_STRATEGY(rabp);
	} else
		brelse(rabp);
	
	/* wait for original I/O */
	if (needwait)
		rv = biowait (bp);

	*bpp = bp;
	return (rv);
}
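
/*
 * Example (illustrative): a caller reading sequentially can overlap
 * the transfer of the next logical block with the use of the current
 * one.  "lbn" and "bsize" are hypothetical.
 *
 *	error = breada(vp, lbn, bsize, lbn + 1, bsize, NOCRED, &bp);
 */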

/*
 * Synchronous write.
 * Release buffer on completion.
 */
int
bwrite(bp)
	register struct buf *bp;
{
	int rv;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	} else {
		int wasdelayed;

		if(!(bp->b_flags & B_BUSY))
			panic("bwrite: not busy");

		wasdelayed = bp->b_flags & B_DELWRI;
		bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_ASYNC|B_DELWRI);
		if(wasdelayed) 
			reassignbuf(bp, bp->b_vp);

		bp->b_flags |= B_DIRTY;
		bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
		rv = biowait(bp);
		if (!rv)
			bp->b_flags &= ~B_DIRTY;
		brelse(bp);
		return (rv);
	}
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
void 
bdwrite(bp)
	register struct buf *bp;
{

	if(!(bp->b_flags & B_BUSY))
		panic("bdwrite: not busy");

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	if(bp->b_flags & B_TAPE) {
		bwrite(bp);
		return;
	}
	bp->b_flags &= ~(B_READ|B_DONE);
	bp->b_flags |= B_DIRTY|B_DELWRI;
	reassignbuf(bp, bp->b_vp);
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
void
bawrite(bp)
	register struct buf *bp;
{

	if(!(bp->b_flags & B_BUSY))
		panic("bawrite: not busy");

	if(bp->b_flags & B_INVAL)
		brelse(bp);
	else {
		int wasdelayed;

		wasdelayed = bp->b_flags & B_DELWRI;
		bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
		if(wasdelayed) 
			reassignbuf(bp, bp->b_vp);

		bp->b_flags |= B_DIRTY | B_ASYNC;
		bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
}
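
/*
 * Illustrative sketch (not part of the original source) of choosing
 * among the write routines above; the buffer "bp" is assumed busy
 * and modified, and "whole_block_valid" is hypothetical:
 *
 *	if (whole_block_valid)
 *		bawrite(bp);		(start I/O now, do not wait)
 *	else
 *		bdwrite(bp);		(partial fill; more writes expected)
 *
 * bwrite() would be used instead when the caller must know that the
 * write has completed, e.g. for synchronous metadata updates.
 */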

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
void
brelse(bp)
	register struct buf *bp;
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if ((bfreelist + BQ_AGE)->b_flags & B_WANTED) {
		(bfreelist + BQ_AGE) ->b_flags &= ~B_WANTED;
		wakeup(bfreelist);
	}
	/* anyone need this very block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}

	if (bp->b_flags & (B_INVAL|B_ERROR)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	/* enqueue */
	/* buffers with junk contents */
	if (bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE))
		binsheadfree(bp, bfreelist + BQ_AGE)
	/* buffers with stale but valid contents */
	else if (bp->b_flags & B_AGE)
		binstailfree(bp, bfreelist + BQ_AGE)
	/* buffers with valid and quite possibly reusable contents */
	else
		binstailfree(bp, bfreelist + BQ_LRU)

	/* unlock */
	bp->b_flags &= ~B_BUSY;
	/* bp->b_resid = 0; XXX (cgd) -- actually, we need resid for symlinks */
	splx(x);
}

/*
 * Find a buffer which is available for use.
 * If there is free memory for buffer space and an empty header on the
 * empty list, use that.  Otherwise, select a buffer from a free list,
 * preferring the AGE list to the LRU list.
 */
static struct buf *
getnewbuf(sz)
    int sz;
{
	struct buf *bp;
	int x, allocsize;

	x = splbio();
	allocsize = MAXBSIZE;  /* XXX -- should be round_page(sz) */

start:
	/* can we constitute a new buffer? */
	if (freebufspace >= allocsize
		&& bfreelist[BQ_EMPTY].av_forw != (struct buf *)bfreelist+BQ_EMPTY) {
		caddr_t addr;
		int kr;

#if 0
		if ((kr = vm_allocate(buffer_map, (vm_offset_t *)  &addr, (vm_size_t) allocsize, TRUE))
		    != KERN_SUCCESS) goto tryfree;
#else
		if (! (addr = (caddr_t) kmem_alloc (buffer_map, (vm_size_t) allocsize))) goto tryfree;
#endif
		freebufspace -= allocsize;
		allocbufspace += allocsize;

		bp = bfreelist[BQ_EMPTY].av_forw;
		if (debug_bio) printf("   allocate bp: %x addr: %x (alloc)size: %x\n",bp, addr, allocsize);
		bp->b_flags = B_BUSY | B_INVAL;
		bremfree(bp);
		bp->b_un.b_addr = (caddr_t) addr;
		bp->b_bufsize = allocsize;
		goto fillin;
	}

tryfree:
	if (bfreelist[BQ_AGE].av_forw != (struct buf *)bfreelist+BQ_AGE) {
		bp = bfreelist[BQ_AGE].av_forw;
		bremfree(bp);
	} else if (bfreelist[BQ_LRU].av_forw != (struct buf *)bfreelist+BQ_LRU) {
		bp = bfreelist[BQ_LRU].av_forw;
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		(bfreelist + BQ_AGE)->b_flags |= B_WANTED;
		tsleep(bfreelist, PRIBIO, "getnewbuf", 0);
		splx(x);
		return (0);
	}

	/* if we are a delayed write, convert to an async write! */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}


	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);

	bp->b_flags = B_BUSY;
fillin:
	bremhash(bp);
	splx(x);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	if (bp->b_bufsize != allocsize)
		allocbuf(bp, allocsize);
	bp->b_bcount = sz;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 * This routine must be called at splbio().  If it is not,
 * and you take a disk interrupt while in the while loop,
 * you can loop forever.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bh;
	struct buf *bp;

	bh = BUFHASH(vp, blkno);

	/* Search hash chain */
	bp = bh->b_forw;
	while (bp != (struct buf *) bh) {
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0)
			return (bp);
		bp = bp->b_forw;
	}
	
	return(0);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset.  If it is found in the
 * block cache, mark it as found, make it busy, and return it.
 * Otherwise, return an empty block of the correct size.  It
 * is up to the caller to ensure that cached blocks are of
 * the correct size.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	struct buf *bp, *bh;
	int x;

	x = splbio();
start:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep (bp, PRIBIO, "getblk", 0);
			goto start;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		if (size > bp->b_bufsize)
			panic("getblk: buffer too small"); /* XXX */
		/* if (bp->b_bufsize != size) allocbuf(bp, size); */
	} else {
		splx(x);
		if((bp = getnewbuf(size)) == NULL) {
			x = splbio();
			goto start;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		x = splbio();
		bh = BUFHASH(vp, blkno);
		binshash(bp, bh);
		bp->b_flags = B_BUSY;
	}
	splx(x);
	return (bp);
}
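
/*
 * Example (illustrative): allocating a fresh block for a file.
 * getblk() returns a busy buffer whether or not the block was cached;
 * a caller creating new data typically zeroes the buffer, fills it in,
 * and schedules it for writing.  "vp", "lbn", and "bsize" are
 * hypothetical.
 *
 *	bp = getblk(vp, lbn, bsize);
 *	bzero(bp->b_un.b_addr, (unsigned)bsize);
 *	...fill in the new contents...
 *	bdwrite(bp);
 */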

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;
	int x;

	while ((bp = getnewbuf(size)) == 0)
		;
	x = splbio();
	binshash(bp, bfreelist + BQ_AGE);
	splx(x);

	return (bp);
}

/*
 * Exchange a buffer's underlying storage for storage of a different
 * size, taking care to maintain the contents appropriately.  When the
 * buffer grows, the caller is responsible for filling out the
 * additional contents.  When the buffer shrinks, data is lost, so the
 * caller must first return it to backing store before shrinking the
 * buffer; no implied I/O will be done.
 */
void
allocbuf(bp, size)
	register struct buf *bp;
	int size;
{
	vm_size_t	current_size, desired_size;
	vm_offset_t	new_start;
	int		kr;

	current_size = bp->b_bufsize;
	desired_size = MAXBSIZE; /* XXX (cgd) -- round_page(size) */

	if (current_size < desired_size) {
	    /*
	     * Buffer is growing.
	     * If buffer already has data, allocate new area and copy
	     * old data to it.
	     */
#if 0
	    kr = vm_allocate(buffer_map,
			     &new_start,
			     desired_size,
			     TRUE);
	    if (kr != KERN_SUCCESS)
#else
	    new_start = kmem_alloc (buffer_map, desired_size);
	    if (! new_start)
#endif
		panic("allocbuf: allocate",kr);
	    if (debug_bio) printf(" reallocate bp: %x addr: %x size: %x\n",bp, new_start, desired_size);
	    if (current_size) {
		bcopy(bp->b_un.b_addr,
		      (caddr_t) new_start,
		      bp->b_bufsize);
#if 0
		kr = vm_deallocate(buffer_map,
				   (vm_offset_t)bp->b_un.b_addr,
				   current_size);
		if (kr != KERN_SUCCESS)
		    panic("allocbuf: deallocate",kr);
#else
		kmem_free (buffer_map, (vm_offset_t)bp->b_un.b_addr, current_size);
#endif
		if (debug_bio) printf(" deallocate bp: %x addr: %x size: %x\n",bp, bp->b_un.b_addr,bp->b_bufsize);
	    }
	    bp->b_un.b_addr = (char *)new_start;
	    bp->b_bufsize = desired_size;

	    /* adjust buffer cache's idea of memory allocated to buffer contents */
	    freebufspace -= desired_size - current_size;
	    allocbufspace += desired_size - current_size;
	}

	bp->b_bcount = size;
}
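
/*
 * Example (illustrative): a caller growing a buffer to cover a larger
 * block must fill in the added space itself, since allocbuf() does no
 * I/O.  "newsize" is hypothetical.
 *
 *	if (bp->b_bufsize < newsize)
 *		allocbuf(bp, newsize);
 */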

/*
 * Patiently await operations to complete on this buffer.
 * When they do, extract and return any error associated with the I/O.
 * If the block turns out to be invalid, force it off the lookup hash
 * chains.
 */
int
biowait(bp)
	register struct buf *bp;
{
	int x;

	x = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
/* XXX
 * brelse() already puts buffers with B_ERROR set on the age queue
 *	and I believe it's wrong to let bp->b_error override B_ERROR
 *	but it certainly appears to work OK this way...
 */
			bremhash(bp);
			binshash(bp, bfreelist + BQ_AGE);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(x);
		return (bp->b_error);
	} else {
		splx(x);
		return (0);
	}
}

/*
 * Finish up operations on a buffer: call an optional completion
 * function (if requested), release the buffer if it is marked
 * asynchronous, and then mark the buffer done so that others
 * biowait()'ing for it will be woken up.
 */
void
biodone(bp)
	register struct buf *bp;
{
	int x;

	x = splbio();
	if (bp->b_flags & B_CALL) (*bp->b_iodone)(bp);
	bp->b_flags &=  ~B_CALL;
	if ((bp->b_flags & (B_READ|B_DIRTY)) == B_DIRTY)  {
		bp->b_flags &=  ~B_DIRTY;
		vwakeup(bp);
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	bp->b_flags &=  ~B_ASYNC;
	bp->b_flags |= B_DONE;
	wakeup(bp);
	splx(x);
}
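
/*
 * Example (illustrative): a device driver completes a transfer from
 * its interrupt routine by setting the residual count and calling
 * biodone().  A caller that wants notification without sleeping in
 * biowait() can set B_CALL and b_iodone before queueing the buffer;
 * "mydone" is a hypothetical completion function.
 *
 *	bp->b_flags |= B_CALL;
 *	bp->b_iodone = mydone;
 *	VOP_STRATEGY(bp);
 *
 *	...later, in the driver's interrupt routine...
 *
 *	bp->b_resid = 0;
 *	biodone(bp);
 */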