Annotation of XNU/bsd/vfs/vfs_bio.c, revision 1.1.1.1

1.1       root        1: /*
                      2:  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
                      3:  *
                      4:  * @APPLE_LICENSE_HEADER_START@
                      5:  * 
                      6:  * The contents of this file constitute Original Code as defined in and
                      7:  * are subject to the Apple Public Source License Version 1.1 (the
                      8:  * "License").  You may not use this file except in compliance with the
                      9:  * License.  Please obtain a copy of the License at
                     10:  * http://www.apple.com/publicsource and read it before using this file.
                     11:  * 
                     12:  * This Original Code and all software distributed under the License are
                     13:  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
                     14:  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
                     15:  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
                     16:  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
                     17:  * License for the specific language governing rights and limitations
                     18:  * under the License.
                     19:  * 
                     20:  * @APPLE_LICENSE_HEADER_END@
                     21:  */
                     22: /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
                     23: /*-
                     24:  * Copyright (c) 1994 Christopher G. Demetriou
                     25:  * Copyright (c) 1982, 1986, 1989, 1993
                     26:  *     The Regents of the University of California.  All rights reserved.
                     27:  * (c) UNIX System Laboratories, Inc.
                     28:  * All or some portions of this file are derived from material licensed
                     29:  * to the University of California by American Telephone and Telegraph
                     30:  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
                     31:  * the permission of UNIX System Laboratories, Inc.
                     32:  *
                     33:  * Redistribution and use in source and binary forms, with or without
                     34:  * modification, are permitted provided that the following conditions
                     35:  * are met:
                     36:  * 1. Redistributions of source code must retain the above copyright
                     37:  *    notice, this list of conditions and the following disclaimer.
                     38:  * 2. Redistributions in binary form must reproduce the above copyright
                     39:  *    notice, this list of conditions and the following disclaimer in the
                     40:  *    documentation and/or other materials provided with the distribution.
                     41:  * 3. All advertising materials mentioning features or use of this software
                     42:  *    must display the following acknowledgement:
                     43:  *     This product includes software developed by the University of
                     44:  *     California, Berkeley and its contributors.
                     45:  * 4. Neither the name of the University nor the names of its contributors
                     46:  *    may be used to endorse or promote products derived from this software
                     47:  *    without specific prior written permission.
                     48:  *
                     49:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     50:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     51:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     52:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     53:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     54:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     55:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     56:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     57:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     58:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     59:  * SUCH DAMAGE.
                     60:  *
                     61:  * The NEXTSTEP Software License Agreement specifies the terms
                     62:  * and conditions for redistribution.
                     63:  *
                     64:  *     @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
                     65:  */
                     66: 
                     67: /*
                     68:  * Some references:
                     69:  *     Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
                     70:  *     Leffler, et al.: The Design and Implementation of the 4.3BSD
                     71:  *             UNIX Operating System (Addison Wesley, 1989)
                     72:  */
                     73: /*
                     74:  * HISTORY
                     75:  * 17-July-97  Umesh Vaishampayan ([email protected])
                     76:  *     Eliminated multiple definition of buffers and buf which are defined in
                     77:  *     conf/param.c.
                     78:  *     Eliminated multiple definition of nbuf and bufpages which are defined
                     79:  *     in machdep/XXX/unix_startup.c
                     80:  *
                     81:  * 11-July-97  Umesh Vaishampayan ([email protected])
                     82:  *     Defined global variables for use when tracing is turned on.
                     83:  */
                     84: 
                     85: #include <mach_nbc.h>
                     86: #include <sys/param.h>
                     87: #include <sys/systm.h>
                     88: #include <sys/proc.h>
                     89: #include <sys/buf.h>
                     90: #include <sys/vnode.h>
                     91: #include <sys/mount.h>
                     92: #include <sys/trace.h>
                     93: #include <sys/malloc.h>
                     94: #include <sys/resourcevar.h>
                     95: #include <miscfs/specfs/specdev.h>
                     96: 
                     97: extern void reassignbuf(struct buf *, struct vnode *);
                     98: 
                     99: extern int nbuf;               /* The number of buffer headers */
                    100: extern int niobuf;
                    101: extern struct buf *buf;                /* The buffer headers. */
                    102: extern char    *buffers;       /* The buffer contents. */
                    103: extern int bufpages;           /* Number of memory pages in the buffer pool. */
                    104: struct buf *swbuf;     /* Swap I/O buffer headers. */
                    105: int nswbuf;                    /* Number of swap I/O buffer headers. */
                    106: struct buf bswlist;    /* Head of swap I/O buffer headers free list. */
                    107: struct buf *bclnlist;/* Head of cleaned page list. */
                    108: 
                    109: #if TRACE
                    110: struct proc *traceproc;
                    111: int    tracewhich, tracebuf[TRCSIZ];
                    112: u_int  tracex;
                    113: char   traceflags[TR_NFLAGS];
                    114: #endif /* TRACE */
                    115: 
                    /*
                     * Macros to set, clear and test flag bits.
                     *
                     *      SET(t, f)       turn on the bits of f in t; yields the new value of t
                     *      CLR(t, f)       turn off the bits of f in t; yields the new value of t
                     *      ISSET(t, f)     yield the bits of f that are on in t (non-zero if any)
                     *
                     * Every expansion is parenthesized as a whole, so the macros can be
                     * embedded in a larger expression without the surrounding operators
                     * being absorbed into the assignment's right-hand side.  For SET and
                     * CLR, t must be a modifiable lvalue.
                     */
                    #define SET(t, f)       ((t) |= (f))
                    #define CLR(t, f)       ((t) &= ~(f))
                    #define ISSET(t, f)     ((t) & (f))
                    120: 
                    121: /*
                    122:  * Definitions for the buffer hash lists.
                    123:  */
                    124: #define        BUFHASH(dvp, lbn)       \
                    125:        (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
                    126: LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
                    127: u_long bufhash;
                    128: 
                    129: /*
                    130:  * Insq/Remq for the buffer hash lists.
                    131:  */
                    132: #define        binshash(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_hash)
                    133: #define        bremhash(bp)            LIST_REMOVE(bp, b_hash)
                    134: 
                    135: /*
                    136:  * Definitions for the buffer free lists.
                    137:  */
                    138: #define        BQUEUES         4               /* number of free buffer queues */
                    139: 
                    140: #define        BQ_LOCKED       0               /* super-blocks &c */
                    141: #define        BQ_LRU          1               /* lru, useful buffers */
                    142: #define        BQ_AGE          2               /* rubbish */
                    143: #define        BQ_EMPTY        3               /* buffer headers with no memory */
                    144: 
                    145: TAILQ_HEAD(ioqueue, buf) iobufqueue;
                    146: TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
                    147: int needbuffer;
                    148: 
                    149: /*
                    150:  * Insq/Remq for the buffer free lists.
                    151:  */
                    152: #define        binsheadfree(bp, dp)    do { \
                    153:                                    TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
                    154:                                    (bp)->b_timestamp = time.tv_sec; \
                    155:                                } while (0)
                    156: 
                    157: #define        binstailfree(bp, dp)    do { \
                    158:                                    TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
                    159:                                    (bp)->b_timestamp = time.tv_sec; \
                    160:                                } while (0)
                    161: 
                    162: 
                    /*
                     * Staleness thresholds: the time, in seconds, after which a buf that
                     * has been sitting on a list is considered stale.  The macros are the
                     * compiled-in defaults; the variables hold the values in effect.
                     */
                    #define LRU_IS_STALE    120     /* default for the LRU list */
                    #define AGE_IS_STALE    60      /* default for the AGE list */

                    int lru_is_stale = LRU_IS_STALE;
                    int age_is_stale = AGE_IS_STALE;
                    169: 
                    170: 
                    171: 
                    172: void
                    173: bremfree(bp)
                    174:        struct buf *bp;
                    175: {
                    176:        struct bqueues *dp = NULL;
                    177: 
                    178:        /*
                    179:         * We only calculate the head of the freelist when removing
                    180:         * the last element of the list as that is the only time that
                    181:         * it is needed (e.g. to reset the tail pointer).
                    182:         *
                    183:         * NB: This makes an assumption about how tailq's are implemented.
                    184:         */
                    185:        if (bp->b_freelist.tqe_next == NULL) {
                    186:                for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
                    187:                        if (dp->tqh_last == &bp->b_freelist.tqe_next)
                    188:                                break;
                    189:                if (dp == &bufqueues[BQUEUES])
                    190:                        panic("bremfree: lost tail");
                    191:        }
                    192:        TAILQ_REMOVE(dp, bp, b_freelist);
                    193:        bp->b_timestamp = 0; 
                    194: }
                    195: 
                    196: /*
                    197:  * Initialize buffers and hash links for buffers.
                    198:  */
                    199: void
                    200: bufinit()
                    201: {
                    202:        register struct buf *bp;
                    203:        struct bqueues *dp;
                    204:        register int i;
                    205:        int base, residual;
                    206: 
                    207:        for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
                    208:                TAILQ_INIT(dp);
                    209:        bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
                    210:        base = bufpages / nbuf;
                    211:        residual = bufpages % nbuf;
                    212:        for (i = 0; i < nbuf; i++) {
                    213:                bp = &buf[i];
                    214:                bzero((char *)bp, sizeof *bp);
                    215:                bp->b_dev = NODEV;
                    216:                bp->b_rcred = NOCRED;
                    217:                bp->b_wcred = NOCRED;
                    218:                bp->b_vnbufs.le_next = NOLIST;
                    219:                bp->b_data = buffers + i * MAXBSIZE;
                    220:                if (i < residual)
                    221:                        bp->b_bufsize = (base + 1) * CLBYTES;
                    222:                else
                    223:                        bp->b_bufsize = base * CLBYTES;
                    224:                bp->b_flags = B_INVAL;
                    225:                dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
                    226:                binsheadfree(bp, dp);
                    227:                binshash(bp, &invalhash);
                    228:        }
                    229:        base = (int )(buffers + (i * MAXBSIZE));
                    230: 
                    231:        for (; i < nbuf + niobuf; i++) {
                    232:                bp = &buf[i];
                    233:                bzero((char *)bp, sizeof *bp);
                    234:                bp->b_dev = NODEV;
                    235:                bp->b_rcred = NOCRED;
                    236:                bp->b_wcred = NOCRED;
                    237:                bp->b_vnbufs.le_next = NOLIST;
                    238:                bp->b_data = (char *)base;
                    239:                bp->b_bufsize = 0;
                    240:                bp->b_flags = B_INVAL;
                    241:                binsheadfree(bp, &iobufqueue);
                    242: 
                    243:                base += MAXPHYSIO;
                    244:        }
                    245: }
                    246: 
                    247: __inline struct buf *
                    248: bio_doread(vp, blkno, size, cred, async)
                    249:        struct vnode *vp;
                    250:        daddr_t blkno;
                    251:        int size;
                    252:        struct ucred *cred;
                    253:        int async;
                    254: {
                    255:        register struct buf *bp;
                    256:        struct proc     *p = current_proc();
                    257: 
                    258:        bp = getblk(vp, blkno, size, 0, 0);
                    259: 
                    260:        /*
                    261:         * If buffer does not have data valid, start a read.
                    262:         * Note that if buffer is B_INVAL, getblk() won't return it.
                    263:         * Therefore, it's valid if it's I/O has completed or been delayed.
                    264:         */
                    265:        if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
                    266:                /* Start I/O for the buffer (keeping credentials). */
                    267:                SET(bp->b_flags, B_READ | async);
                    268:                if (cred != NOCRED && bp->b_rcred == NOCRED) {
                    269:                        crhold(cred);
                    270:                        bp->b_rcred = cred;
                    271:                }
                    272:                VOP_STRATEGY(bp);
                    273: 
                    274:                trace(TR_BREADMISS, pack(vp, size), blkno);
                    275: 
                    276:                /* Pay for the read. */
                    277:                if (p && p->p_stats) 
                    278:                        p->p_stats->p_ru.ru_inblock++;          /* XXX */
                    279:        } else if (async) {
                    280:                brelse(bp);
                    281:        }
                    282: 
                    283:        trace(TR_BREADHIT, pack(vp, size), blkno);
                    284: 
                    285:        return (bp);
                    286: }
                    287: 
                    288: /*
                    289:  * Read a disk block.
                    290:  * This algorithm described in Bach (p.54).
                    291:  */
                    292: int
                    293: bread(vp, blkno, size, cred, bpp)
                    294:        struct vnode *vp;
                    295:        daddr_t blkno;
                    296:        int size;
                    297:        struct ucred *cred;
                    298:        struct buf **bpp;
                    299: {
                    300:        register struct buf *bp;
                    301: 
                    302:        /* Get buffer for block. */
                    303:        bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
                    304: 
                    305:        /* Wait for the read to complete, and return result. */
                    306:        return (biowait(bp));
                    307: }
                    308: 
                    309: /*
                    310:  * Read-ahead multiple disk blocks. The first is sync, the rest async.
                    311:  * Trivial modification to the breada algorithm presented in Bach (p.55).
                    312:  */
                    313: int
                    314: breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
                    315:        struct vnode *vp;
                    316:        daddr_t blkno; int size;
                    317:        daddr_t rablks[]; int rasizes[];
                    318:        int nrablks;
                    319:        struct ucred *cred;
                    320:        struct buf **bpp;
                    321: {
                    322:        register struct buf *bp;
                    323:        int i;
                    324: 
                    325:        bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
                    326: 
                    327:        /*
                    328:         * For each of the read-ahead blocks, start a read, if necessary.
                    329:         */
                    330:        for (i = 0; i < nrablks; i++) {
                    331:                /* If it's in the cache, just go on to next one. */
                    332:                if (incore(vp, rablks[i]))
                    333:                        continue;
                    334: 
                    335:                /* Get a buffer for the read-ahead block */
                    336:                (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
                    337:        }
                    338: 
                    339:        /* Otherwise, we had to start a read for it; wait until it's valid. */
                    340:        return (biowait(bp));
                    341: }
                    342: 
                    343: /*
                    344:  * Read with single-block read-ahead.  Defined in Bach (p.55), but
                    345:  * implemented as a call to breadn().
                    346:  * XXX for compatibility with old file systems.
                    347:  */
                    348: int
                    349: breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
                    350:        struct vnode *vp;
                    351:        daddr_t blkno; int size;
                    352:        daddr_t rablkno; int rabsize;
                    353:        struct ucred *cred;
                    354:        struct buf **bpp;
                    355: {
                    356: 
                    357:        return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));     
                    358: }
                    359: 
                    360: /*
                    361:  * Block write.  Described in Bach (p.56)
                    362:  */
                    363: int
                    364: bwrite(bp)
                    365:        struct buf *bp;
                    366: {
                    367:        int rv, sync, wasdelayed;
                    368:        struct proc     *p = current_proc();
                    369: 
                    370:        /* Remember buffer type, to switch on it later. */
                    371:        sync = !ISSET(bp->b_flags, B_ASYNC);
                    372:        wasdelayed = ISSET(bp->b_flags, B_DELWRI);
                    373:        CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
                    374: 
                    375:        if (!sync) {
                    376:                /*
                    377:                 * If not synchronous, pay for the I/O operation and make
                    378:                 * sure the buf is on the correct vnode queue.  We have
                    379:                 * to do this now, because if we don't, the vnode may not
                    380:                 * be properly notified that its I/O has completed.
                    381:                 */
                    382:                if (wasdelayed)
                    383:                        reassignbuf(bp, bp->b_vp);
                    384:                else
                    385:                if (p && p->p_stats) 
                    386:                        p->p_stats->p_ru.ru_oublock++;          /* XXX */
                    387:        }
                    388: 
                    389:        trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
                    390: 
                    391:        /* Initiate disk write.  Make sure the appropriate party is charged. */
                    392:        SET(bp->b_flags, B_WRITEINPROG);
                    393:        bp->b_vp->v_numoutput++;
                    394:        VOP_STRATEGY(bp);
                    395: 
                    396:        if (sync) {
                    397:                /*
                    398:                 * If I/O was synchronous, wait for it to complete.
                    399:                 */
                    400:                rv = biowait(bp);
                    401: 
                    402:                /*
                    403:                 * Pay for the I/O operation, if it's not been paid for, and
                    404:                 * make sure it's on the correct vnode queue. (async operatings
                    405:                 * were payed for above.)
                    406:                 */
                    407:                if (wasdelayed)
                    408:                        reassignbuf(bp, bp->b_vp);
                    409:                else
                    410:                if (p && p->p_stats) 
                    411:                        p->p_stats->p_ru.ru_oublock++;          /* XXX */
                    412: 
                    413:                /* Release the buffer. */
                    414:                brelse(bp);
                    415: 
                    416:                return (rv);
                    417:        } else {
                    418:                return (0);
                    419:        }
                    420: }
                    421: 
                    422: int
                    423: vn_bwrite(ap)
                    424:        struct vop_bwrite_args *ap;
                    425: {
                    426: 
                    427:        return (bwrite(ap->a_bp));
                    428: }
                    429: 
                    430: /*
                    431:  * Delayed write.
                    432:  *
                    433:  * The buffer is marked dirty, but is not queued for I/O.
                    434:  * This routine should be used when the buffer is expected
                    435:  * to be modified again soon, typically a small write that
                    436:  * partially fills a buffer.
                    437:  *
                    438:  * NB: magnetic tapes cannot be delayed; they must be
                    439:  * written in the order that the writes are requested.
                    440:  *
                    441:  * Described in Leffler, et al. (pp. 208-213).
                    442:  */
                    443: void
                    444: bdwrite(bp)
                    445:        struct buf *bp;
                    446: {
                    447:        struct proc *p = current_proc();
                    448: 
                    449:        /*
                    450:         * If the block hasn't been seen before:
                    451:         *      (1) Mark it as having been seen,
                    452:         *      (2) Charge for the write.
                    453:         *      (3) Make sure it's on its vnode's correct block list,
                    454:         */
                    455:        if (!ISSET(bp->b_flags, B_DELWRI)) {
                    456:                SET(bp->b_flags, B_DELWRI);
                    457:                if (p && p->p_stats) 
                    458:                        p->p_stats->p_ru.ru_oublock++;          /* XXX */
                    459:                reassignbuf(bp, bp->b_vp);
                    460:        }
                    461: 
                    462:        /* If this is a tape block, write it the block now. */
                    463:        if (ISSET(bp->b_flags, B_TAPE)) {
                    464:                bwrite(bp);
                    465:                return;
                    466:        }
                    467: 
                    468:        /* Otherwise, the "write" is done, so mark and release the buffer. */
                    469:        SET(bp->b_flags, B_DONE);
                    470:        brelse(bp);
                    471: }
                    472: 
                    473: /*
                    474:  * Asynchronous block write; just an asynchronous bwrite().
                    475:  */
                    476: void
                    477: bawrite(bp)
                    478:        struct buf *bp;
                    479: {
                    480: 
                    481:        SET(bp->b_flags, B_ASYNC);
                    482:        VOP_BWRITE(bp);
                    483: }
                    484: 
                    485: /*
                    486:  * Release a buffer on to the free lists.
                    487:  * Described in Bach (p. 46).
                    488:  */
                    489: void
                    490: brelse(bp)
                    491:        struct buf *bp;
                    492: {
                    493:        struct bqueues *bufq;
                    494:        int s;
                    495: 
                    496:        trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
                    497: 
                    498:        /* Wake up any processes waiting for any buffer to become free. */
                    499:        if (needbuffer) {
                    500:                needbuffer = 0;
                    501:                wakeup(&needbuffer);
                    502:        }
                    503: 
                    504:        /* Wake up any proceeses waiting for _this_ buffer to become free. */
                    505:        if (ISSET(bp->b_flags, B_WANTED)) {
                    506:                CLR(bp->b_flags, B_WANTED);
                    507:                wakeup(bp);
                    508:        }
                    509: 
                    510:        /* Block disk interrupts. */
                    511:        s = splbio();
                    512: 
                    513:        /*
                    514:         * Determine which queue the buffer should be on, then put it there.
                    515:         */
                    516: 
                    517:        /* If it's locked, don't report an error; try again later. */
                    518:        if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
                    519:                CLR(bp->b_flags, B_ERROR);
                    520: 
                    521:        /* If it's not cacheable, or an error, mark it invalid. */
                    522:        if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
                    523:                SET(bp->b_flags, B_INVAL);
                    524: 
                    525:        if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
                    526:                /*
                    527:                 * If it's invalid or empty, dissociate it from its vnode
                    528:                 * and put on the head of the appropriate queue.
                    529:                 */
                    530:                if (bp->b_vp)
                    531:                        brelvp(bp);
                    532:                CLR(bp->b_flags, B_DELWRI);
                    533:                if (bp->b_bufsize <= 0)
                    534:                        /* no data */
                    535:                        bufq = &bufqueues[BQ_EMPTY];
                    536:                else
                    537:                        /* invalid data */
                    538:                        bufq = &bufqueues[BQ_AGE];
                    539:                binsheadfree(bp, bufq);
                    540:        } else {
                    541:                /*
                    542:                 * It has valid data.  Put it on the end of the appropriate
                    543:                 * queue, so that it'll stick around for as long as possible.
                    544:                 */
                    545:                if (ISSET(bp->b_flags, B_LOCKED))
                    546:                        /* locked in core */
                    547:                        bufq = &bufqueues[BQ_LOCKED];
                    548:                else if (ISSET(bp->b_flags, B_AGE))
                    549:                        /* stale but valid data */
                    550:                        bufq = &bufqueues[BQ_AGE];
                    551:                else
                    552:                        /* valid data */
                    553:                        bufq = &bufqueues[BQ_LRU];
                    554:                binstailfree(bp, bufq);
                    555:        }
                    556: 
                    557:        /* Unlock the buffer. */
                    558:        CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
                    559: 
                    560:        /* Allow disk interrupts. */
                    561:        splx(s);
                    562: }
                    563: 
                    564: /*
                    565:  * Determine if a block is in the cache.
                    566:  * Just look on what would be its hash chain.  If it's there, return
                    567:  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
                    568:  * we normally don't return the buffer, unless the caller explicitly
                    569:  * wants us to.
                    570:  */
                    571: struct buf *
                    572: incore(vp, blkno)
                    573:        struct vnode *vp;
                    574:        daddr_t blkno;
                    575: {
                    576:        struct buf *bp;
                    577: 
                    578:        bp = BUFHASH(vp, blkno)->lh_first;
                    579: 
                    580:        /* Search hash chain */
                    581:        for (; bp != NULL; bp = bp->b_hash.le_next) {
                    582:                if (bp->b_lblkno == blkno && bp->b_vp == vp &&
                    583:                    !ISSET(bp->b_flags, B_INVAL))
                    584:                return (bp);
                    585:        }
                    586: 
                    587:        return (0);
                    588: }
                    589: 
                    590: /*
                    591:  * Get a block of requested size that is associated with
                    592:  * a given vnode and block offset. If it is found in the
                    593:  * block cache, mark it as having been found, make it busy
                    594:  * and return it. Otherwise, return an empty block of the
                    595:  * correct size. It is up to the caller to insure that the
                    596:  * cached blocks be of the correct size.
                    597:  */
                    598: struct buf *
                    599: getblk(vp, blkno, size, slpflag, slptimeo)
                    600:        register struct vnode *vp;
                    601:        daddr_t blkno;
                    602:        int size, slpflag, slptimeo;
                    603: {
                    604:        struct buf *bp;
                    605:        int s, err;
                    606: 
                    607: start:
                    608:        s = splbio();
                    609:        if (bp = incore(vp, blkno)) {   /* XXX NFS VOP_BWRITE foolishness */
                    610:                if (ISSET(bp->b_flags, B_BUSY)) {
                    611:                        SET(bp->b_flags, B_WANTED);
                    612:                        err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
                    613:                            slptimeo);
                    614:                        splx(s);
                    615:                        if (err)
                    616:                                return (NULL);
                    617:                        goto start;
                    618:                }
                    619:                SET(bp->b_flags, (B_BUSY | B_CACHE));
                    620:                bremfree(bp);
                    621:                splx(s);
                    622:                allocbuf(bp, size);
                    623:        } else {
                    624:                splx(s);
                    625:                if ((bp = getnewbuf(slpflag, slptimeo)) == NULL)
                    626:                        goto start;
                    627:                binshash(bp, BUFHASH(vp, blkno));
                    628:                allocbuf(bp, size);
                    629:                bp->b_blkno = bp->b_lblkno = blkno;
                    630:                s = splbio();
                    631:                bgetvp(vp, bp);
                    632:                splx(s);
                    633:        }
                    634:        return (bp);
                    635: }
                    636: 
                    637: /*
                    638:  * Get an empty, disassociated buffer of given size.
                    639:  */
                    640: struct buf *
                    641: geteblk(size)
                    642:        int size;
                    643: {
                    644:        struct buf *bp; 
                    645: 
                    646:        while ((bp = getnewbuf(0, 0)) == 0)
                    647:                ;
                    648:        SET(bp->b_flags, B_INVAL);
                    649:        binshash(bp, &invalhash);
                    650:        allocbuf(bp, size);
                    651: 
                    652:        return (bp);
                    653: }
                    654: 
                    655: /*
                    656:  * Expand or contract the actual memory allocated to a buffer.
                    657:  *
                    658:  * If the buffer shrinks, data is lost, so it's up to the
                    659:  * caller to have written it out *first*; this routine will not
                    660:  * start a write.  If the buffer grows, it's the callers
                    661:  * responsibility to fill out the buffer's additional contents.
                    662:  */
                    663: int
                    664: allocbuf(bp, size)
                    665:        struct buf *bp;
                    666:        int size;
                    667: {
                    668:        struct buf      *nbp;
                    669:        vm_size_t       desired_size;
                    670:        int          s;
                    671: 
                    672:        desired_size = roundup(size, CLBYTES);
                    673:        if (desired_size > MAXBSIZE)
                    674:                panic("allocbuf: buffer larger than MAXBSIZE requested");
                    675: 
                    676:        if (bp->b_bufsize == desired_size)
                    677:                goto out;
                    678: 
                    679:        /*
                    680:         * If the buffer is smaller than the desired size, we need to snarf
                    681:         * it from other buffers.  Get buffers (via getnewbuf()), and
                    682:         * steal their pages.
                    683:         */
                    684:        while (bp->b_bufsize < desired_size) {
                    685:                int amt;
                    686: 
                    687:                /* find a buffer */
                    688:                while ((nbp = getnewbuf(0, 0)) == NULL)
                    689:                        ;
                    690:                SET(nbp->b_flags, B_INVAL);
                    691:                binshash(nbp, &invalhash);
                    692: 
                    693:                /* and steal its pages, up to the amount we need */
                    694:                amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
                    695:                pagemove((nbp->b_data + nbp->b_bufsize - amt),
                    696:                        bp->b_data + bp->b_bufsize, amt);
                    697:                bp->b_bufsize += amt;
                    698:                nbp->b_bufsize -= amt;
                    699: 
                    700:                /* reduce transfer count if we stole some data */
                    701:                if (nbp->b_bcount > nbp->b_bufsize)
                    702:                        nbp->b_bcount = nbp->b_bufsize;
                    703: 
                    704: #if DIAGNOSTIC
                    705:                if (nbp->b_bufsize < 0)
                    706:                        panic("allocbuf: negative bufsize");
                    707: #endif
                    708: 
                    709:                brelse(nbp);
                    710:        }
                    711: 
                    712:        /*
                    713:         * If we want a buffer smaller than the current size,
                    714:         * shrink this buffer.  Grab a buf head from the EMPTY queue,
                    715:         * move a page onto it, and put it on front of the AGE queue.
                    716:         * If there are no free buffer headers, leave the buffer alone.
                    717:         */
                    718:        if (bp->b_bufsize > desired_size) {
                    719:                s = splbio();
                    720:                if ((nbp = bufqueues[BQ_EMPTY].tqh_first) == NULL) {
                    721:                        /* No free buffer head */
                    722:                        splx(s);
                    723:                        goto out;
                    724:                }
                    725:                bremfree(nbp);
                    726:                SET(nbp->b_flags, B_BUSY);
                    727:                splx(s);
                    728: 
                    729:                /* move the page to it and note this change */
                    730:                pagemove(bp->b_data + desired_size,
                    731:                    nbp->b_data, bp->b_bufsize - desired_size);
                    732:                nbp->b_bufsize = bp->b_bufsize - desired_size;
                    733:                bp->b_bufsize = desired_size;
                    734:                nbp->b_bcount = 0;
                    735:                SET(nbp->b_flags, B_INVAL);
                    736: 
                    737:                /* release the newly-filled buffer and leave */
                    738:                brelse(nbp);
                    739:        }
                    740: 
                    741: out:
                    742:        bp->b_bcount = size;
                    743: }
                    744: 
                    745: /*
                    746:  * Find a buffer which is available for use.
                    747:  * Select something from a free list.
                    748:  * Preference is to AGE list, then LRU list.    
                    749:  */
                    750: struct buf *
                    751: getnewbuf(slpflag, slptimeo)
                    752:        int slpflag, slptimeo;
                    753: {
                    754:        register struct buf *bp;
                    755:        register struct buf *lru_bp;
                    756:        register struct buf *age_bp;
                    757:        register int age_time, lru_time;
                    758:        int s;
                    759:        struct ucred *cred;
                    760: 
                    761: start:
                    762:        s = splbio();
                    763: 
                    764:        age_bp = bufqueues[BQ_AGE].tqh_first;
                    765:        lru_bp = bufqueues[BQ_LRU].tqh_first;
                    766: 
                    767:        if (age_bp == NULL && lru_bp == NULL) {
                    768:                /* wait for a free buffer of any kind */
                    769:                needbuffer = 1;
                    770:                tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
                    771:                splx(s);
                    772:                return (0);
                    773:        }
                    774:        if (age_bp == NULL)
                    775:                bp = lru_bp;
                    776:        else if (lru_bp == NULL)
                    777:                bp = age_bp;
                    778:        else {
                    779:                if (((age_time = (time.tv_sec - age_bp->b_timestamp)) < 0) ||
                    780:                        ((lru_time = (time.tv_sec - lru_bp->b_timestamp)) < 0)) {
                    781:                        /* time was set backwards */
                    782:                        bp = age_bp;
                    783:                        /*
                    784:                         * we should probably re-timestamp eveything in the queues
                    785:                         * at this point with the current time
                    786:                         */
                    787:                } else {
                    788:                        if (lru_time >= lru_is_stale && age_time < age_is_stale)
                    789:                                bp = lru_bp;
                    790:                        else
                    791:                                bp = age_bp;
                    792:                }
                    793:        }
                    794:        bremfree(bp);
                    795: 
                    796:        /* Buffer is no longer on free lists. */
                    797:        SET(bp->b_flags, B_BUSY);
                    798:        splx(s);
                    799: 
                    800:        /* If buffer was a delayed write, start it, and go back to the top. */
                    801:        if (ISSET(bp->b_flags, B_DELWRI)) {
                    802:                bawrite (bp);
                    803:                goto start;
                    804:        }
                    805: 
                    806:        trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
                    807: 
                    808:        /* disassociate us from our vnode, if we had one... */
                    809:        s = splbio();
                    810:        if (bp->b_vp)
                    811:                brelvp(bp);
                    812:        splx(s);
                    813: 
                    814:        /* clear out various other fields */
                    815:        bp->b_flags = B_BUSY;
                    816:        bp->b_dev = NODEV;
                    817:        bp->b_blkno = bp->b_lblkno = 0;
                    818:        bp->b_iodone = 0;
                    819:        bp->b_error = 0;
                    820:        bp->b_resid = 0;
                    821:        bp->b_bcount = 0;
                    822:        bp->b_dirtyoff = bp->b_dirtyend = 0;
                    823:        bp->b_validoff = bp->b_validend = 0;
                    824: 
                    825:        /* nuke any credentials we were holding */
                    826:        cred = bp->b_rcred;
                    827:        if (cred != NOCRED) {
                    828:                bp->b_rcred = NOCRED; 
                    829:                crfree(cred);
                    830:        }
                    831:        cred = bp->b_wcred;
                    832:        if (cred != NOCRED) {
                    833:                bp->b_wcred = NOCRED;
                    834:                crfree(cred);
                    835:        }
                    836:        
                    837:        bremhash(bp);
                    838:        return (bp); 
                    839: }
                    840: 
                    841: /*
                    842:  * Wait for operations on the buffer to complete.
                    843:  * When they do, extract and return the I/O's error value.
                    844:  */
                    845: int
                    846: biowait(bp)
                    847:        struct buf *bp;
                    848: {
                    849:        int s;
                    850: 
                    851:        s = splbio();
                    852:        while (!ISSET(bp->b_flags, B_DONE))
                    853:                tsleep(bp, PRIBIO + 1, "biowait", 0);
                    854:        splx(s);
                    855: 
                    856:        /* check for interruption of I/O (e.g. via NFS), then errors. */
                    857:        if (ISSET(bp->b_flags, B_EINTR)) {
                    858:                CLR(bp->b_flags, B_EINTR);
                    859:                return (EINTR);
                    860:        } else if (ISSET(bp->b_flags, B_ERROR))
                    861:                return (bp->b_error ? bp->b_error : EIO);
                    862:        else
                    863:                return (0);
                    864: }
                    865: 
                    866: /*
                    867:  * Mark I/O complete on a buffer.
                    868:  *
                    869:  * If a callback has been requested, e.g. the pageout
                    870:  * daemon, do so. Otherwise, awaken waiting processes.
                    871:  *
                    872:  * [ Leffler, et al., says on p.247:
                    873:  *     "This routine wakes up the blocked process, frees the buffer
                    874:  *     for an asynchronous write, or, for a request by the pagedaemon
                    875:  *     process, invokes a procedure specified in the buffer structure" ]
                    876:  *
                    877:  * In real life, the pagedaemon (or other system processes) wants
                    878:  * to do async stuff to, and doesn't want the buffer brelse()'d.
                    879:  * (for swap pager, that puts swap buffers on the free lists (!!!),
                    880:  * for the vn device, that puts malloc'd buffers on the free lists!)
                    881:  */
                    882: void
                    883: biodone(bp)
                    884:        struct buf *bp;
                    885: {
                    886:        boolean_t       funnel_state;
                    887: 
                    888:        funnel_state = thread_set_funneled(TRUE);
                    889:        if (ISSET(bp->b_flags, B_DONE))
                    890:                panic("biodone already");
                    891:        SET(bp->b_flags, B_DONE);               /* note that it's done */
                    892: 
                    893:        if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))  /* wake up reader */
                    894:                vwakeup(bp);
                    895: 
                    896:        if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
                    897:                CLR(bp->b_flags, B_CALL);       /* but note callout done */
                    898:                (*bp->b_iodone)(bp);
                    899:        } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
                    900:                brelse(bp);
                    901:        else {                                  /* or just wakeup the buffer */
                    902:                CLR(bp->b_flags, B_WANTED);
                    903:                wakeup(bp);
                    904:        }
                    905:        (void) thread_set_funneled(funnel_state);
                    906: }
                    907: 
                    908: /*
                    909:  * Return a count of buffers on the "locked" queue.
                    910:  */
                    911: int
                    912: count_lock_queue()
                    913: {
                    914:        register struct buf *bp;
                    915:        register int n = 0;
                    916: 
                    917:        for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
                    918:            bp = bp->b_freelist.tqe_next)
                    919:                n++;
                    920:        return (n);
                    921: }
                    922: 
                    #if MACH_NBC
                    #include <ufs/ufs/quota.h>
                    #include <ufs/ufs/inode.h>

                    /* Divides by the local variable devBlocksize of the enclosing function. */
                    #define        btodevblk(b) ((b) / devBlocksize)

                    /*
                     * blkflush: write out the delayed-write buffers of `vp' that overlap
                     * the block range starting at `blkno' and covering
                     * btodb(size, devBlocksize) blocks.
                     *
                     * devBlocksize starts at 1024 and is then filled in by
                     * VOP_DEVBLOCKSIZE() on the inode's device vnode.  The vnode's dirty
                     * list is scanned; a buffer is skipped if it belongs to another vnode,
                     * is B_INVAL, has a zero b_bcount, or lies outside the range.  For an
                     * overlapping buffer:
                     *   - if it is B_BUSY, B_WANTED is set, the routine sleeps on it and
                     *     the scan restarts from the top;
                     *   - if it is B_DELWRI, it is taken off the free list, marked B_BUSY,
                     *     written with bwrite(), and the scan restarts from the top;
                     *   - otherwise it is left alone.
                     */
                    void
                    blkflush(struct vnode *vp, daddr_t blkno, vm_size_t size)
                    {
                            register struct buf *dbp, *next;
                            daddr_t start, last;
                            int s,err;
                            struct inode *ip= VTOI(vp);
                            int devBlocksize=1024;

                            VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlocksize);

                            start = blkno;
                            last = start + btodb(size, devBlocksize) - 1;

                    rescan:
                            dbp = vp->v_dirtyblkhd.lh_first;
                            while (dbp != NULL) {
                                    /* remember the successor before anything can change dbp */
                                    next = dbp->b_vnbufs.le_next;

                                    if (dbp->b_vp == vp && !ISSET(dbp->b_flags, B_INVAL) &&
                                        dbp->b_bcount != 0 && dbp->b_blkno <= last &&
                                        dbp->b_blkno + btodevblk(dbp->b_bcount) > start) {
                                            /* this buffer overlaps the requested range */
                                            s = splbio();
                                            if (ISSET(dbp->b_flags, B_BUSY)) {
                                                    SET(dbp->b_flags, B_WANTED);
                                                    err = tsleep(dbp, (PRIBIO + 1), "blkflush",
                                                        0);
                                                    splx(s);
                                                    goto rescan;
                                            }
                                            if (ISSET(dbp->b_flags, B_DELWRI)) {
                                                    bremfree(dbp);
                                                    SET(dbp->b_flags, B_BUSY);
                                                    (void) splx(s);
                                                    bwrite(dbp);
                                                    goto rescan;
                                            }
                                            (void) splx(s);
                                    }
                                    dbp = next;
                            }

                    }
                    #endif /* MACH_NBC */
                    #if DIAGNOSTIC
                    /*
                     * vfs_bufstats: print statistics on the current allocation of the
                     * buffer pool.
                     *
                     * For each free queue one line is printed: the queue name, the total
                     * number of buffers on it and, for every buffer size (in CLBYTES
                     * steps) that occurs at least once, the size and how many buffers
                     * have it.  Each queue is walked at splbio.
                     *
                     * Can be enabled to print out on every ``sync'' by setting "syncprt"
                     * in vfs_syscalls.c using sysctl.
                     */
                    void
                    vfs_bufstats()
                    {
                            int s, i, j, count;
                            register struct buf *bp;
                            register struct bqueues *dp;
                            int counts[MAXBSIZE/CLBYTES+1];
                            static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

                            for (i = 0; i < BQUEUES; i++) {
                                    dp = &bufqueues[i];

                                    /* start this queue with an empty histogram */
                                    count = 0;
                                    j = 0;
                                    while (j <= MAXBSIZE/CLBYTES)
                                            counts[j++] = 0;

                                    s = splbio();
                                    bp = dp->tqh_first;
                                    while (bp != NULL) {
                                            counts[bp->b_bufsize/CLBYTES]++;
                                            count++;
                                            bp = bp->b_freelist.tqe_next;
                                    }
                                    splx(s);

                                    printf("%s: total-%d", bname[i], count);
                                    for (j = 0; j <= MAXBSIZE/CLBYTES; j++) {
                                            if (counts[j] == 0)
                                                    continue;
                                            printf(", %d-%d", j * CLBYTES, counts[j]);
                                    }
                                    printf("\n");
                            }
                    }
                    #endif /* DIAGNOSTIC */
                   1006: 
                   1007: 
                   1008: struct buf *
                   1009: alloc_io_buf(vp)
                   1010:        struct vnode *vp;
                   1011: {      register struct buf *bp;
                   1012:        int s;
                   1013: 
                   1014:        s = splbio();
                   1015: 
                   1016:        if ((bp = iobufqueue.tqh_first) == NULL) {
                   1017:                splx(s);
                   1018:                return (NULL);
                   1019:        }
                   1020:        TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
                   1021:        bp->b_timestamp = 0; 
                   1022: 
                   1023:        /* clear out various fields */
                   1024:        bp->b_flags = (B_BUSY | B_RAW);
                   1025:        bp->b_blkno = bp->b_lblkno = 0;
                   1026:        bp->b_iodone = 0;
                   1027:        bp->b_error = 0;
                   1028:        bp->b_resid = 0;
                   1029:        bp->b_bcount = 0;
                   1030:        bp->b_bufsize = 0;
                   1031:        bp->b_vp = vp;
                   1032: 
                   1033:        if (vp->v_type == VBLK || vp->v_type == VCHR)
                   1034:                bp->b_dev = vp->v_rdev;
                   1035:        else
                   1036:                bp->b_dev = NODEV;
                   1037:        splx(s);
                   1038: 
                   1039:        return (bp);
                   1040: }
                   1041: 
                   1042: void
                   1043: free_io_buf(bp)
                   1044:        struct buf *bp;
                   1045: {
                   1046:         int s;
                   1047: 
                   1048:        s = splbio();
                   1049:        /*
                   1050:         * put buffer back on the head of the iobufqueue
                   1051:         */
                   1052:        bp->b_vp = NULL;
                   1053:        bp->b_flags = B_INVAL;
                   1054: 
                   1055:        binsheadfree(bp, &iobufqueue);
                   1056: 
                   1057:        splx(s);
                   1058: }

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.