Source to bsd/vfs/vfs_cluster.c
/*
* Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* The contents of this file constitute Original Code as defined in and
* are subject to the Apple Public Source License Version 1.1 (the
* "License"). You may not use this file except in compliance with the
* License. Please obtain a copy of the License at
* http://www.apple.com/publicsource and read it before using this file.
*
* This Original Code and all software distributed under the License are
* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
* License for the specific language governing rights and limitations
* under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
*/
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <kern/mapfs.h>
#include <sys/kdebug.h>
/*
* Local declarations
*/
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
daddr_t, daddr_t, long, int, long, long));
struct buf *cluster_create __P((struct vnode *, struct buf *, daddr_t, daddr_t, long,
int, long, daddr_t *, int));
int cluster_block __P((struct vnode *, u_quad_t, struct buf *, long, long));
void cluster_wbuild __P((struct vnode *, struct buf *, long,
daddr_t, int, daddr_t, long, int));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
#if DIAGNOSTIC
/*
* Set to 1 if reads of block zero should cause readahead to be done.
* Set to 0 treats a read of block zero as a non-sequential read.
*
* Setting to one assumes that most reads of block zero of files are due to
* sequential passes over the files (e.g. cat, sum) where additional blocks
* will soon be needed. Setting to zero assumes that the majority are
* surgical strikes to get particular info (e.g. size, file) where readahead
* blocks will not be used and, in fact, push out other potentially useful
* blocks from the cache. The former seems intuitive, but some quick tests
* showed that the latter performed better from a system-wide point of view.
*/
int doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
(((blk) != 0 || doclusterraz) && \
((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
/*
* This replaces bread. If this is a bread at the beginning of a file and
* lastr is 0, we assume this is the first read and we'll read up to two
* blocks if they are sequential. After that, we'll do regular read ahead
* in clustered chunks.
*
* There are 4 or 5 cases depending on how you count:
* Desired block is in the cache:
* 1 Not sequential access (0 I/Os).
* 2 Access is sequential, do read-ahead (1 ASYNC).
* Desired block is not in cache:
* 3 Not sequential access (1 SYNC).
* 4 Sequential access, next block is contiguous (1 SYNC).
* 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
*
* There are potentially two buffers that require I/O.
* bp is the block requested.
* rbp is the read-ahead block.
* If either is NULL, then you don't have to do the I/O.
*/
cluster_read(vp, filesize, lblkno, size, cred, bpp, secsize,
firstpass, resid, fp_sequential)
struct vnode *vp;
u_quad_t filesize;
daddr_t lblkno;
long size;
struct ucred *cred;
struct buf **bpp;
long secsize;
int firstpass;
long resid;
int *fp_sequential;
{
struct buf *bp, *rbp, *cbp;
daddr_t blkno, ioblkno;
long flags;
int error, num_ra, alreadyincore;
long num;
int sequential, case4;
int l_maxra;
int l_ralen;
int l_lastr;
#if DIAGNOSTIC
if (size == 0)
panic("cluster_read: size = 0");
#endif
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_START,
lblkno,
resid,
firstpass,
vp,
0);
error = 0;
flags = B_READ;
*bpp = bp = getblk(vp, lblkno, size, 0, 0);
if (resid == PAGE_SIZE && lblkno && !ISSEQREAD(vp, lblkno) &&
(vp->v_mount->mnt_stat.f_iosize & (PAGE_SIZE - 1)) == 0) {
if (bp->b_flags & B_CACHE) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_END,
lblkno,
size,
-1,
0,
0);
vp->v_consumed += (bp->b_bcount/size);
return (0);
}
bp->b_flags |= B_READ;
if (cluster_block(vp, filesize, bp, size, secsize)) {
error = biowait(bp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_END,
bp,
0,
0,
0,
0);
return(error);
}
}
l_maxra = vp->v_maxra;
l_ralen = vp->v_ralen;
l_lastr = vp->v_lastr;
/* round up resid count to nearest block size */
if ( resid > size )
resid += size - 1;
if (bp->b_flags & B_CACHE) {
/*
* Desired block is in cache; do any readahead ASYNC.
* Case 1, 2.
*/
trace(TR_BREADHIT, pack(vp, size), lblkno);
flags |= B_ASYNC;
if (resid > size)
resid -= size;
ioblkno = lblkno + (l_ralen ? l_ralen : 1);
alreadyincore = incore(vp, ioblkno) != NULL;
/*
* treat this as a hit for purposes of speculative I/O around paging activity
*/
vp->v_consumed += (bp->b_bcount/size);
bp = NULL;
} else {
/* Block wasn't in cache, case 3, 4, 5. */
trace(TR_BREADMISS, pack(vp, size), lblkno);
bp->b_flags |= B_READ;
ioblkno = lblkno;
alreadyincore = 0;
current_proc()->p_stats->p_ru.ru_inblock++; /* XXX */
}
/*
* XXX
* Replace 1 with a window size based on some permutation of
* maxcontig and rot_delay. This will let you figure out how
* many blocks you should read-ahead (case 2, 4, 5).
*
* If the access isn't sequential, reset the window to 1.
* Note that a read to the same block is considered sequential.
* This catches the case where the file is being read sequentially,
* but at smaller than the filesystem block size.
*/
rbp = NULL;
cbp = NULL;
case4 = 0;
if (!ISSEQREAD(vp, lblkno)) {
l_ralen = 0;
l_maxra = lblkno;
sequential = 0;
}
else
sequential = 1;
/* On first pass set the sequential state.
* Otherwise, just use the value passed in.
*/
if (firstpass)
*fp_sequential = sequential;
if (resid > size || *fp_sequential) {
if (((u_quad_t)(ioblkno + 1)) * (u_quad_t)size <= filesize && !alreadyincore &&
!(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
blkno != -1) {
/*
* Reading sequentially, and the next block is not in the
* cache. We are going to try reading ahead.
*/
if (num_ra) {
/*
* If our desired readahead block had been read
* in a previous readahead but is no longer in
* core, then we may be reading ahead too far
* or are not using our readahead very rapidly.
* In this case we scale back the window.
*/
if (*fp_sequential) {
if (!alreadyincore && ioblkno <= l_maxra)
l_ralen = max(l_ralen >> 1, 1);
/*
* There are more sequential blocks than our current
* window allows, scale up. Ideally we want to get
* in sync with the filesystem maxcontig value.
*/
else if (num_ra > l_ralen && lblkno != l_lastr)
l_ralen = l_ralen ?
min(num_ra, l_ralen << 1) : 1;
}
num = max((resid/size)-1, l_ralen);
num_ra = min(num, num_ra);
}
if (num_ra) { /* case 2, 4 */
cbp = cluster_rbuild(vp, filesize,
bp, ioblkno, blkno, size, num_ra, flags, secsize);
if (cbp) {
if ( !(cbp->b_flags & B_CALL)) {
if ((rbp = cbp) == bp)
rbp = NULL;
cbp = NULL;
} else
case4 = 1;
}
} else if (ioblkno == lblkno) {
bp->b_blkno = blkno;
/* Case 5: check how many blocks to read ahead */
++ioblkno;
if (((u_quad_t)(ioblkno + 1)) * (u_quad_t)size > filesize ||
incore(vp, ioblkno) || (error = VOP_BMAP(vp,
ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
goto skip_readahead;
/*
* Adjust readahead as above.
* Don't check alreadyincore, we know it is 0 from
* the previous conditional.
*/
if (num_ra) {
if (*fp_sequential) {
if (ioblkno <= l_maxra)
l_ralen = max(l_ralen >> 1, 1);
else if (num_ra > l_ralen && lblkno != l_lastr)
l_ralen = l_ralen ?
min(num_ra, l_ralen<<1) : 1;
}
num = max((resid/size)-1, l_ralen);
num_ra = min(num, num_ra);
}
flags |= B_ASYNC;
if (num_ra) {
cbp = cluster_rbuild(vp, filesize,
NULL, ioblkno, blkno, size, num_ra, flags,
secsize);
if (cbp) {
if ( !(cbp->b_flags & B_CALL)) {
rbp = cbp;
cbp = NULL;
}
}
} else {
rbp = getblk(vp, ioblkno, size, 0, 0);
rbp->b_flags |= flags;
rbp->b_blkno = blkno;
}
} else {
/* case 2; read ahead single block */
rbp = getblk(vp, ioblkno, size, 0, 0);
rbp->b_flags |= flags;
rbp->b_blkno = blkno;
}
if (cbp || rbp) { /* case 2, 5 */
trace(TR_BREADMISSRA,
pack(vp, (num_ra + 1) * size), ioblkno);
current_proc()->p_stats->p_ru.ru_inblock++; /* XXX */
}
}
}
skip_readahead:
if (bp && !case4) {
if (bp->b_flags & (B_DONE | B_DELWRI))
panic("cluster_read: DONE bp");
else {
/*
* issue the BMAP here if needed due to the block device's
* lack of a BMAP call in the strategy routine.... when being
* used by the filesystem/mount code, the blockno's being worked
* with are always physical so the strategy routine doesn't bother.
* Now that we are calling cluster read/write from spec_read/spec_write
* we have to use real logical blockno's in order to properly trigger
* the read-ahead and write-coalescing.
*/
if (bp->b_lblkno == bp->b_blkno) {
VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
if ((long)bp->b_blkno == -1)
clrbuf(bp);
}
error = VOP_STRATEGY(bp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_NONE,
bp->b_lblkno,
bp->b_bcount,
vp,
0xaaaaaaaa, 0 );
}
}
if (rbp) {
if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
rbp->b_flags &= ~(B_ASYNC | B_READ);
brelse(rbp);
} else {
/*
* issue the BMAP here if needed due to the block device's
* lack of a BMAP call in the strategy routine.... when being
* used by the filesystem/mount code, the blockno's being worked
* with are always physical so the strategy routine doesn't bother.
* Now that we are calling cluster read/write from spec_read/spec_write
* we have to use real logical blockno's in order to properly trigger
* the read-ahead and write-coalescing.
*/
if (rbp->b_lblkno == rbp->b_blkno) {
VOP_BMAP(vp, rbp->b_lblkno, NULL, &rbp->b_blkno, NULL);
if ((long)rbp->b_blkno == -1)
clrbuf(rbp);
}
(void) VOP_STRATEGY(rbp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_NONE,
rbp->b_lblkno,
rbp->b_bcount,
vp,
0xaaaaaabb, 0 );
}
}
if (cbp) {
(void) VOP_STRATEGY(cbp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_NONE,
cbp->b_lblkno,
cbp->b_bcount,
vp,
0xaaaaaacc, 0 );
}
/*
* Recalculate our maximum readahead
*/
if (rbp == NULL) {
if (cbp)
rbp = cbp;
else
rbp = bp;
}
if (rbp)
vp->v_maxra = rbp->b_lblkno + (rbp->b_bcount / size) - 1;
else
vp->v_maxra = l_maxra;
vp->v_ralen = l_ralen;
if (bp)
error = biowait(bp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_END,
bp,
rbp,
cbp,
vp->v_maxra,
0);
return(error);
}
struct pent {
int mask;
int num;
} pent[7] = {
{0,0},
{0,0},
{~0,1},
{~1,2},
{~3,4},
{~7,8},
{~15,16}};
int cluster_block(vp, filesize, bp, size, secsize)
struct vnode *vp;
u_quad_t filesize;
struct buf *bp;
long size;
long secsize;
{
struct buf *cbp;
daddr_t lblkno, blkno, ioblkno, lbn;
int num_io, num;
unsigned ratio;
#if 0 /* FIXED READS */
/* calculate maximum number of blocks to read in */
lblkno = bp->b_lblkno & ~0x07; /* put us on a 32k (8 page boundary) boundary */
num = 8;
num_io = 0;
#else /* ADAPTIVE READS */
if (vp->v_bread > vp->v_trigger) {
ratio = (vp->v_consumed*100) / vp->v_bread;
if (ratio < 50 && vp->v_power > 2) {
vp->v_power--;
vp->v_trigger = vp->v_bread + (16 * pent[vp->v_power].num);
} else if (ratio > 75 && vp->v_power < 6) {
vp->v_power++;
vp->v_trigger = vp->v_bread + (16 * pent[vp->v_power].num);
}
}
if ((num = pent[vp->v_power].num) == 1)
return (0);
lblkno = bp->b_lblkno & pent[vp->v_power].mask;
num_io = 0;
#endif
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_START,
lblkno,
num,
vp->v_flag,
vp,
0 );
for (lbn = bp->b_lblkno; lbn > lblkno; lbn--) {
if (incore(vp, lbn - 1))
break;
}
num -= (lbn - lblkno);
for (;;) {
if (VOP_BMAP(vp, lbn, NULL, &blkno, &num_io) || blkno == -1 || num_io == 0) {
if (lbn == bp->b_lblkno) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_END,
-1,
lbn,
blkno,
num_io,
0);
return (0);
}
}
if ((lbn + num_io) >= bp->b_lblkno)
break;
lbn++;
num--;
}
if ((num_io = min(num, num_io + 1)) == 1)
return (0);
if ((u_quad_t)size * ((u_quad_t)(lbn + num_io)) > filesize)
num_io = (filesize - ((u_quad_t)size * (u_quad_t)lbn)) / size;
cbp = cluster_create(vp, bp, lbn, blkno, size, num_io, secsize, &ioblkno, B_AGE);
if (cbp) {
(void) VOP_STRATEGY(cbp);
vp->v_bread += (cbp->b_bcount / size);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_END,
cbp->b_lblkno,
cbp->b_bcount,
vp,
0xaaaaaadd,
0 );
return (1);
}
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_END,
0,
0,
0,
0,
0);
return (0);
}
/*
* generate advisory I/O in as big of chunks as possible
* and then parcel them up into logical blocks in the buffer hash table.
*/
advisory_read(vp, filesize, lblkno, size, runt_size, io_size, secsize)
struct vnode *vp;
u_quad_t filesize;
daddr_t lblkno;
long size;
long runt_size;
long io_size;
long secsize;
{
struct buf *bp, *cbp;
daddr_t blkno, ioblkno;
int error, num_io;
long num;
error = 0;
/* calculate maximum number of blocks to read in */
num = (io_size + (size - 1)) / size;
if ((u_quad_t)size * ((u_quad_t)(lblkno + num)) > filesize) {
if (((u_quad_t)size * (u_quad_t)lblkno) >= filesize)
return(EFBIG);
io_size = filesize - ((u_quad_t)size * (u_quad_t)lblkno);
num = io_size / size;
} else
io_size = num * size;
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_START,
lblkno,
io_size,
num,
vp,
0 );
while (num) {
if (error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_io))
break;
if (blkno == -1) {
lblkno++;
num--;
io_size -= size;
continue;
}
num_io = min(num, num_io + 1);
cbp = cluster_create(vp, NULL, lblkno, blkno, size, num_io, secsize, &ioblkno, 0);
if (cbp) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_NONE,
cbp->b_blkno,
cbp->b_bcount,
vp,
0xaaaaaaee,
0 );
(void) VOP_STRATEGY(cbp);
} else {
if (ioblkno == lblkno) {
error = ENOMEM;
break;
}
}
io_size -= ((ioblkno - lblkno) * size);
num -= ioblkno - lblkno;
lblkno = ioblkno;
}
if (io_size && !error) {
bp = getblk(vp, lblkno, runt_size, 0, 0);
if (bp->b_flags & (B_DONE | B_DELWRI))
brelse(bp);
else {
bp->b_flags |= (B_READ | B_ASYNC);
(void) VOP_STRATEGY(bp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_NONE,
bp->b_blkno,
bp->b_bcount,
vp,
0xaaaaaaff,
0 );
}
io_size -= runt_size;
}
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_END,
lblkno,
io_size,
num,
error,
0);
return(error);
}
/*
* If blocks are contiguous on disk, use this to provide clustered
* read ahead. We will read as many blocks as possible sequentially
* and then parcel them up into logical blocks in the buffer hash table.
*/
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags, secsize)
struct vnode *vp;
u_quad_t filesize;
struct buf *bp;
daddr_t lbn;
daddr_t blkno;
long size;
int run;
long flags;
long secsize;
{
struct cluster_save *b_save;
struct buf *tbp, *cbp;
caddr_t cp;
daddr_t bn;
int i, inc;
#if DIAGNOSTIC
if (size != vp->v_mount->mnt_stat.f_iosize)
panic("cluster_rbuild: size %d != filesize %d\n",
size, vp->v_mount->mnt_stat.f_iosize);
#endif
if ((u_quad_t)size * ((u_quad_t)(lbn + run + 1)) > filesize)
--run;
if (run == 0) {
if (!bp) {
bp = getblk(vp, lbn, size, 0, 0);
bp->b_blkno = blkno;
bp->b_flags |= flags;
}
return(bp);
}
b_save = _MALLOC(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save),
M_SEGMENT, M_NOWAIT);
if (b_save)
cbp = alloc_io_buf(vp);
else
cbp = NULL;
if (b_save == NULL || cbp == NULL) {
if (b_save)
_FREE(b_save, M_SEGMENT);
if (cbp)
free_io_buf(cbp);
return (bp);
}
b_save->bs_bufsize = size;
b_save->bs_nchildren = 0;
b_save->bs_children = (struct buf **)(b_save + 1);
cbp->b_saveaddr = (caddr_t)b_save;
cbp->b_iodone = cluster_callback;
cbp->b_blkno = blkno;
cbp->b_lblkno = lbn;
cbp->b_flags |= flags | B_CALL;
inc = btodb(size, secsize);
cp = (char *)cbp->b_data;
tbp = bp;
for (bn = blkno, i = 0; i <= run; ++i, bn += inc) {
if (tbp == NULL) {
if (incore(vp, lbn + i))
/*
* A component of the cluster is already in core,
* terminate the cluster early.
*/
break;
tbp = getblk(vp, lbn + i, size, 0, 0);
}
pagemove(tbp->b_data, cp, size);
cbp->b_bcount += size;
cbp->b_bufsize += size;
cp += size;
if (bp != tbp)
tbp->b_flags |= flags | B_READ | B_ASYNC;
tbp->b_bufsize -= size;
tbp->b_blkno = bn;
b_save->bs_children[i] = tbp;
b_save->bs_nchildren++;
tbp = NULL;
}
/*
* The cluster may have been terminated early
* If no cluster could be formed, deallocate the cluster save info.
*/
if (i == 0) {
_FREE(b_save, M_SEGMENT);
free_io_buf(cbp);
return(bp);
}
return(cbp);
}
struct buf *
cluster_create(vp, bp, lbn, blkno, size, run, secsize, ioblkno, flags)
struct vnode *vp;
struct buf *bp;
daddr_t lbn;
daddr_t blkno;
long size;
int run;
long secsize;
daddr_t *ioblkno;
int flags;
{
struct cluster_save *b_save;
struct buf *tbp, *cbp;
caddr_t cp;
daddr_t bn;
int i, inc;
inc = btodb(size, secsize);
if (bp == NULL) {
while (run && (tbp = incore(vp, lbn))) {
/*
* if a block is already in core
* and is not busy
* then get and release to freshen it in the LRU
*/
if ( !(tbp->b_flags & B_BUSY)) {
tbp = getblk(vp, lbn, size, 0, 0);
brelse(tbp);
}
lbn++;
run--;
blkno += inc;
}
if (run == 0) {
*ioblkno = lbn;
return (NULL);
}
}
b_save = _MALLOC((sizeof(struct buf *) * run) + sizeof(struct cluster_save), M_SEGMENT, M_NOWAIT);
if (b_save)
cbp = alloc_io_buf(vp);
else
cbp = NULL;
if (b_save == NULL || cbp == NULL) {
if (b_save)
_FREE(b_save, M_SEGMENT);
if (cbp)
free_io_buf(cbp);
*ioblkno = lbn;
return (NULL);
}
b_save->bs_bufsize = size;
b_save->bs_nchildren = 0;
b_save->bs_children = (struct buf **)(b_save + 1);
cbp->b_saveaddr = (caddr_t)b_save;
cbp->b_iodone = cluster_callback;
cbp->b_blkno = blkno;
cbp->b_lblkno = lbn;
cbp->b_flags |= (B_READ | B_ASYNC | B_CALL);
cp = (char *)cbp->b_data;
for (bn = blkno, i = 0; i < run; ++i, bn += inc, ++lbn) {
if (bp && bp->b_lblkno == lbn)
tbp = bp;
else {
if (tbp = incore(vp, lbn)) {
/*
* A component of the cluster is already in core,
* terminate the cluster early.
* if its not busy then also
* get and release to freshen it in the LRU
*/
if ( !(tbp->b_flags & B_BUSY)) {
tbp = getblk(vp, lbn, size, 0, 0);
brelse(tbp);
}
break;
}
tbp = getblk(vp, lbn, size, 0, 0);
}
pagemove(tbp->b_data, cp, size);
tbp->b_bufsize -= size;
tbp->b_blkno = bn;
cbp->b_bcount += size;
cbp->b_bufsize += size;
cp += size;
if (tbp != bp)
tbp->b_flags |= (B_READ | B_ASYNC | flags);
b_save->bs_children[i] = tbp;
b_save->bs_nchildren++;
}
*ioblkno = lbn;
/*
* The cluster may have been terminated early
* If no cluster could be formed, deallocate the cluster save info.
*/
if (cbp->b_bcount == 0) {
_FREE(b_save, M_SEGMENT);
free_io_buf(cbp);
return(NULL);
}
return(cbp);
}
/*
* Cleanup after a clustered read or write.
* This is complicated by the fact that any of the buffers might have
* extra memory (if there were no empty buffer headers at allocbuf time)
* that we will need to shift around.
*/
void
cluster_callback(bp)
struct buf *bp;
{
struct cluster_save *b_save;
struct buf **bpp, *tbp;
long bsize;
int xsize;
int n;
caddr_t cp;
int error = 0;
/*
* Must propogate errors to all the components.
*/
if (bp->b_flags & B_ERROR)
error = bp->b_error;
b_save = (struct cluster_save *)(bp->b_saveaddr);
bsize = b_save->bs_bufsize;
xsize = bp->b_bcount - bp->b_resid;
cp = (char *)bp->b_data;
/*
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
*/
for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
tbp = *bpp;
pagemove(cp, tbp->b_data, bsize);
tbp->b_bufsize += bsize;
n = min(bsize, xsize);
xsize -= n;
if ((tbp->b_bcount = n) == 0)
tbp->b_flags |= B_INVAL;
tbp->b_resid = bsize - n;
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;
}
biodone(tbp);
bp->b_bufsize -= bsize;
cp += bsize;
}
_FREE(b_save, M_SEGMENT);
free_io_buf(bp);
}
/*
* on close, flush out any remaining cluster
*
*/
cluster_close(vp, bsize, secsize)
struct vnode *vp;
int bsize;
long secsize;
{
int cursize;
if (vp->v_clen) {
cursize = vp->v_lastw - vp->v_cstart + 1;
cluster_wbuild(vp, NULL, bsize, vp->v_cstart, cursize, -1, secsize, 0);
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
}
}
/*
* Do clustered write for FFS.
*
* Three cases:
* 1. Write is not sequential (write asynchronously)
* Write is sequential:
* 2. beginning of cluster - begin cluster
* 3. middle of a cluster - add to cluster
* 4. end of a cluster - asynchronously write cluster
*/
cluster_write(bp, filesize, secsize)
struct buf *bp;
u_quad_t filesize;
long secsize;
{
struct vnode *vp;
daddr_t lbn;
daddr_t bn;
int cursize;
int need_commit;
int need_sync;
int bsize;
int error = 0;
need_commit = (bp->b_flags & B_CLUST_COMMIT);
need_sync = (bp->b_flags & B_CLUST_SYNC);
bp->b_flags &= ~(B_CLUST_COMMIT | B_CLUST_SYNC);
vp = bp->b_vp;
bn = bp->b_blkno;
lbn = bp->b_lblkno;
bsize = bp->b_bcount;
if ((bsize & (PAGE_SIZE - 1)) || bsize > MAXBSIZE) {
bp->b_flags |= B_AGE;
bawrite(bp);
return (error);
}
/* Initialize vnode to beginning of file. */
if (lbn == 0)
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
bp->b_lblkno,
bp->b_bcount,
vp,
0,
0);
if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bn != vp->v_lasta + btodb(bsize, secsize)))
{
if (vp->v_clen) {
/*
* Current block is neither logically or physically sequential to last written
*
* If we are not writing at the end of file, or the process
* seeked to another point in the file since its
* last write, then push the previous cluster.
* Otherwise try reallocating to make it sequential.
*/
cursize = vp->v_lastw - vp->v_cstart + 1;
if (((u_quad_t)(lbn + 1)) * (u_quad_t)bsize != filesize || lbn != vp->v_lastw + 1) {
cluster_wbuild(vp, NULL, bsize,
vp->v_cstart, cursize, lbn, secsize, need_sync);
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
buflist = cluster_collectbufs(vp, bp);
if (buflist == NULL) {
cluster_wbuild(vp, NULL, bsize,
vp->v_cstart, cursize, lbn, secsize, need_sync);
} else {
endbp = &buflist->bs_children
[buflist->bs_nchildren - 1];
if (VOP_REALLOCBLKS(vp, buflist)) {
/*
* Failed, push the previous cluster.
*/
for (bpp = buflist->bs_children;
bpp < endbp; bpp++)
brelse(*bpp);
_FREE(buflist, M_SEGMENT);
cluster_wbuild(vp, NULL, bsize,
vp->v_cstart, cursize, lbn, secsize, need_sync);
} else {
/*
* Succeeded, keep building cluster.
* don't bdwrite the last bp, we'll
* first check to see if we now have a full
* cluster, or the caller has requested a SYNC write
*/
for (bpp = buflist->bs_children;
bpp < endbp; bpp++)
bdwrite(*bpp);
_FREE(buflist, M_SEGMENT);
/*
* update the physical block number because,
* VOP_REALLOCBLKS will have changed it
*/
bn = bp->b_blkno;
goto chk_cluster_full;
}
}
}
}
if (need_commit) { /* we're being asked to do IO_SYNC and this is the last */
vp->v_clen = 0; /* chunk of the I/O request, so we can't start a new cluster yet */
if (need_sync)
bwrite(bp);
else
bawrite(bp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
bp->b_lblkno,
bp->b_blkno,
bp->b_bcount,
2,
0 );
} else {
/*
* begin a new cluster... limiting the size to MAXPHYSIO
*/
vp->v_cstart = lbn;
vp->v_clen = (MAXPHYSIO / bsize) - 1;
bdwrite(bp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
bp->b_lblkno,
bp->b_blkno,
bp->b_bcount,
3,
0 );
}
goto check_for_commit;
}
chk_cluster_full:
if ((lbn == vp->v_cstart + vp->v_clen) || need_commit) {
/*
* At end of cluster, write it out.
*/
cluster_wbuild(vp, bp, bsize, vp->v_cstart,
(lbn - vp->v_cstart) + 1, lbn, secsize, need_sync);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
vp->v_cstart,
vp->v_clen + 1,
lbn,
4,
0 );
vp->v_clen = 0;
} else {
/*
* In the middle of a cluster, so just delay the
* I/O for now.
*/
bdwrite(bp);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
bp->b_lblkno,
bp->b_blkno,
vp->v_cstart,
5,
0);
}
check_for_commit:
vp->v_lastw = lbn;
vp->v_lasta = bn;
if (need_commit) {
bp = getblk(vp, lbn, bsize, 0, 0);
if (bp->b_flags & B_ERROR)
error = (bp->b_error ? bp->b_error : EIO);
brelse(bp);
}
return (error);
}
/*
* This is an awful lot like cluster_rbuild...wish they could be combined.
* The last lbn argument is the current block on which I/O is being
* performed. Check to see that it doesn't fall in the middle of
* the current block (if last_bp == NULL).
*/
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn, secsize, need_sync)
struct vnode *vp;
struct buf *last_bp;
long size;
daddr_t start_lbn;
int len;
daddr_t lbn;
long secsize;
int need_sync;
{
struct cluster_save *b_save;
struct buf *bp, *tbp;
caddr_t cp;
int i, s;
#if DIAGNOSTIC
if (size != vp->v_mount->mnt_stat.f_iosize)
panic("cluster_wbuild: size %d != filesize %d\n",
size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
++start_lbn;
--len;
}
/* Get more memory for current buffer */
if (len <= 1) {
if (last_bp) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
last_bp->b_lblkno,
last_bp->b_blkno,
last_bp->b_bcount,
10,
0 );
if (need_sync)
bwrite(last_bp);
else
bawrite(last_bp);
} else if (len) {
bp = getblk(vp, start_lbn, size, 0, 0);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
bp->b_lblkno,
bp->b_blkno,
bp->b_bcount,
11,
0 );
if (bp->b_flags & B_DELWRI) {
if (need_sync)
bwrite(bp);
else
bawrite(bp);
} else
brelse(bp);
}
return;
}
b_save = _MALLOC(sizeof(struct buf *) * len + sizeof(struct cluster_save),
M_SEGMENT, M_NOWAIT);
if (b_save)
bp = alloc_io_buf(vp);
else
bp = NULL;
if (b_save == NULL || bp == NULL) {
if (bp)
free_io_buf(bp);
if (b_save)
_FREE(b_save, M_SEGMENT);
for (i = 0; i < len; ++i, ++start_lbn) {
if (!incore(vp, start_lbn))
continue;
if (last_bp == NULL || start_lbn != lbn) {
tbp = getblk(vp, start_lbn, size, 0, 0);
if (tbp->b_flags & B_DELWRI) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
tbp->b_lblkno,
tbp->b_blkno,
tbp->b_bcount,
12,
0 );
if (need_sync)
bwrite(tbp);
else
bawrite(tbp);
} else
brelse(tbp);
}
}
if (last_bp) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
last_bp->b_lblkno,
last_bp->b_blkno,
last_bp->b_bcount,
13,
0 );
if (need_sync)
bwrite(last_bp);
else
bawrite(last_bp);
}
return;
}
b_save->bs_bufsize = size;
b_save->bs_nchildren = 0;
b_save->bs_children = (struct buf **)(b_save + 1);
bp->b_saveaddr = (caddr_t)b_save;
bp->b_iodone = cluster_callback;
bp->b_flags |= (B_WRITEINPROG | B_CALL | B_ASYNC);
cp = (char *)bp->b_data;
for (start_lbn, i = 0; i < len; ++i, ++start_lbn) {
/*
* Block is not in core or the non-sequential block
* ending our cluster was part of the cluster (in which
* case we don't want to write it twice).
*/
if (!incore(vp, start_lbn) ||
(last_bp == NULL && start_lbn == lbn))
break;
/*
* Get the desired block buffer (unless it is the final
* sequential block whose buffer was passed in explictly
* as last_bp).
*/
if (last_bp == NULL || start_lbn != lbn) {
tbp = getblk(vp, start_lbn, size, 0, 0);
if (!(tbp->b_flags & B_DELWRI)) {
brelse(tbp);
break;
}
} else
tbp = last_bp;
if (i == 0) {
bp->b_blkno = tbp->b_blkno;
bp->b_lblkno= tbp->b_lblkno;
} else {
if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize, secsize))) {
brelse(tbp);
break;
}
}
/* Move memory from children to parent */
pagemove(tbp->b_data, cp, size);
bp->b_bcount += size;
bp->b_bufsize += size;
cp += size;
tbp->b_bufsize -= size;
tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
tbp->b_flags |= (B_ASYNC | B_AGE);
s = splbio();
reassignbuf(tbp, tbp->b_vp); /* put on clean list */
++tbp->b_vp->v_numoutput;
splx(s);
b_save->bs_children[i] = tbp;
b_save->bs_nchildren++;
}
if (i == 0) {
/* None to cluster */
free_io_buf(bp);
_FREE(b_save, M_SEGMENT);
} else {
if (bp->b_bcount > MAXPHYSIO)
panic("cluster_wbuild: bp->b_bcount = %x\n", bp->b_bcount);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
bp->b_lblkno,
bp->b_bcount,
vp,
0xbbbbbbaa,
0 );
VOP_STRATEGY(bp);
}
if (i < len) {
len -= i + 1;
start_lbn += 1;
goto redo;
}
}
/*
* Collect together all the buffers in a cluster.
* Plus add one additional buffer.
*/
struct cluster_save *
cluster_collectbufs(vp, last_bp)
struct vnode *vp;
struct buf *last_bp;
{
struct cluster_save *buflist;
daddr_t lbn;
int i, j, len;
len = vp->v_lastw - vp->v_cstart + 1;
buflist = _MALLOC(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
M_SEGMENT, M_NOWAIT);
if (buflist == NULL)
return (NULL);
buflist->bs_nchildren = 0;
buflist->bs_children = (struct buf **)(buflist + 1);
for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
&buflist->bs_children[i]);
if(!(buflist->bs_children[i]->b_flags & B_DELWRI)) {
for (j=0; j<=i; j++)
brelse(buflist->bs_children[j]);
_FREE(buflist, M_SEGMENT);
return(NULL);
}
}
buflist->bs_children[i] = last_bp;
buflist->bs_nchildren = i + 1;
return (buflist);
}