|
|
1.1 root 1: /*
2: * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3: *
4: * @APPLE_LICENSE_HEADER_START@
5: *
6: * The contents of this file constitute Original Code as defined in and
7: * are subject to the Apple Public Source License Version 1.1 (the
8: * "License"). You may not use this file except in compliance with the
9: * License. Please obtain a copy of the License at
10: * http://www.apple.com/publicsource and read it before using this file.
11: *
12: * This Original Code and all software distributed under the License are
13: * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14: * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15: * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16: * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17: * License for the specific language governing rights and limitations
18: * under the License.
19: *
20: * @APPLE_LICENSE_HEADER_END@
21: */
22: /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23: /*
24: * Copyright (c) 1993
25: * The Regents of the University of California. All rights reserved.
26: *
27: * Redistribution and use in source and binary forms, with or without
28: * modification, are permitted provided that the following conditions
29: * are met:
30: * 1. Redistributions of source code must retain the above copyright
31: * notice, this list of conditions and the following disclaimer.
32: * 2. Redistributions in binary form must reproduce the above copyright
33: * notice, this list of conditions and the following disclaimer in the
34: * documentation and/or other materials provided with the distribution.
35: * 3. All advertising materials mentioning features or use of this software
36: * must display the following acknowledgement:
37: * This product includes software developed by the University of
38: * California, Berkeley and its contributors.
39: * 4. Neither the name of the University nor the names of its contributors
40: * may be used to endorse or promote products derived from this software
41: * without specific prior written permission.
42: *
43: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53: * SUCH DAMAGE.
54: *
55: * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56: */
57:
58: #include <sys/param.h>
59: #include <sys/proc.h>
60: #include <sys/buf.h>
61: #include <sys/vnode.h>
62: #include <sys/mount.h>
63: #include <sys/trace.h>
64: #include <sys/malloc.h>
65: #include <sys/resourcevar.h>
66: #include <libkern/libkern.h>
67: #include <kern/mapfs.h>
68:
69: #include <sys/kdebug.h>
70:
71: /*
72: * Local declarations
73: */
74: struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
75: daddr_t, daddr_t, long, int, long, long));
76: struct buf *cluster_create __P((struct vnode *, struct buf *, daddr_t, daddr_t, long,
77: int, long, daddr_t *, int));
78: int cluster_block __P((struct vnode *, u_quad_t, struct buf *, long, long));
79: void cluster_wbuild __P((struct vnode *, struct buf *, long,
80: daddr_t, int, daddr_t, long, int));
81: struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
82:
/*
 * ISSEQREAD(vp, blk) is true when a read of logical block `blk' continues
 * the previous read on `vp': it targets either the block recorded in
 * v_lastr or the block immediately following it.
 *
 * A read of block zero is normally never considered sequential.  DIAGNOSTIC
 * kernels expose the doclusterraz tunable to change that:
 *
 *	1 - reads of block zero may trigger read-ahead.  This assumes most
 *	    reads of block zero start a sequential pass over the file
 *	    (e.g. cat, sum), so the following blocks will be wanted soon.
 *	0 - a read of block zero is treated as non-sequential.  This assumes
 *	    most such reads are surgical strikes for a particular piece of
 *	    information (e.g. size, file), where read-ahead blocks would go
 *	    unused and push potentially useful blocks out of the cache.
 *
 * The first assumption seems intuitive, but some quick tests showed that
 * the second performed better from a system-wide point of view.
 *
 * Note that both arguments are evaluated more than once.
 */
#if DIAGNOSTIC
int doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	((doclusterraz || (blk) != 0) && \
	    ((blk) == (vp)->v_lastr || (blk) == (vp)->v_lastr + 1))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && \
	    ((blk) == (vp)->v_lastr || (blk) == (vp)->v_lastr + 1))
#endif
104:
105: /*
106: * This replaces bread. If this is a bread at the beginning of a file and
107: * lastr is 0, we assume this is the first read and we'll read up to two
108: * blocks if they are sequential. After that, we'll do regular read ahead
109: * in clustered chunks.
110: *
111: * There are 4 or 5 cases depending on how you count:
112: * Desired block is in the cache:
113: * 1 Not sequential access (0 I/Os).
114: * 2 Access is sequential, do read-ahead (1 ASYNC).
115: * Desired block is not in cache:
116: * 3 Not sequential access (1 SYNC).
117: * 4 Sequential access, next block is contiguous (1 SYNC).
118: * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
119: *
120: * There are potentially two buffers that require I/O.
121: * bp is the block requested.
122: * rbp is the read-ahead block.
123: * If either is NULL, then you don't have to do the I/O.
124: */
125:
126: cluster_read(vp, filesize, lblkno, size, cred, bpp, secsize,
127: firstpass, resid, fp_sequential)
128: struct vnode *vp;
129: u_quad_t filesize;
130: daddr_t lblkno;
131: long size;
132: struct ucred *cred;
133: struct buf **bpp;
134: long secsize;
135: int firstpass;
136: long resid;
137: int *fp_sequential;
138: {
139: struct buf *bp, *rbp, *cbp;
140: daddr_t blkno, ioblkno;
141: long flags;
142: int error, num_ra, alreadyincore;
143: long num;
144: int sequential, case4;
145: int l_maxra;
146: int l_ralen;
147: int l_lastr;
148:
149: #if DIAGNOSTIC
150: if (size == 0)
151: panic("cluster_read: size = 0");
152: #endif
153:
154: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_START,
155: lblkno,
156: resid,
157: firstpass,
158: vp,
159: 0);
160: error = 0;
161: flags = B_READ;
162: *bpp = bp = getblk(vp, lblkno, size, 0, 0);
163:
164: if (resid == PAGE_SIZE && lblkno && !ISSEQREAD(vp, lblkno) &&
165: (vp->v_mount->mnt_stat.f_iosize & (PAGE_SIZE - 1)) == 0) {
166: if (bp->b_flags & B_CACHE) {
167:
168: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_END,
169: lblkno,
170: size,
171: -1,
172: 0,
173: 0);
174:
175: vp->v_consumed += (bp->b_bcount/size);
176: return (0);
177: }
178: bp->b_flags |= B_READ;
179:
180: if (cluster_block(vp, filesize, bp, size, secsize)) {
181:
182: error = biowait(bp);
183:
184: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_END,
185: bp,
186: 0,
187: 0,
188: 0,
189: 0);
190:
191: return(error);
192: }
193: }
194: l_maxra = vp->v_maxra;
195: l_ralen = vp->v_ralen;
196: l_lastr = vp->v_lastr;
197:
198: /* round up resid count to nearest block size */
199: if ( resid > size )
200: resid += size - 1;
201:
202: if (bp->b_flags & B_CACHE) {
203: /*
204: * Desired block is in cache; do any readahead ASYNC.
205: * Case 1, 2.
206: */
207: trace(TR_BREADHIT, pack(vp, size), lblkno);
208: flags |= B_ASYNC;
209: if (resid > size)
210: resid -= size;
211:
212: ioblkno = lblkno + (l_ralen ? l_ralen : 1);
213: alreadyincore = incore(vp, ioblkno) != NULL;
214:
215: /*
216: * treat this as a hit for purposes of speculative I/O around paging activity
217: */
218: vp->v_consumed += (bp->b_bcount/size);
219:
220: bp = NULL;
221: } else {
222: /* Block wasn't in cache, case 3, 4, 5. */
223: trace(TR_BREADMISS, pack(vp, size), lblkno);
224: bp->b_flags |= B_READ;
225: ioblkno = lblkno;
226: alreadyincore = 0;
227: current_proc()->p_stats->p_ru.ru_inblock++; /* XXX */
228: }
229: /*
230: * XXX
231: * Replace 1 with a window size based on some permutation of
232: * maxcontig and rot_delay. This will let you figure out how
233: * many blocks you should read-ahead (case 2, 4, 5).
234: *
235: * If the access isn't sequential, reset the window to 1.
236: * Note that a read to the same block is considered sequential.
237: * This catches the case where the file is being read sequentially,
238: * but at smaller than the filesystem block size.
239: */
240: rbp = NULL;
241: cbp = NULL;
242: case4 = 0;
243:
244: if (!ISSEQREAD(vp, lblkno)) {
245: l_ralen = 0;
246: l_maxra = lblkno;
247: sequential = 0;
248: }
249: else
250: sequential = 1;
251:
252: /* On first pass set the sequential state.
253: * Otherwise, just use the value passed in.
254: */
255: if (firstpass)
256: *fp_sequential = sequential;
257:
258: if (resid > size || *fp_sequential) {
259: if (((u_quad_t)(ioblkno + 1)) * (u_quad_t)size <= filesize && !alreadyincore &&
260: !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
261: blkno != -1) {
262: /*
263: * Reading sequentially, and the next block is not in the
264: * cache. We are going to try reading ahead.
265: */
266: if (num_ra) {
267: /*
268: * If our desired readahead block had been read
269: * in a previous readahead but is no longer in
270: * core, then we may be reading ahead too far
271: * or are not using our readahead very rapidly.
272: * In this case we scale back the window.
273: */
274: if (*fp_sequential) {
275: if (!alreadyincore && ioblkno <= l_maxra)
276: l_ralen = max(l_ralen >> 1, 1);
277: /*
278: * There are more sequential blocks than our current
279: * window allows, scale up. Ideally we want to get
280: * in sync with the filesystem maxcontig value.
281: */
282: else if (num_ra > l_ralen && lblkno != l_lastr)
283: l_ralen = l_ralen ?
284: min(num_ra, l_ralen << 1) : 1;
285: }
286: num = max((resid/size)-1, l_ralen);
287: num_ra = min(num, num_ra);
288: }
289:
290: if (num_ra) { /* case 2, 4 */
291: cbp = cluster_rbuild(vp, filesize,
292: bp, ioblkno, blkno, size, num_ra, flags, secsize);
293:
294: if (cbp) {
295: if ( !(cbp->b_flags & B_CALL)) {
296: if ((rbp = cbp) == bp)
297: rbp = NULL;
298: cbp = NULL;
299: } else
300: case4 = 1;
301: }
302: } else if (ioblkno == lblkno) {
303: bp->b_blkno = blkno;
304: /* Case 5: check how many blocks to read ahead */
305: ++ioblkno;
306: if (((u_quad_t)(ioblkno + 1)) * (u_quad_t)size > filesize ||
307: incore(vp, ioblkno) || (error = VOP_BMAP(vp,
308: ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
309: goto skip_readahead;
310: /*
311: * Adjust readahead as above.
312: * Don't check alreadyincore, we know it is 0 from
313: * the previous conditional.
314: */
315: if (num_ra) {
316: if (*fp_sequential) {
317: if (ioblkno <= l_maxra)
318: l_ralen = max(l_ralen >> 1, 1);
319: else if (num_ra > l_ralen && lblkno != l_lastr)
320: l_ralen = l_ralen ?
321: min(num_ra, l_ralen<<1) : 1;
322: }
323: num = max((resid/size)-1, l_ralen);
324: num_ra = min(num, num_ra);
325: }
326: flags |= B_ASYNC;
327:
328: if (num_ra) {
329: cbp = cluster_rbuild(vp, filesize,
330: NULL, ioblkno, blkno, size, num_ra, flags,
331: secsize);
332: if (cbp) {
333: if ( !(cbp->b_flags & B_CALL)) {
334: rbp = cbp;
335: cbp = NULL;
336: }
337: }
338: } else {
339: rbp = getblk(vp, ioblkno, size, 0, 0);
340: rbp->b_flags |= flags;
341: rbp->b_blkno = blkno;
342: }
343: } else {
344: /* case 2; read ahead single block */
345: rbp = getblk(vp, ioblkno, size, 0, 0);
346: rbp->b_flags |= flags;
347: rbp->b_blkno = blkno;
348: }
349: if (cbp || rbp) { /* case 2, 5 */
350: trace(TR_BREADMISSRA,
351: pack(vp, (num_ra + 1) * size), ioblkno);
352: current_proc()->p_stats->p_ru.ru_inblock++; /* XXX */
353: }
354: }
355: }
356:
357: skip_readahead:
358: if (bp && !case4) {
359: if (bp->b_flags & (B_DONE | B_DELWRI))
360: panic("cluster_read: DONE bp");
361: else {
362: /*
363: * issue the BMAP here if needed due to the block device's
364: * lack of a BMAP call in the strategy routine.... when being
365: * used by the filesystem/mount code, the blockno's being worked
366: * with are always physical so the strategy routine doesn't bother.
367: * Now that we are calling cluster read/write from spec_read/spec_write
368: * we have to use real logical blockno's in order to properly trigger
369: * the read-ahead and write-coalescing.
370: */
371: if (bp->b_lblkno == bp->b_blkno) {
372: VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
373:
374: if ((long)bp->b_blkno == -1)
375: clrbuf(bp);
376: }
377: error = VOP_STRATEGY(bp);
378:
379: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_NONE,
380: bp->b_lblkno,
381: bp->b_bcount,
382: vp,
383: 0xaaaaaaaa, 0 );
384: }
385: }
386: if (rbp) {
387: if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
388: rbp->b_flags &= ~(B_ASYNC | B_READ);
389: brelse(rbp);
390: } else {
391: /*
392: * issue the BMAP here if needed due to the block device's
393: * lack of a BMAP call in the strategy routine.... when being
394: * used by the filesystem/mount code, the blockno's being worked
395: * with are always physical so the strategy routine doesn't bother.
396: * Now that we are calling cluster read/write from spec_read/spec_write
397: * we have to use real logical blockno's in order to properly trigger
398: * the read-ahead and write-coalescing.
399: */
400: if (rbp->b_lblkno == rbp->b_blkno) {
401: VOP_BMAP(vp, rbp->b_lblkno, NULL, &rbp->b_blkno, NULL);
402:
403: if ((long)rbp->b_blkno == -1)
404: clrbuf(rbp);
405: }
406: (void) VOP_STRATEGY(rbp);
407:
408: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_NONE,
409: rbp->b_lblkno,
410: rbp->b_bcount,
411: vp,
412: 0xaaaaaabb, 0 );
413: }
414: }
415: if (cbp) {
416: (void) VOP_STRATEGY(cbp);
417:
418: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_NONE,
419: cbp->b_lblkno,
420: cbp->b_bcount,
421: vp,
422: 0xaaaaaacc, 0 );
423: }
424: /*
425: * Recalculate our maximum readahead
426: */
427: if (rbp == NULL) {
428: if (cbp)
429: rbp = cbp;
430: else
431: rbp = bp;
432: }
433: if (rbp)
434: vp->v_maxra = rbp->b_lblkno + (rbp->b_bcount / size) - 1;
435: else
436: vp->v_maxra = l_maxra;
437: vp->v_ralen = l_ralen;
438:
439: if (bp)
440: error = biowait(bp);
441:
442: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 11)) | DBG_FUNC_END,
443: bp,
444: rbp,
445: cbp,
446: vp->v_maxra,
447: 0);
448: return(error);
449: }
450:
/*
 * Cluster-size table indexed by a vnode's v_power (see cluster_block()):
 *	num	number of blocks in a cluster at that power
 *	mask	AND-ed with a logical block number to find the first block
 *		of the cluster that contains it
 * Entries 0 and 1 are all-zero placeholders.
 */
struct pent {
	int	mask;
	int	num;
} pent[7] = {
	{ 0,		0 },	/* power 0 */
	{ 0,		0 },	/* power 1 */
	{ ~0x0,		1 },	/* power 2: single block */
	{ ~0x1,		2 },	/* power 3 */
	{ ~0x3,		4 },	/* power 4 */
	{ ~0x7,		8 },	/* power 5 */
	{ ~0xf,		16 }	/* power 6 */
};
462:
463:
464: int cluster_block(vp, filesize, bp, size, secsize)
465: struct vnode *vp;
466: u_quad_t filesize;
467: struct buf *bp;
468: long size;
469: long secsize;
470: {
471: struct buf *cbp;
472: daddr_t lblkno, blkno, ioblkno, lbn;
473: int num_io, num;
474: unsigned ratio;
475:
476: #if 0 /* FIXED READS */
477: /* calculate maximum number of blocks to read in */
478:
479: lblkno = bp->b_lblkno & ~0x07; /* put us on a 32k (8 page boundary) boundary */
480: num = 8;
481: num_io = 0;
482: #else /* ADAPTIVE READS */
483: if (vp->v_bread > vp->v_trigger) {
484: ratio = (vp->v_consumed*100) / vp->v_bread;
485:
486: if (ratio < 50 && vp->v_power > 2) {
487: vp->v_power--;
488: vp->v_trigger = vp->v_bread + (16 * pent[vp->v_power].num);
489: } else if (ratio > 75 && vp->v_power < 6) {
490: vp->v_power++;
491: vp->v_trigger = vp->v_bread + (16 * pent[vp->v_power].num);
492: }
493: }
494: if ((num = pent[vp->v_power].num) == 1)
495: return (0);
496: lblkno = bp->b_lblkno & pent[vp->v_power].mask;
497: num_io = 0;
498: #endif
499:
500: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_START,
501: lblkno,
502: num,
503: vp->v_flag,
504: vp,
505: 0 );
506:
507: for (lbn = bp->b_lblkno; lbn > lblkno; lbn--) {
508: if (incore(vp, lbn - 1))
509: break;
510: }
511: num -= (lbn - lblkno);
512:
513: for (;;) {
514: if (VOP_BMAP(vp, lbn, NULL, &blkno, &num_io) || blkno == -1 || num_io == 0) {
515: if (lbn == bp->b_lblkno) {
516: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_END,
517: -1,
518: lbn,
519: blkno,
520: num_io,
521: 0);
522: return (0);
523: }
524: }
525: if ((lbn + num_io) >= bp->b_lblkno)
526: break;
527: lbn++;
528: num--;
529: }
530: if ((num_io = min(num, num_io + 1)) == 1)
531: return (0);
532:
533: if ((u_quad_t)size * ((u_quad_t)(lbn + num_io)) > filesize)
534: num_io = (filesize - ((u_quad_t)size * (u_quad_t)lbn)) / size;
535:
536: cbp = cluster_create(vp, bp, lbn, blkno, size, num_io, secsize, &ioblkno, B_AGE);
537:
538: if (cbp) {
539: (void) VOP_STRATEGY(cbp);
540: vp->v_bread += (cbp->b_bcount / size);
541: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_END,
542: cbp->b_lblkno,
543: cbp->b_bcount,
544: vp,
545: 0xaaaaaadd,
546: 0 );
547:
548: return (1);
549: }
550: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 14)) | DBG_FUNC_END,
551: 0,
552: 0,
553: 0,
554: 0,
555: 0);
556: return (0);
557: }
558:
559:
560: /*
561: * generate advisory I/O in as big of chunks as possible
562: * and then parcel them up into logical blocks in the buffer hash table.
563: */
564: advisory_read(vp, filesize, lblkno, size, runt_size, io_size, secsize)
565: struct vnode *vp;
566: u_quad_t filesize;
567: daddr_t lblkno;
568: long size;
569: long runt_size;
570: long io_size;
571: long secsize;
572: {
573: struct buf *bp, *cbp;
574: daddr_t blkno, ioblkno;
575: int error, num_io;
576: long num;
577:
578: error = 0;
579:
580: /* calculate maximum number of blocks to read in */
581:
582: num = (io_size + (size - 1)) / size;
583:
584: if ((u_quad_t)size * ((u_quad_t)(lblkno + num)) > filesize) {
585: if (((u_quad_t)size * (u_quad_t)lblkno) >= filesize)
586: return(EFBIG);
587: io_size = filesize - ((u_quad_t)size * (u_quad_t)lblkno);
588:
589: num = io_size / size;
590: } else
591: io_size = num * size;
592:
593: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_START,
594: lblkno,
595: io_size,
596: num,
597: vp,
598: 0 );
599:
600: while (num) {
601: if (error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_io))
602: break;
603:
604: if (blkno == -1) {
605: lblkno++;
606: num--;
607: io_size -= size;
608: continue;
609: }
610: num_io = min(num, num_io + 1);
611:
612: cbp = cluster_create(vp, NULL, lblkno, blkno, size, num_io, secsize, &ioblkno, 0);
613:
614: if (cbp) {
615: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_NONE,
616: cbp->b_blkno,
617: cbp->b_bcount,
618: vp,
619: 0xaaaaaaee,
620: 0 );
621:
622: (void) VOP_STRATEGY(cbp);
623: } else {
624: if (ioblkno == lblkno) {
625: error = ENOMEM;
626: break;
627: }
628: }
629: io_size -= ((ioblkno - lblkno) * size);
630: num -= ioblkno - lblkno;
631: lblkno = ioblkno;
632: }
633: if (io_size && !error) {
634: bp = getblk(vp, lblkno, runt_size, 0, 0);
635:
636: if (bp->b_flags & (B_DONE | B_DELWRI))
637: brelse(bp);
638: else {
639: bp->b_flags |= (B_READ | B_ASYNC);
640:
641: (void) VOP_STRATEGY(bp);
642:
643: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_NONE,
644: bp->b_blkno,
645: bp->b_bcount,
646: vp,
647: 0xaaaaaaff,
648: 0 );
649: }
650: io_size -= runt_size;
651: }
652:
653: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 13)) | DBG_FUNC_END,
654: lblkno,
655: io_size,
656: num,
657: error,
658: 0);
659: return(error);
660: }
661:
662:
663: /*
664: * If blocks are contiguous on disk, use this to provide clustered
665: * read ahead. We will read as many blocks as possible sequentially
666: * and then parcel them up into logical blocks in the buffer hash table.
667: */
668: struct buf *
669: cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags, secsize)
670: struct vnode *vp;
671: u_quad_t filesize;
672: struct buf *bp;
673: daddr_t lbn;
674: daddr_t blkno;
675: long size;
676: int run;
677: long flags;
678: long secsize;
679: {
680: struct cluster_save *b_save;
681: struct buf *tbp, *cbp;
682: caddr_t cp;
683: daddr_t bn;
684: int i, inc;
685:
686: #if DIAGNOSTIC
687: if (size != vp->v_mount->mnt_stat.f_iosize)
688: panic("cluster_rbuild: size %d != filesize %d\n",
689: size, vp->v_mount->mnt_stat.f_iosize);
690: #endif
691: if ((u_quad_t)size * ((u_quad_t)(lbn + run + 1)) > filesize)
692: --run;
693: if (run == 0) {
694: if (!bp) {
695: bp = getblk(vp, lbn, size, 0, 0);
696: bp->b_blkno = blkno;
697: bp->b_flags |= flags;
698: }
699: return(bp);
700: }
701: b_save = _MALLOC(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save),
702: M_SEGMENT, M_NOWAIT);
703:
704: if (b_save)
705: cbp = alloc_io_buf(vp);
706: else
707: cbp = NULL;
708:
709: if (b_save == NULL || cbp == NULL) {
710: if (b_save)
711: _FREE(b_save, M_SEGMENT);
712: if (cbp)
713: free_io_buf(cbp);
714: return (bp);
715: }
716: b_save->bs_bufsize = size;
717: b_save->bs_nchildren = 0;
718: b_save->bs_children = (struct buf **)(b_save + 1);
719:
720: cbp->b_saveaddr = (caddr_t)b_save;
721: cbp->b_iodone = cluster_callback;
722: cbp->b_blkno = blkno;
723: cbp->b_lblkno = lbn;
724: cbp->b_flags |= flags | B_CALL;
725:
726: inc = btodb(size, secsize);
727: cp = (char *)cbp->b_data;
728: tbp = bp;
729:
730: for (bn = blkno, i = 0; i <= run; ++i, bn += inc) {
731: if (tbp == NULL) {
732: if (incore(vp, lbn + i))
733: /*
734: * A component of the cluster is already in core,
735: * terminate the cluster early.
736: */
737: break;
738: tbp = getblk(vp, lbn + i, size, 0, 0);
739: }
740: pagemove(tbp->b_data, cp, size);
741: cbp->b_bcount += size;
742: cbp->b_bufsize += size;
743: cp += size;
744:
745: if (bp != tbp)
746: tbp->b_flags |= flags | B_READ | B_ASYNC;
747: tbp->b_bufsize -= size;
748: tbp->b_blkno = bn;
749:
750: b_save->bs_children[i] = tbp;
751: b_save->bs_nchildren++;
752:
753: tbp = NULL;
754: }
755: /*
756: * The cluster may have been terminated early
757: * If no cluster could be formed, deallocate the cluster save info.
758: */
759: if (i == 0) {
760: _FREE(b_save, M_SEGMENT);
761: free_io_buf(cbp);
762: return(bp);
763: }
764: return(cbp);
765: }
766:
767:
768:
769: struct buf *
770: cluster_create(vp, bp, lbn, blkno, size, run, secsize, ioblkno, flags)
771: struct vnode *vp;
772: struct buf *bp;
773: daddr_t lbn;
774: daddr_t blkno;
775: long size;
776: int run;
777: long secsize;
778: daddr_t *ioblkno;
779: int flags;
780: {
781: struct cluster_save *b_save;
782: struct buf *tbp, *cbp;
783: caddr_t cp;
784: daddr_t bn;
785: int i, inc;
786:
787: inc = btodb(size, secsize);
788:
789: if (bp == NULL) {
790: while (run && (tbp = incore(vp, lbn))) {
791: /*
792: * if a block is already in core
793: * and is not busy
794: * then get and release to freshen it in the LRU
795: */
796: if ( !(tbp->b_flags & B_BUSY)) {
797: tbp = getblk(vp, lbn, size, 0, 0);
798: brelse(tbp);
799: }
800: lbn++;
801: run--;
802: blkno += inc;
803: }
804: if (run == 0) {
805: *ioblkno = lbn;
806: return (NULL);
807: }
808: }
809: b_save = _MALLOC((sizeof(struct buf *) * run) + sizeof(struct cluster_save), M_SEGMENT, M_NOWAIT);
810:
811: if (b_save)
812: cbp = alloc_io_buf(vp);
813: else
814: cbp = NULL;
815:
816: if (b_save == NULL || cbp == NULL) {
817: if (b_save)
818: _FREE(b_save, M_SEGMENT);
819: if (cbp)
820: free_io_buf(cbp);
821: *ioblkno = lbn;
822:
823: return (NULL);
824: }
825: b_save->bs_bufsize = size;
826: b_save->bs_nchildren = 0;
827: b_save->bs_children = (struct buf **)(b_save + 1);
828:
829: cbp->b_saveaddr = (caddr_t)b_save;
830: cbp->b_iodone = cluster_callback;
831: cbp->b_blkno = blkno;
832: cbp->b_lblkno = lbn;
833: cbp->b_flags |= (B_READ | B_ASYNC | B_CALL);
834:
835: cp = (char *)cbp->b_data;
836:
837: for (bn = blkno, i = 0; i < run; ++i, bn += inc, ++lbn) {
838: if (bp && bp->b_lblkno == lbn)
839: tbp = bp;
840: else {
841: if (tbp = incore(vp, lbn)) {
842: /*
843: * A component of the cluster is already in core,
844: * terminate the cluster early.
845: * if its not busy then also
846: * get and release to freshen it in the LRU
847: */
848: if ( !(tbp->b_flags & B_BUSY)) {
849: tbp = getblk(vp, lbn, size, 0, 0);
850: brelse(tbp);
851: }
852: break;
853: }
854: tbp = getblk(vp, lbn, size, 0, 0);
855: }
856: pagemove(tbp->b_data, cp, size);
857:
858: tbp->b_bufsize -= size;
859: tbp->b_blkno = bn;
860: cbp->b_bcount += size;
861: cbp->b_bufsize += size;
862: cp += size;
863:
864: if (tbp != bp)
865: tbp->b_flags |= (B_READ | B_ASYNC | flags);
866: b_save->bs_children[i] = tbp;
867: b_save->bs_nchildren++;
868: }
869: *ioblkno = lbn;
870: /*
871: * The cluster may have been terminated early
872: * If no cluster could be formed, deallocate the cluster save info.
873: */
874: if (cbp->b_bcount == 0) {
875: _FREE(b_save, M_SEGMENT);
876: free_io_buf(cbp);
877: return(NULL);
878: }
879: return(cbp);
880: }
881:
882:
883: /*
884: * Cleanup after a clustered read or write.
885: * This is complicated by the fact that any of the buffers might have
886: * extra memory (if there were no empty buffer headers at allocbuf time)
887: * that we will need to shift around.
888: */
889: void
890: cluster_callback(bp)
891: struct buf *bp;
892: {
893: struct cluster_save *b_save;
894: struct buf **bpp, *tbp;
895: long bsize;
896: int xsize;
897: int n;
898: caddr_t cp;
899: int error = 0;
900:
901: /*
902: * Must propogate errors to all the components.
903: */
904: if (bp->b_flags & B_ERROR)
905: error = bp->b_error;
906: b_save = (struct cluster_save *)(bp->b_saveaddr);
907:
908: bsize = b_save->bs_bufsize;
909: xsize = bp->b_bcount - bp->b_resid;
910: cp = (char *)bp->b_data;
911: /*
912: * Move memory from the large cluster buffer into the component
913: * buffers and mark IO as done on these.
914: */
915: for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
916: tbp = *bpp;
917: pagemove(cp, tbp->b_data, bsize);
918: tbp->b_bufsize += bsize;
919:
920: n = min(bsize, xsize);
921: xsize -= n;
922:
923: if ((tbp->b_bcount = n) == 0)
924: tbp->b_flags |= B_INVAL;
925: tbp->b_resid = bsize - n;
926:
927: if (error) {
928: tbp->b_flags |= B_ERROR;
929: tbp->b_error = error;
930: }
931: biodone(tbp);
932: bp->b_bufsize -= bsize;
933: cp += bsize;
934: }
935: _FREE(b_save, M_SEGMENT);
936:
937: free_io_buf(bp);
938: }
939:
940:
941: /*
942: * on close, flush out any remaining cluster
943: *
944: */
945: cluster_close(vp, bsize, secsize)
946: struct vnode *vp;
947: int bsize;
948: long secsize;
949: {
950: int cursize;
951:
952: if (vp->v_clen) {
953: cursize = vp->v_lastw - vp->v_cstart + 1;
954:
955: cluster_wbuild(vp, NULL, bsize, vp->v_cstart, cursize, -1, secsize, 0);
956:
957: vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
958: }
959: }
960:
961:
962: /*
963: * Do clustered write for FFS.
964: *
965: * Three cases:
966: * 1. Write is not sequential (write asynchronously)
967: * Write is sequential:
968: * 2. beginning of cluster - begin cluster
969: * 3. middle of a cluster - add to cluster
970: * 4. end of a cluster - asynchronously write cluster
971: */
972:
973: cluster_write(bp, filesize, secsize)
974: struct buf *bp;
975: u_quad_t filesize;
976: long secsize;
977: {
978: struct vnode *vp;
979: daddr_t lbn;
980: daddr_t bn;
981: int cursize;
982: int need_commit;
983: int need_sync;
984: int bsize;
985: int error = 0;
986:
987: need_commit = (bp->b_flags & B_CLUST_COMMIT);
988: need_sync = (bp->b_flags & B_CLUST_SYNC);
989: bp->b_flags &= ~(B_CLUST_COMMIT | B_CLUST_SYNC);
990:
991: vp = bp->b_vp;
992: bn = bp->b_blkno;
993: lbn = bp->b_lblkno;
994: bsize = bp->b_bcount;
995:
996: if ((bsize & (PAGE_SIZE - 1)) || bsize > MAXBSIZE) {
997: bp->b_flags |= B_AGE;
998: bawrite(bp);
999:
1000: return (error);
1001: }
1002: /* Initialize vnode to beginning of file. */
1003: if (lbn == 0)
1004: vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
1005:
1006:
1007: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
1008: bp->b_lblkno,
1009: bp->b_bcount,
1010: vp,
1011: 0,
1012: 0);
1013:
1014: if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bn != vp->v_lasta + btodb(bsize, secsize)))
1015: {
1016: if (vp->v_clen) {
1017: /*
1018: * Current block is neither logically or physically sequential to last written
1019: *
1020: * If we are not writing at the end of file, or the process
1021: * seeked to another point in the file since its
1022: * last write, then push the previous cluster.
1023: * Otherwise try reallocating to make it sequential.
1024: */
1025: cursize = vp->v_lastw - vp->v_cstart + 1;
1026:
1027: if (((u_quad_t)(lbn + 1)) * (u_quad_t)bsize != filesize || lbn != vp->v_lastw + 1) {
1028: cluster_wbuild(vp, NULL, bsize,
1029: vp->v_cstart, cursize, lbn, secsize, need_sync);
1030: } else {
1031: struct buf **bpp, **endbp;
1032: struct cluster_save *buflist;
1033:
1034: buflist = cluster_collectbufs(vp, bp);
1035:
1036: if (buflist == NULL) {
1037: cluster_wbuild(vp, NULL, bsize,
1038: vp->v_cstart, cursize, lbn, secsize, need_sync);
1039: } else {
1040:
1041: endbp = &buflist->bs_children
1042: [buflist->bs_nchildren - 1];
1043: if (VOP_REALLOCBLKS(vp, buflist)) {
1044: /*
1045: * Failed, push the previous cluster.
1046: */
1047: for (bpp = buflist->bs_children;
1048: bpp < endbp; bpp++)
1049: brelse(*bpp);
1050: _FREE(buflist, M_SEGMENT);
1051:
1052: cluster_wbuild(vp, NULL, bsize,
1053: vp->v_cstart, cursize, lbn, secsize, need_sync);
1054: } else {
1055: /*
1056: * Succeeded, keep building cluster.
1057: * don't bdwrite the last bp, we'll
1058: * first check to see if we now have a full
1059: * cluster, or the caller has requested a SYNC write
1060: */
1061: for (bpp = buflist->bs_children;
1062: bpp < endbp; bpp++)
1063: bdwrite(*bpp);
1064: _FREE(buflist, M_SEGMENT);
1065: /*
1066: * update the physical block number because,
1067: * VOP_REALLOCBLKS will have changed it
1068: */
1069: bn = bp->b_blkno;
1070: goto chk_cluster_full;
1071: }
1072: }
1073: }
1074: }
1075: if (need_commit) { /* we're being asked to do IO_SYNC and this is the last */
1076: vp->v_clen = 0; /* chunk of the I/O request, so we can't start a new cluster yet */
1077:
1078: if (need_sync)
1079: bwrite(bp);
1080: else
1081: bawrite(bp);
1082:
1083: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
1084: bp->b_lblkno,
1085: bp->b_blkno,
1086: bp->b_bcount,
1087: 2,
1088: 0 );
1089: } else {
1090: /*
1091: * begin a new cluster... limiting the size to MAXPHYSIO
1092: */
1093: vp->v_cstart = lbn;
1094: vp->v_clen = (MAXPHYSIO / bsize) - 1;
1095:
1096: bdwrite(bp);
1097:
1098: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
1099: bp->b_lblkno,
1100: bp->b_blkno,
1101: bp->b_bcount,
1102: 3,
1103: 0 );
1104: }
1105: goto check_for_commit;
1106: }
1107: chk_cluster_full:
1108: if ((lbn == vp->v_cstart + vp->v_clen) || need_commit) {
1109: /*
1110: * At end of cluster, write it out.
1111: */
1112: cluster_wbuild(vp, bp, bsize, vp->v_cstart,
1113: (lbn - vp->v_cstart) + 1, lbn, secsize, need_sync);
1114:
1115: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
1116: vp->v_cstart,
1117: vp->v_clen + 1,
1118: lbn,
1119: 4,
1120: 0 );
1121: vp->v_clen = 0;
1122: } else {
1123: /*
1124: * In the middle of a cluster, so just delay the
1125: * I/O for now.
1126: */
1127: bdwrite(bp);
1128:
1129: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
1130: bp->b_lblkno,
1131: bp->b_blkno,
1132: vp->v_cstart,
1133: 5,
1134: 0);
1135: }
1136: check_for_commit:
1137: vp->v_lastw = lbn;
1138: vp->v_lasta = bn;
1139:
1140: if (need_commit) {
1141: bp = getblk(vp, lbn, bsize, 0, 0);
1142:
1143: if (bp->b_flags & B_ERROR)
1144: error = (bp->b_error ? bp->b_error : EIO);
1145: brelse(bp);
1146: }
1147: return (error);
1148: }
1149:
1150:
1151: /*
1152: * This is an awful lot like cluster_rbuild...wish they could be combined.
1153: * The last lbn argument is the current block on which I/O is being
1154: * performed. Check to see that it doesn't fall in the middle of
1155: * the current block (if last_bp == NULL).
1156: */
1157: void
1158: cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn, secsize, need_sync)
1159: struct vnode *vp;
1160: struct buf *last_bp;
1161: long size;
1162: daddr_t start_lbn;
1163: int len;
1164: daddr_t lbn;
1165: long secsize;
1166: int need_sync;
1167: {
1168: struct cluster_save *b_save;
1169: struct buf *bp, *tbp;
1170: caddr_t cp;
1171: int i, s;
1172:
1173: #if DIAGNOSTIC
1174: if (size != vp->v_mount->mnt_stat.f_iosize)
1175: panic("cluster_wbuild: size %d != filesize %d\n",
1176: size, vp->v_mount->mnt_stat.f_iosize);
1177: #endif
1178: redo:
1179: while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
1180: ++start_lbn;
1181: --len;
1182: }
1183: /* Get more memory for current buffer */
1184: if (len <= 1) {
1185: if (last_bp) {
1186: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
1187: last_bp->b_lblkno,
1188: last_bp->b_blkno,
1189: last_bp->b_bcount,
1190: 10,
1191: 0 );
1192: if (need_sync)
1193: bwrite(last_bp);
1194: else
1195: bawrite(last_bp);
1196: } else if (len) {
1197: bp = getblk(vp, start_lbn, size, 0, 0);
1198:
1199: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
1200: bp->b_lblkno,
1201: bp->b_blkno,
1202: bp->b_bcount,
1203: 11,
1204: 0 );
1205: if (bp->b_flags & B_DELWRI) {
1206: if (need_sync)
1207: bwrite(bp);
1208: else
1209: bawrite(bp);
1210: } else
1211: brelse(bp);
1212: }
1213: return;
1214: }
1215: b_save = _MALLOC(sizeof(struct buf *) * len + sizeof(struct cluster_save),
1216: M_SEGMENT, M_NOWAIT);
1217: if (b_save)
1218: bp = alloc_io_buf(vp);
1219: else
1220: bp = NULL;
1221:
1222: if (b_save == NULL || bp == NULL) {
1223: if (bp)
1224: free_io_buf(bp);
1225: if (b_save)
1226: _FREE(b_save, M_SEGMENT);
1227:
1228: for (i = 0; i < len; ++i, ++start_lbn) {
1229: if (!incore(vp, start_lbn))
1230: continue;
1231: if (last_bp == NULL || start_lbn != lbn) {
1232: tbp = getblk(vp, start_lbn, size, 0, 0);
1233:
1234: if (tbp->b_flags & B_DELWRI) {
1235: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
1236: tbp->b_lblkno,
1237: tbp->b_blkno,
1238: tbp->b_bcount,
1239: 12,
1240: 0 );
1241:
1242: if (need_sync)
1243: bwrite(tbp);
1244: else
1245: bawrite(tbp);
1246: } else
1247: brelse(tbp);
1248: }
1249: }
1250: if (last_bp) {
1251: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
1252: last_bp->b_lblkno,
1253: last_bp->b_blkno,
1254: last_bp->b_bcount,
1255: 13,
1256: 0 );
1257: if (need_sync)
1258: bwrite(last_bp);
1259: else
1260: bawrite(last_bp);
1261: }
1262: return;
1263: }
1264: b_save->bs_bufsize = size;
1265: b_save->bs_nchildren = 0;
1266: b_save->bs_children = (struct buf **)(b_save + 1);
1267:
1268: bp->b_saveaddr = (caddr_t)b_save;
1269: bp->b_iodone = cluster_callback;
1270: bp->b_flags |= (B_WRITEINPROG | B_CALL | B_ASYNC);
1271:
1272: cp = (char *)bp->b_data;
1273:
1274: for (start_lbn, i = 0; i < len; ++i, ++start_lbn) {
1275: /*
1276: * Block is not in core or the non-sequential block
1277: * ending our cluster was part of the cluster (in which
1278: * case we don't want to write it twice).
1279: */
1280: if (!incore(vp, start_lbn) ||
1281: (last_bp == NULL && start_lbn == lbn))
1282: break;
1283:
1284: /*
1285: * Get the desired block buffer (unless it is the final
1286: * sequential block whose buffer was passed in explictly
1287: * as last_bp).
1288: */
1289: if (last_bp == NULL || start_lbn != lbn) {
1290: tbp = getblk(vp, start_lbn, size, 0, 0);
1291: if (!(tbp->b_flags & B_DELWRI)) {
1292: brelse(tbp);
1293: break;
1294: }
1295: } else
1296: tbp = last_bp;
1297:
1298: if (i == 0) {
1299: bp->b_blkno = tbp->b_blkno;
1300: bp->b_lblkno= tbp->b_lblkno;
1301: } else {
1302: if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize, secsize))) {
1303: brelse(tbp);
1304: break;
1305: }
1306: }
1307: /* Move memory from children to parent */
1308: pagemove(tbp->b_data, cp, size);
1309: bp->b_bcount += size;
1310: bp->b_bufsize += size;
1311: cp += size;
1312:
1313: tbp->b_bufsize -= size;
1314: tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
1315: tbp->b_flags |= (B_ASYNC | B_AGE);
1316:
1317: s = splbio();
1318: reassignbuf(tbp, tbp->b_vp); /* put on clean list */
1319: ++tbp->b_vp->v_numoutput;
1320: splx(s);
1321:
1322: b_save->bs_children[i] = tbp;
1323: b_save->bs_nchildren++;
1324: }
1325:
1326: if (i == 0) {
1327: /* None to cluster */
1328: free_io_buf(bp);
1329: _FREE(b_save, M_SEGMENT);
1330: } else {
1331: if (bp->b_bcount > MAXPHYSIO)
1332: panic("cluster_wbuild: bp->b_bcount = %x\n", bp->b_bcount);
1333:
1334: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_NONE,
1335: bp->b_lblkno,
1336: bp->b_bcount,
1337: vp,
1338: 0xbbbbbbaa,
1339: 0 );
1340: VOP_STRATEGY(bp);
1341: }
1342: if (i < len) {
1343: len -= i + 1;
1344: start_lbn += 1;
1345: goto redo;
1346: }
1347: }
1348:
1349: /*
1350: * Collect together all the buffers in a cluster.
1351: * Plus add one additional buffer.
1352: */
1353: struct cluster_save *
1354: cluster_collectbufs(vp, last_bp)
1355: struct vnode *vp;
1356: struct buf *last_bp;
1357: {
1358: struct cluster_save *buflist;
1359: daddr_t lbn;
1360: int i, j, len;
1361:
1362: len = vp->v_lastw - vp->v_cstart + 1;
1363: buflist = _MALLOC(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
1364: M_SEGMENT, M_NOWAIT);
1365:
1366: if (buflist == NULL)
1367: return (NULL);
1368:
1369: buflist->bs_nchildren = 0;
1370: buflist->bs_children = (struct buf **)(buflist + 1);
1371: for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
1372: (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
1373: &buflist->bs_children[i]);
1374: if(!(buflist->bs_children[i]->b_flags & B_DELWRI)) {
1375: for (j=0; j<=i; j++)
1376: brelse(buflist->bs_children[j]);
1377: _FREE(buflist, M_SEGMENT);
1378: return(NULL);
1379: }
1380: }
1381: buflist->bs_children[i] = last_bp;
1382: buflist->bs_nchildren = i + 1;
1383: return (buflist);
1384: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.