|
|
1.1 root 1: #include <u.h>
2: #include <libc.h>
3: #include <libg.h>
4: #include "/sys/include/gnot.h"
5: /*
6: * Compiling bitblt - for each call of gbitblt, generate
7: * machine code for the specific arguments passed in,
8: * then execute that code as a subroutine call.
9: *
10: * The files bb?.h (where ? is replaced by the
11: * architectures single-letter code: v, k, etc., depending
12: * on which of Tmips, Tsparc, etc., is defined) define macros
13: * which define operations on an abstract machine.
14: * If there is no architecture-specific file, bbc.h is used,
15: * which interprets a program with the same operations as
16: * the assumed abstract machine. See bbc.h for a description
17: * of the machine's registers and operations, and for the
18: * macros that must be defined to make a new bb?.h
19: *
20: * The code will work for bitblts to and from any bitmaps
21: * with ldepths 0, 1, 2, or 3. Some of those conversions
22: * may be #ifdef'd out, because no Plan 9 code currently
23: * uses them, and the tables take up space. The converting
24: * bitblts will need work if they are to work with ldepths > 3.
25: *
26: * This file also contains a thorough bitblt tester.
27: * When TEST is defined, a main program is created to
28: * try forward and backward cases for all bitblt opcodes,
29: * first on single pixels, and then on parts of the middle two rows
30: * of random 4-row bitmaps. If anything fails, it prints information
31: * about the case that failed (or perhaps it will just die,
32: * if bad code has been generated).
33: * The testing program takes as arguments two numbers: the source
34: * and destination ldepths (both 0 by default). A -s flag says to
35: * use simpler, repeatable tests. A -i num flag says how many
36: * iterations to do for each opcode.
37: */
38:
39: /* Bitblt cases:
40: * - bitmap overlap sometimes dictates that you go forward (f) through
41: * the bitmaps, sometimes backwards (b)
42: * - different relative alignments of the source and destination
43: * starting points within a word require different code:
44: * the bit offsets may be the same (e), the source may start
45: * later in the word (g), or the source may start earlier in the word (l)
46: * - when a row of the destination is all within one word (o), better
47: * code can be generated
48: * - when the object machine has bitfield extraction/insertion instructions,
49: * it is better to do < 32bit wide bitblts using them (bf)
50: * - if the source and destination bitmaps have different depths,
51: * it is either an expansion (exp) or a contraction (con) of pixels.
52: * These two cases aren't further differentiated into f vs. b, etc.
53: */
54:
55: /*
56: * To calculate the potential size of the bitblt program, use the following
57: * formulas:
58: * nonconverting max (bshg): X + 10L + 6S + 2F + 3E + 8LX + 3SX
59: * converting max (contracting by factor of 8, 32-bit memory accesses):
60: * X + XT + 46L + 48S + 96T + 64A + 32AX + 2F + 3E + 30LX + 48SX
61: *
62: * where X = Initsd+Extrainit+Iloop+Oloop+Rts ; XT = Inittab
63: * E = Emitop ; F = Field
64: * L = load, fetch or store ; LX = extra if pre or post decrement
65: * S = shift (sha or shb) ; SX = extra if OR too
66: * T = Table ; A = Assemble ; AX = Assemblex
67: */
/*
 * Categories of bitblt program.  In every group of three the
 * members must appear in the order e, l, g: the l and g variants
 * are selected by adding a fixed offset to the e member.
 */
enum
{
	/* forward through the bitmaps */
	Tfshe	= 0,
	Tfshl	= 1,
	Tfshg	= 2,

	/* backward through the bitmaps */
	Tbshe	= 3,
	Tbshl	= 4,
	Tbshg	= 5,

	/* a destination row fits in one word */
	Toshe	= 6,
	Toshl	= 7,
	Toshg	= 8,

	/* bitfield extraction/insertion instructions */
	Tobf	= 9,

	/* expansion of pixels */
	Texp	= 10,

	/* contraction of pixels */
	Tcon	= 11,

	/* total number of cases */
	Tlast	= 12,
};
90:
#ifdef TEST
/*
 * State shared between gbitblt and the tester.
 * FORCEFORW and FORCEBAKW override gbitblt's own choice of
 * direction; the cur* variables and mem are filled in by
 * gbitblt on every call with its arguments and the address
 * of the program it generated.
 */
int		FORCEFORW;	/* nonzero: go forward */
int		FORCEBAKW;	/* nonzero: go backward */
GBitmap		*curdm;		/* destination bitmap of latest call */
GBitmap		*cursm;		/* source bitmap of latest call */
Point		curpt;		/* destination point of latest call */
Rectangle	curr;		/* source rectangle of latest call */
Fcode		curf;		/* function code of latest call */
void		*mem;		/* start of latest generated program */
#endif
103:
104: /*
105: * set up to compile -DT$objtype
106: */
107: #ifdef Tmips
108: #include "bbv.h"
109: #else
110: #ifdef T68020
111: #include "bb2.h"
112: #else
113: #ifdef Tsparc
114: #include "bbk.h"
115: #else
116: #ifdef T386
117: #include "bb8l.h"
118: #else
119: #ifdef Thobbit
120: #include "bbcl.h"
121: #else
122: #include "bbc.h"
123: #endif
124: #endif
125: #endif
126: #endif
127: #endif
128:
129: /*
130: * bitblt operates a 'word' at a time.
131: * WBITS is the number of bits in a word
132: * LWBITS=log2(WBITS),
133: * W2L is the number of words in a long
134: * WMASK has bits set for the low order word of a long
135: * WType is a pointer to a word
136: * if LENDIAN is true, then left-to-right in bitmap
137: * means low-order-bit to high-order-bit within a word,
138: * otherwise it is high-order-bit to low-order-bit.
139: */
140: #ifndef WBITS
141: #define WBITS 32
142: #define LWBITS 5
143: #define W2L 1
144: #define WMASK ~0UL
145: typedef ulong *WType;
146: #endif
/*
 * Helpers that hide the bit order of the screen (LENDIAN):
 *	scrshl(v,o)	shift word v by o bits screen-leftward
 *	scrshr(v,o)	shift word v by o bits screen-rightward
 *	scrpix(v,i,l)	value of pixel i within word v when ldepth is l
 *	scrmask(i,l)	ones covering pixel i when ldepth is l
 * scrpix and scrmask work on 32-bit words.
 *
 * Every macro argument is parenthesised in the expansion, and
 * scrmask builds its mask from an unsigned long so that the mask
 * for the leftmost pixel never shifts a bit into the sign of an int.
 */
#define scrshl(v,o)	(LENDIAN? ((v)>>(o)) : ((v)<<(o)))
#define scrshr(v,o)	(LENDIAN? ((v)<<(o)) : ((v)>>(o)))
#define scrpix(v,i,l)	(LENDIAN? (((v)>>((i)<<(l)))&((1<<(1<<(l)))-1)) : (((v)>>(32-(((i)+1)<<(l))))&((1<<(1<<(l)))-1)))
#define scrmask(i,l)	(LENDIAN? (((1UL<<(1<<(l)))-1)<<((i)<<(l))) : (((1UL<<(1<<(l)))-1)<<(32-(((i)+1)<<(l)))))
157:
158: void
159: gbitblt(GBitmap *dm, Point pt, GBitmap *sm, Rectangle r, Fcode fcode)
160: {
161: int type; /* category of bitblt: Tfshe or ... */
162: int width; /* width in bits of dst */
163: int height; /* height in pixels minus 1 */
164: int sh; /* left shift of src to align with dst */
165: int soff; /* bit offset of src start point */
166: int doff; /* bit offset of dst start point */
167: int sc; /* src words used so far */
168: int dc; /* dst words used so far */
169: int le; /* log expansion factor */
170: int sspan; /* words between scanlines in src */
171: int dspan; /* words between scanlines in dst */
172: int sdep; /* src ldepth */
173: int ddep; /* dst ldepth */
174: int onstack; /* compiling to stack arena */
175: int backward; /* does bitblt have to go backwards? */
176: ulong* saddr; /* addr of word in src containing start point */
177: ulong* daddr; /* addr of word in dst containing start point */
178: ulong lmask; /* affected pixels in leftmost dst word */
179: ulong rmask; /* affected pixels in rightmost dst word */
180: Type* lo; /* addr in program for beginning of outer loop */
181: Type* li; /* addr in program for beginning of inner loop */
182: uchar *tab; /* conversion table */
183: int osiz; /* size of table entries, in bytes */
184: Type* memstart; /* start of program */
185: Type* p; /* next free address in program */
186: Type* fi; /* pointer to beginning of instrs for Rs f= Rd */
187: int fin; /* number of Types to copy after fi */
188: long v; /* for use in Emitop macro */
189: int c; /* a count */
190: int fs; /* if need to fetch source */
191: int fd; /* if need to fetch dest */
192: int b; /* for expansion: take b bits at a time from src */
193: int db; /* number of bits yielded by table lookup */
194: int dl; /* log2 number of bytes yielded by table lookup */
195: int sf; /* a bit offset in src */
196: int df; /* a bit offset in dst */
197: int firstd; /* doing first part of dst ? */
198: int firsts; /* doing first part of src ? */
199: int f; /* int version of fcode */
200: long tmp; /* for use by some macros */
201: int sha; /* |sh|%32 */
202: int shb; /* WBITS-sha, if sha!=0 */
203: int sfo; /* bit offset origin (needed if WBITS==8) */
204: Fstr *pf;
205: Type arena[Progmaxnoconv]; /* for non-converting bitblts */
206:
207: onstack = bbonstack();
208: gbitbltclip(&dm);
209:
210: #ifdef TEST
211: curdm = dm;
212: cursm = sm;
213: curpt = pt;
214: curr = r;
215: curf = fcode;
216: #endif
217:
218: width = r.max.x - r.min.x;
219: if(width <= 0)
220: return;
221: height = r.max.y - r.min.y - 1;
222: if(height < 0)
223: return;
224: ddep = dm->ldepth;
225: pt.x <<= ddep;
226: width <<= ddep;
227:
228: sdep = sm->ldepth;
229: r.min.x <<= sdep;
230: r.max.x <<= sdep;
231:
232: dspan = dm->width * W2L;
233: sspan = sm->width * W2L;
234:
235: daddr = (ulong*)((WType)dm->base
236: + dm->zero*W2L + pt.y*dspan
237: + (pt.x >> LWBITS));
238: saddr = (ulong*)((WType)sm->base
239: + sm->zero*W2L + r.min.y*sspan
240: + (r.min.x >> LWBITS));
241:
242: c = doff = pt.x & (WBITS-1);
243: soff = r.min.x & (WBITS-1);
244:
245: pf = &fstr[(f = fcode&0xF)];
246: fs = pf->fetchs;
247: fd = pf->fetchd;
248: fin = pf->n;
249: fi = (Type *)pf->instr;
250:
251: if(ddep == sdep || !fs) {
252: #ifdef TEST
253: if(!FORCEBAKW &&
254: (FORCEFORW || sm != dm || !fs || saddr > daddr ||
255: (saddr == daddr && soff > doff)))
256: backward = 0;
257: else
258: backward = 1;
259: #else
260: if(sm != dm || !fs || saddr > daddr ||
261: (saddr == daddr && soff > doff))
262: backward = 0;
263: else
264: backward = 1;
265: #endif
266: #ifdef HAVEBF
267: if(width <= WBITS) {
268: sh = 0;
269: type = Tobf;
270: if(backward) {
271: daddr = (ulong *)((WType)daddr + height*dspan);
272: saddr = (ulong *)((WType)saddr + height*sspan);
273: }
274: goto init;
275: }
276: #else
277: if(doff+width <= WBITS) {
278: type = Toshe;
279: if(backward) {
280: daddr = (ulong *)((WType)daddr + height*dspan);
281: saddr = (ulong *)((WType)saddr + height*sspan);
282: }
283: }
284: #endif
285: else {
286: if(!backward)
287: type = Tfshe;
288: else {
289: type = Tbshe;
290: doff = (WBITS-(doff+width)) & (WBITS-1);
291: soff = (WBITS-(soff+width)) & (WBITS-1);
292: daddr = (ulong*)((WType)dm->base
293: + dm->zero*W2L + (pt.y+height)*dspan
294: + ((pt.x + width+(WBITS-1))>>LWBITS));
295: saddr = (ulong*)((WType)sm->base
296: + sm->zero*W2L + (r.max.y-1)*sspan
297: + ((r.max.x + (WBITS-1))>>LWBITS));
298: }
299: }
300: if(fs) {
301: if((sh = soff - doff) != 0) {
302: if(sh < 0)
303: type += Tbshl-Tbshe;
304: else
305: type += Tbshg-Tbshe;
306: }
307: } else
308: sh = 0;
309: } else {
310: if(sdep < 0 || sdep > 3 ||
311: ddep < 0 || ddep > 3 ||
312: (tab = tabs[sdep][ddep]) == 0)
313: return; /* sorry, conversion not enabled */
314:
315: osiz = tabosiz[sdep][ddep];
316: le = ddep - sdep;
317: if(le > 0) {
318: type = Texp;
319: sh = soff - (doff >> le);
320: } else {
321: type = Tcon;
322: sh = soff - (doff << -le);
323: }
324: onstack = 0;
325: backward = 0;
326: }
327:
328: /* c has original doff (relative to beginning) */
329: lmask = scrshr(WMASK,c);
330: rmask = scrshl(WMASK,(WBITS - ((c+width) & (WBITS-1))))&WMASK;
331: if(!rmask)
332: rmask = WMASK;
333: if(sh != 0) {
334: if(sh > 0)
335: sha = sh;
336: else
337: sha = (-sh)&(WBITS-1);
338: shb = WBITS - sha;
339: }
340:
341: /* init: set up constant regs and outer loop */
342: init:
343: if(onstack)
344: memstart = arena;
345: else
346: memstart = (Type*)bbmalloc(Progmax * sizeof(Type));
347: p = memstart;
348: Initsd(saddr,daddr);
349: if(sh) {
350: Initsh(sha,shb);
351: }
352: Extrainit;
353:
354: if(height > 0) {
355: Olabel(height+1);
356: lo = p;
357: }
358: sc = 0;
359: dc = 0;
360:
361: /* emit inner loop */
362: switch(type){
363: #ifdef HAVEBF
364: case Tobf:
365: if(fd) {
366: Bfextu_RdAd(doff,width);
367: }
368: if(fs) {
369: Bfextu_RsAs(soff,width);
370: }
371: Emitop;
372: Bfins_AdRs(doff,width);
373: break;
374: #else
375: case Toshe:
376: /* one word dest, src and dest offsets same (or src not involved) */
377: lmask &= rmask;
378: if(fs) {
379: Load_Rs(0);
380: }
381: Fetch_Rd(1);
382: Emitop;
383: Ofield(lmask);
384: Store_Rs;
385: break;
386:
387: case Toshl:
388: /* one word dest, src offset less than dest offset */
389: lmask &= rmask;
390: Loadzx_Rt(0);
391: Fetch_Rd(0);
392: Orsha_RsRt;
393: Emitop;
394: Ofield(lmask);
395: Store_Rs;
396: break;
397:
398: case Toshg:
399: /* one word dest, src offset greater than dest offset */
400: lmask &= rmask;
401: if(sha+doff+width > WBITS) {
402: Load_Rt_P;
403: Olsha_RsRt;
404: Loadzx_Rt(0);
405: Fetch_Rd(0);
406: Oorrshb_RsRt;
407: if(backward)
408: sc--;
409: else
410: sc++;
411: } else {
412: Load_Rt(0);
413: Fetch_Rd(0);
414: Olsha_RsRt;
415: }
416: Emitop;
417: Ofield(lmask);
418: Store_Rs;
419: break;
420: #endif /* HAVEBF */
421:
422: case Tfshe:
423: /* forward, src and dest offsets same (or src not involved) */
424: Fetch_Rd(0);
425: if(fs) {
426: Load_Rs_P;
427: sc++;
428: } else {
429: Nop;
430: }
431: Emitop;
432: Ofield(lmask);
433: Store_Rs_P;
434: dc++;
435: width -= WBITS - doff;
436:
437: c = width >> LWBITS;
438: if(c) {
439: if(f == Zero || f == F) {
440: /* set up Rs outside loop */
441: Emitop;
442: }
443: li = 0;
444: if(c > 1) {
445: Ilabel(c);
446: li = p;
447: }
448: if(fd) {
449: Fetch_Rd(!fs);
450: }
451: if(fs) {
452: Load_Rs_P;
453: sc += c;
454: }
455: if(!(f == Zero || f == F)) {
456: Emitop;
457: }
458: Store_Rs_P;
459: dc += c;
460: if(c > 1) {
461: Iloop(li);
462: }
463: }
464:
465: if(width & (WBITS-1)) {
466: if(fs) {
467: Load_Rs(0);
468: }
469: Fetch_Rd(1);
470: Emitop;
471: Ofield(rmask);
472: Store_Rs;
473: }
474: break;
475:
476: case Tfshl:
477: /* forward, src offset less than dest offset */
478: Loadzx_Rt_P;
479: Fetch_Rd(0);
480: Orsha_RsRt;
481: sc++;
482: Emitop;
483: Ofield(lmask);
484: Store_Rs_P;
485: dc++;
486: width -= WBITS - doff;
487:
488: c = width >> LWBITS;
489: if(c) {
490: li = 0;
491: if(c > 1) {
492: Ilabel(c);
493: li = p;
494: }
495: Olshb_RsRt;
496: Loadzx_Rt_P;
497: if(fd) {
498: Fetch_Rd(0);
499: }
500: Oorrsha_RsRt;
501: sc += c;
502: Emitop;
503: Store_Rs_P;
504: dc += c;
505: if(c > 1) {
506: Iloop(li);
507: }
508: }
509:
510: width &= (WBITS-1);
511: if(width) {
512: Olshb_RsRt;
513: if(width > sha) {
514: Loadzx_Rt(0);
515: Fetch_Rd(0);
516: Oorrsha_RsRt;
517: } else {
518: Fetch_Rd(1);
519: }
520: Emitop;
521: Ofield(rmask);
522: Store_Rs;
523: }
524: break;
525:
526: case Tfshg:
527: /* forward, src offset greater than dest offset */
528: Load_Rt_P;
529: Olsha_RsRt;
530: Loadzx_Rt_P;
531: Fetch_Rd(0);
532: Oorrshb_RsRt;
533: sc += 2;
534: Emitop;
535: Ofield(lmask);
536: Store_Rs_P;
537: dc++;
538: width -= WBITS - doff;
539:
540: c = width >> LWBITS;
541: if(c) {
542: li = 0;
543: if(c > 1) {
544: Ilabel(c);
545: li = p;
546: }
547: Olsha_RsRt;
548: Loadzx_Rt_P;
549: if(fd) {
550: Fetch_Rd(0);
551: }
552: Oorrshb_RsRt;
553: sc += c;
554: Emitop;
555: Store_Rs_P;
556: dc += c;
557: if(c > 1) {
558: Iloop(li);
559: }
560: }
561:
562: width &= WBITS-1;
563: if(width) {
564: Olsha_RsRt;
565: if(width > shb) {
566: Loadzx_Rt(0);
567: Fetch_Rd(0);
568: Oorrshb_RsRt;
569: } else {
570: Fetch_Rd(1);
571: }
572: Emitop;
573: Ofield(rmask);
574: Store_Rs;
575: }
576: break;
577:
578: case Tbshe:
579: /* backward, src and dest offsets same (or src not involved) */
580: Load_Rs_D(0);
581: sc++;
582: Fetch_Rd_D(1);
583: Emitop;
584: Ofield(rmask);
585: Store_Rs;
586: dc++;
587: width -= WBITS - doff;
588:
589: c = width >> LWBITS;
590: if(c) {
591: li = 0;
592: if(c > 1) {
593: Ilabel(c);
594: li = p;
595: }
596: Load_Rs_D(0);
597: sc += c;
598: if(fd) {
599: Fetch_Rd_D(1);
600: Emitop;
601: Store_Rs;
602: } else {
603: Nop;
604: Emitop;
605: Store_Rs_D;
606: }
607: dc += c;
608: if(c > 1) {
609: Iloop(li);
610: }
611: }
612:
613: if(width & (WBITS-1)) {
614: Load_Rs_D(0);
615: sc++;
616: Fetch_Rd_D(1);
617: dc++;
618: Emitop;
619: Ofield(lmask);
620: Store_Rs;
621: }
622: break;
623:
624: case Tbshl:
625: /* backward, src offset less than dest offset */
626: Loadzx_Rt_D(0);
627: Fetch_Rd_D(0);
628: Olsha_RsRt;
629: sc++;
630: Emitop;
631: Ofield(rmask);
632: Store_Rs;
633: dc++;
634: width -= WBITS - doff;
635:
636: c = width >> LWBITS;
637: if(c) {
638: li = 0;
639: if(c > 1) {
640: Ilabel(c);
641: li = p;
642: }
643: Orshb_RsRt;
644: Loadzx_Rt_D(0);
645: if(fd) {
646: Fetch_Rd_D(0);
647: } else {
648: Nop;
649: }
650: Oorlsha_RsRt;
651: sc += c;
652: Emitop;
653: if(fd) {
654: Store_Rs;
655: } else {
656: Store_Rs_D;
657: }
658: dc += c;
659: if(c > 1) {
660: Iloop(li);
661: }
662: }
663:
664: width &= (WBITS-1);
665: if(width) {
666: Orshb_RsRt;
667: if(width > sha) {
668: Load_Rt_D(0);
669: Fetch_Rd_D(0);
670: Oorlsha_RsRt;
671: sc++;
672: } else {
673: Fetch_Rd_D(1);
674: }
675: dc++;
676: Emitop;
677: Ofield(lmask);
678: Store_Rs;
679: }
680: break;
681:
682: case Tbshg:
683: /* backward, src offset greater than dest offset */
684: Loadzx_Rt_D(0);
685: Fetch_Rd_D(0);
686: Orsha_RsRt;
687: Loadzx_Rt_D(1);
688: Oorlshb_RsRt;
689: sc += 2;
690: Emitop;
691: Ofield(rmask);
692: Store_Rs;
693: dc++;
694: width -= WBITS - doff;
695:
696: c = width >> LWBITS;
697: if(c) {
698: li = 0;
699: if(c > 1) {
700: Ilabel(c);
701: li = p;
702: }
703: Orsha_RsRt;
704: Loadzx_Rt_D(0);
705: if(fd) {
706: Fetch_Rd_D(0);
707: } else {
708: Nop;
709: }
710: Oorlshb_RsRt;
711: sc += c;
712: Emitop;
713: if(fd) {
714: Store_Rs;
715: } else {
716: Store_Rs_D;
717: }
718: dc += c;
719: if(c > 1) {
720: Iloop(li);
721: }
722: }
723:
724: width &= WBITS-1;
725: if(width) {
726: Orsha_RsRt;
727: if(width > shb) {
728: Loadzx_Rt_D(0);
729: Fetch_Rd_D(0);
730: sc++;
731: } else {
732: Fetch_Rd_D(1);
733: }
734: dc++;
735: Oorlshb_RsRt;
736: Emitop;
737: Ofield(lmask);
738: Store_Rs;
739: }
740: break;
741:
742: case Texp:
743: /* expansion: dest ldepth > src ldepth */
744: if(WBITS == 8) {
745: b = 8 >> le;
746: /* db == 8, dl == 0 */
747: } else {
748: b = (le <= 2) ? 8 : 32 / (1 << le);
749: db = b << le;
750: dl = (le <= 2) ? le : 2;
751: }
752: Inittab(tab,osiz);
753:
754: /*
755: * method:
756: * load the source a word at a time, into Rt;
757: * (if there is a shift, use Ru to hold next or last partial word,
758: * or, if WBITS == 8, it is <<8 in Rt)
759: * take b bits at a time from source and convert via table into Rd
760: * (each table lookup yields db bits, in 1<<dl bytes);
761: * assemble into Rs until have WBITS bits;
762: * fetch dest word into Rd, operate into Rs, store in dest
763: *
764: * this code needs reworking for expansion factor > 8
765: */
766:
767: if(WBITS == 8) {
768: if(sh == 0) {
769: Load_Rt_P;
770: sfo = 24;
771: } else if(sh > 0) {
772: Load_Rt_P;
773: Olsh_RtRt(8);
774: if((doff+width)>>le > shb) {
775: Loador_Rt_P;
776: sc++;
777: }
778: /* relevant source bits: Rt[16+sh..23+sh] */
779: sfo = 16 + sh;
780: } else {
781: Load_Rt_P;
782: /* relevant source bits: Rt[16+(8+sh)..23+(8+sh) */
783: sfo = 24 + sh;
784: }
785: sf = 0;
786: firstd = 1;
787: } else {
788: if(sh == 0) {
789: Load_Rt_P;
790: } else if(sh > 0) {
791: Load_Rt_P;
792:
793: Olsha_RtRt;
794: if((doff+width)>>le > shb) {
795: Load_Ru_P;
796: Oorrshb_RtRu;
797: sc++;
798: }
799: } else {
800: Load_Ru_P;
801: Orsha_RtRu;
802: }
803: sf = (soff - sh) & ~(b-1);
804: firstd = 1;
805: firsts = 1;
806: }
807: sc++;
808: while(sf < WBITS && width > 0) {
809: if(WBITS == 8) {
810: Table_RsRt(sf+sfo,b,0);
811: sf += b;
812: } else {
813: if(firstd)
814: df = (sf << le) & (WBITS-1);
815: else
816: df = 0;
817: while(df < WBITS) {
818: Table_RdRt(sf,b,dl);
819: if(df==0 || firsts) {
820: c = WBITS - (df + db);
821: Olsh_RsRd(c);
822: } else if(df == WBITS - db) {
823: Oor_RsRd;
824: } else {
825: c = WBITS - (df + db);
826: Oorlsh_RsRd(c);
827: }
828: sf += b;
829: df += db;
830: firsts = 0;
831: }
832: }
833: Fetch_Rd(1);
834: Emitop;
835: if(firstd) {
836: width -= WBITS - doff;
837: if(width > 0 && lmask != WMASK) {
838: Ofield(lmask);
839: } else if(width <= 0) {
840: lmask &= rmask;
841: Ofield(lmask);
842: }
843: } else {
844: width -= WBITS;
845: if(width < 0) {
846: Ofield(rmask);
847: }
848: }
849: Store_Rs_P;
850: dc++;
851: firstd = 0;
852: }
853: if(width <= 0)
854: break;
855:
856: c = width >> (LWBITS+le);
857: if(c) {
858: li = 0;
859: if(c > 1) {
860: Ilabel(c);
861: li = p;
862: }
863: if(WBITS == 8) {
864: Olsh_RtRt(8);
865: Loador_Rt_P;
866: } else {
867: if(sh == 0) {
868: Load_Rt_P;
869: } else if(sh > 0) {
870: Olsha_RtRu;
871: Load_Ru_P;
872: Oorrshb_RtRu;
873: } else {
874: Olshb_RtRu;
875: Load_Ru_P;
876: Oorrsha_RtRu;
877: }
878: }
879: sc += c;
880: for(sf = 0; sf < WBITS;) {
881: if(WBITS == 8) {
882: Table_RsRt(sf+sfo,b,0);
883: sf += b;
884: } else {
885: for(df = 0; df < WBITS;) {
886: Table_RdRt(sf,b,dl);
887: Assemblex(df,db);
888: sf += b;
889: df += db;
890: }
891: }
892: if(fd) {
893: Fetch_Rd(1);
894: }
895: Emitop;
896: Store_Rs_P;
897: dc += c;
898: }
899: if(c > 1) {
900: Iloop(li);
901: }
902: }
903: width -= c << (LWBITS+le);
904: if(width <= 0)
905: break;
906:
907: if(WBITS == 8) {
908: if(sh == 0) {
909: Load_Rt_P;
910: sc++;
911: } else if(sh > 0) {
912: Olsh_RtRt(8);
913: if(width>>le > shb) {
914: Loador_Rt_P;
915: sc++;
916: }
917: } else {
918: Olsh_RtRt(8);
919: if(width>>le > sha) {
920: Loador_Rt_P;
921: sc++;
922: }
923: }
924: } else {
925: if(sh == 0) {
926: Load_Rt_P;
927: sc++;
928: } else if(sh > 0) {
929: Olsha_RtRu;
930: if(width>>le > shb) {
931: Load_Ru_P;
932: Oorrshb_RtRu;
933: sc++;
934: }
935: } else {
936: Olshb_RtRu;
937: if(width>>le > sha) {
938: Load_Ru_P;
939: Oorrsha_RtRu;
940: sc++;
941: }
942: }
943: }
944: for(sf = 0; sf < WBITS && width > 0; ) {
945: if(WBITS == 8) {
946: Table_RsRt(sf+sfo,b,0);
947: sf += b;
948: } else {
949: for(df = 0; df < WBITS;) {
950: Table_RdRt(sf,b,dl);
951: Assemblex(df,db);
952: sf += b;
953: df += db;
954: }
955: }
956: Fetch_Rd(1);
957: Emitop;
958: width -= WBITS;
959: if(width < 0) {
960: Ofield(rmask);
961: }
962: Store_Rs_P;
963: dc++;
964: }
965: break;
966:
967: case Tcon:
968: /* contraction: dest ldepth < src ldepth */
969: db = 8 >> -le;
970: Inittab(tab,osiz);
971:
972: /*
973: * method:
974: * load the source a word at a time, into Rt;
975: * (if there is a shift, use Ru to hold next or last partial word,
976: * or, if WBITS==8, it is <<8 in Rt)
977: * take 8 bits at a time from source and convert via table into Rd
978: * (each table lookup yields db bits, in 1 byte);
979: * assemble into Rs until have WBITS bits (takes several src words);
980: * fetch dest word into Rd, operate into Rs, store in dest
981: *
982: * Something should be done to improve this code, but
983: * it isn't used much.
984: */
985:
986: if(sh < 0) {
987: c = (-sh)/WBITS;
988: sh += c*WBITS;
989: if(WBITS == 8) sfo = 24 + sh;
990: } else {
991: c = 0;
992: if(WBITS == 8) sfo = sh ? 16 + sh : 24;
993: }
994: firstd = 1;
995: firsts = 1;
996: for(df = c*db*(4/W2L); df < WBITS && df < doff + width; ) {
997: c = (doff + width - df) << -le;
998: /* c = number of source bits needed to fill rest */
999: if(WBITS == 8) {
1000: if(sh == 0) {
1001: Load_Rt_P;
1002: sc++;
1003: } else if(sh > 0) {
1004: if(firsts) {
1005: Load_Rt_P;
1006: Olsh_RtRt(8);
1007: sc++;
1008: } else {
1009: Olsh_RtRt(8);
1010: }
1011: if(shb < c) {
1012: Loador_Rt_P;
1013: sc++;
1014: }
1015: } else {
1016: if(firsts) {
1017: Load_Rt_P;
1018: sc++;
1019: } else {
1020: Olsh_RtRt(8);
1021: if(sha < c) {
1022: Loador_Rt_P;
1023: sc++;
1024: }
1025: }
1026: }
1027: } else {
1028: if(sh == 0) {
1029: Load_Rt_P;
1030: sc++;
1031: } else if(sh > 0) {
1032: if(firsts) {
1033: Load_Rt_P;
1034: Olsha_RtRt;
1035: sc++;
1036: } else {
1037: Olsha_RtRu;
1038: }
1039: if(shb < c) {
1040: Load_Ru_P;
1041: sc++;
1042: Oorrshb_RtRu;
1043: }
1044: } else {
1045: if(!firsts) {
1046: Olshb_RtRu;
1047: }
1048: if(sha < c) {
1049: Load_Ru_P;
1050: sc++;
1051: if(firsts) {
1052: Orsha_RtRu;
1053: } else {
1054: Oorrsha_RtRu;
1055: }
1056: }
1057: }
1058: }
1059: firsts = 0;
1060: if(WBITS == 8) {
1061: Table_RdRt(sfo,8,0);
1062: if(firstd) {
1063: Olsh_RsRd(8-(df+db));
1064: } else {
1065: Assemble(df,db);
1066: }
1067: df += db;
1068: firstd = 0;
1069: } else {
1070: for(sf = 0; sf < WBITS; ) {
1071: Table_RdRt(sf,8,0);
1072: if(firstd) {
1073: c = WBITS-(df+db);
1074: Olsh_RsRd(c);
1075: } else {
1076: Assemble(df,db);
1077: }
1078: df += db;
1079: sf += 8;
1080: firstd = 0;
1081: }
1082: }
1083: }
1084: Fetch_Rd(1);
1085: Emitop;
1086: width -= WBITS - doff;
1087: if(width > 0 && lmask != WMASK) {
1088: Ofield(lmask);
1089: } else if(width <= 0) {
1090: lmask &= rmask;
1091: Ofield(lmask);
1092: }
1093: Store_Rs_P;
1094: dc++;
1095: if(width <= 0)
1096: break;
1097:
1098: c = width >> LWBITS;
1099: if(c) {
1100: li = 0;
1101: if(c > 1) {
1102: Ilabel(c);
1103: li = p;
1104: }
1105: for(df = 0; df < WBITS; ) {
1106: if(WBITS == 8) {
1107: if(sh == 0) {
1108: Load_Rt_P;
1109: } else {
1110: Olsh_RtRt(8);
1111: Loador_Rt_P;
1112: }
1113: sc += c;
1114: Table_RdRt(sfo,8,0);
1115: Assemble(df,db);
1116: df += db;
1117: } else {
1118: if(sh == 0) {
1119: Load_Rt_P;
1120: } else if(sh > 0) {
1121: Olsha_RtRu;
1122: Load_Ru_P;
1123: Oorrshb_RtRu;
1124: } else {
1125: Olshb_RtRu;
1126: Load_Ru_P;
1127: Oorrsha_RtRu;
1128: }
1129: sc += c;
1130: for(sf = 0; sf < WBITS; ) {
1131: Table_RdRt(sf,8,0);
1132: Assemblex(df,db);
1133: df += db;
1134: sf += 8;
1135: }
1136: }
1137: }
1138: if(fd) {
1139: Fetch_Rd(1);
1140: }
1141: Emitop;
1142: Store_Rs_P;
1143: dc += c;
1144: if(c > 1) {
1145: Iloop(li);
1146: }
1147: }
1148:
1149: width -= c << LWBITS;
1150: if(width <= 0)
1151: break;
1152:
1153: for(df = 0; df < width; ) {
1154: c = (width - df) << -le;
1155: if(WBITS == 8) {
1156: if(sh == 0) {
1157: Load_Rt_P;
1158: sc++;
1159: } else if(sh > 0) {
1160: Olsh_RtRt(8);
1161: if(shb < c) {
1162: sc++;
1163: Loador_Rt_P;
1164: }
1165: } else {
1166: Olsh_RtRt(8);
1167: if(sha < c) {
1168: Loador_Rt_P;
1169: sc++;
1170: }
1171: }
1172: Table_RdRt(sfo,8,0);
1173: Assemble(df,db);
1174: df += db;
1175: } else {
1176: if(sh == 0) {
1177: Load_Rt_P;
1178: sc++;
1179: } else if(sh > 0) {
1180: Olsha_RtRu;
1181: if(shb < c) {
1182: Load_Ru_P;
1183: Oorrshb_RtRu;
1184: sc++;
1185: }
1186: } else {
1187: Olshb_RtRu;
1188: if(sha < c) {
1189: Load_Ru_P;
1190: Oorrsha_RtRu;
1191: sc++;
1192: }
1193: }
1194: for(sf = 0; sf < 32; ) {
1195: Table_RdRt(sf,8,0);
1196: Assemble(df,db);
1197: df += db;
1198: sf += 8;
1199: }
1200: }
1201: }
1202: Fetch_Rd(1);
1203: Emitop;
1204: Ofield(rmask);
1205: Store_Rs_P;
1206: dc++;
1207: break;
1208:
1209: }
1210:
1211: /* finish outer loop, put in rts, and execute */
1212:
1213: if(height > 0) {
1214: if(backward)
1215: c = (dc - dspan) * (WBITS/8);
1216: else
1217: c = (dspan - dc) * (WBITS/8);
1218: if(c) {
1219: Add_Ad(c);
1220: }
1221: if(fs) {
1222: if(backward)
1223: c = (sc - sspan) * (WBITS/8);
1224: else
1225: c = (sspan - sc) * (WBITS/8);
1226: if(c) {
1227: Add_As(c);
1228: }
1229: }
1230: Oloop(lo);
1231: }
1232: Orts;
1233: #ifdef TEST
1234: if(onstack && p - memstart > Progmaxnoconv)
1235: print("Increase Progmaxnoconv to at least %d!\n", p - memstart);
1236: else if(p - memstart > Progmax)
1237: print("Increase Progmax to at least %d!\n", p - memstart);
1238: mem = memstart;
1239: #endif
1240: bbexec((void*)memstart, (p-memstart)*sizeof(Type), onstack);
1241: }
1242:
1243: #ifdef TEST
1244: void prprog(void);
1245: GBitmap *bb1, *bb2;
1246: ulong *src, *dst, *xdst, *xans;
1247: int swds, dwds;
1248: long ticks;
1249: int timeit;
1250:
#ifdef BYTEREV
/*
 * Return v with its four bytes in the opposite order.
 */
ulong
byterev(ulong v)
{
	ulong top, upper, lower, bottom;

	top = v>>24;
	upper = (v>>8)&0x0000FF00;
	lower = (v<<8)&0x00FF0000;
	bottom = v<<24;
	return top|upper|lower|bottom;
}
#endif
#ifdef T386
/*
 * Definition of _clock for the 386 build of the tester.
 * NOTE(review): run() declares `extern long *_clock' and
 * dereferences it, which does not agree with this
 * definition - confirm which type is intended.
 */
long _clock;
#endif
1261:
1262: long
1263: func(int f, long s, int sld, long d, int dld)
1264: {
1265: long a;
1266: int sh, i, db, sb;
1267:
1268: db = 1 << dld;
1269: sb = 1 << sld;
1270: sh = db - sb;
1271: if(sh > 0) {
1272: a = s;
1273: for(i = sb; i<db; i += sb){
1274: a <<= sb;
1275: s |= a;
1276: }
1277: } else if(sh < 0)
1278: s >>= -sh;
1279:
1280: switch(f){
1281: case Zero: d = 0; break;
1282: case DnorS: d = ~(d|s); break;
1283: case DandnotS: d = d & ~s; break;
1284: case notS: d = ~s; break;
1285: case notDandS: d = ~d & s; break;
1286: case notD: d = ~d; break;
1287: case DxorS: d = d ^ s; break;
1288: case DnandS: d = ~(d&s); break;
1289: case DandS: d = d & s; break;
1290: case DxnorS: d = ~(d^s); break;
1291: case S: d = s; break;
1292: case DornotS: d = d | ~s; break;
1293: case D: d = d; break;
1294: case notDorS: d = ~d | s; break;
1295: case DorS: d = d | s; break;
1296: case F: d = ~0; break;
1297: }
1298:
1299: d &= ((1<<db)-1);
1300: return d;
1301: }
1302:
1303: void
1304: run(int fr, int to, int w, int op)
1305: {
1306: int i, j, f, t, fy, ty;
1307: extern long *_clock;
1308:
1309: fr += bb2->r.min.x;
1310: to += bb1->r.min.x;
1311: fy = bb2->r.min.y + 1;
1312: ty = bb1->r.min.y + 1;
1313: if(timeit) {
1314: memcpy(dst, xdst, dwds * sizeof(long));
1315: ticks -= *_clock;
1316: gbitblt(bb1, Pt(to,ty), bb2, Rect(fr,fy,fr+w,fy+2), op);
1317: ticks += *_clock;
1318: return;
1319: }
1320: f = fr;
1321: t = to;
1322: memcpy(dst, xdst, dwds * sizeof(long));
1323: for(i=0; i<w; i++) {
1324: gbitblt(bb1, Pt(t,ty), bb2, Rect(f,fy,f+1,fy+1), op);
1325: gbitblt(bb1, Pt(t,ty+1), bb2, Rect(f,fy+1,f+1,fy+2), op);
1326: f++;
1327: t++;
1328: }
1329: memcpy(xans, dst, dwds * sizeof(long));
1330:
1331: memcpy(dst, xdst, dwds * sizeof(long));
1332: gbitblt(bb1, Pt(to,ty), bb2, Rect(fr,fy,fr+w,fy+2), op);
1333:
1334: if(memcmp(xans, dst, dwds * sizeof(long))) {
1335: /*
1336: * print src and dst row offset, width in bits, and forw/back
1337: * then print for each of the four rows: the source (s),
1338: * the dest (d), the good value of the answer (g),
1339: * and the actual bad value of the answer (b)
1340: */
1341: print("fr=%d to=%d w=%d fb=%d%d\n",
1342: fr, to, w, FORCEFORW, FORCEBAKW);
1343: print("dst bitmap b %#lux, z %d, w %d, ld %d, r [%d,%d][%d,%d]\n",
1344: bb1->base, bb1->zero, bb1->width, bb1->ldepth,
1345: bb1->r.min.x, bb1->r.min.y, bb1->r.max.x, bb1->r.max.y);
1346: print("src bitmap b %#lux, z %d, w %d, ld %d, r [%d,%d][%d,%d]\n",
1347: bb2->base, bb2->zero, bb2->width, bb2->ldepth,
1348: bb2->r.min.x, bb2->r.min.y, bb2->r.max.x, bb2->r.max.y);
1349: for(j=0; 7*j < dwds; j++) {
1350: print("\ns");
1351: for(i=0; i<7 && 7*j+i < dwds; i++)
1352: print(" %.8lux", src[7*j + i]);
1353: print("\nd");
1354: for(i=0; i<7 && 7*j+i < dwds; i++)
1355: print(" %.8lux", xdst[7*j + i]);
1356: print("\ng");
1357: for(i=0; i<7 && 7*j+i < dwds; i++)
1358: print(" %.8lux", xans[7*j + i]);
1359: print("\nb");
1360: for(i=0; i<7 && 7*j+i < dwds; i++)
1361: print(" %.8lux", dst[7*j + i]);
1362: print("\n");
1363: }
1364: prprog();
1365: }
1366: }
1367:
1368: void
1369: main(int argc, char *argv[])
1370: {
1371: int f, t, w, i, sld, dld, op, iters, simple;
1372: ulong s, d, spix, dpix, apix, fpix, m, *ps, *pd;
1373: Point sorg, dorg;
1374: GBitmap *bs, *bd;
1375: long seed;
1376: char *ct;
1377:
1378: sld = 0;
1379: dld = 0;
1380: timeit = 0;
1381: iters = 200;
1382: simple = 0;
1383: ARGBEGIN {
1384: case 'i':
1385: iters = atoi(ARGF());
1386: break;
1387: case 's':
1388: simple = 1;
1389: break;
1390: case 't':
1391: timeit = 1;
1392: ct = ARGF();
1393: if(ct)
1394: iters = atoi(ct);
1395: break;
1396: } ARGEND
1397: if(argc > 0)
1398: sld = atoi(argv[0]);
1399: if(argc > 1)
1400: dld = atoi(argv[1]);
1401: if(sld < 0 || sld > 3 || dld < 0 || dld > 3 ||
1402: (sld != dld && !tabs[sld][dld])){
1403: print("conversion from ldepth %d to %d not enabled\n",
1404: sld, dld);
1405: exits(0);
1406: }
1407: if(!timeit && !simple) {
1408: seed = time(0);
1409: print("seed %lux\n", seed); srand(seed); /**/
1410: }
1411:
1412: print("sld %d dld %d\n", sld, dld);
1413: op = 1/*Zero*/;
1414:
1415: /* bitmaps for 1-bit tests */
1416: bd = gballoc(Rect(0,0,32,1), dld);
1417: bs = gballoc(Rect(0,0,32,1), sld);
1418: for(i=0; i<bs->width; i++)
1419: bs->base[i] = lrand();
1420:
1421: /* bitmaps for rect tests */
1422: if(simple) {
1423: dorg = Pt(0,0);
1424: sorg = Pt(0,0);
1425: } else {
1426: dorg = Pt(nrand(63)-31,nrand(63)-31);
1427: sorg = Pt(nrand(63)-31,nrand(63)-31);
1428: }
1429: bb1 = gballoc(Rpt(dorg,add(dorg,Pt(200,4))), dld);
1430: bb2 = gballoc(Rpt(sorg,add(sorg,Pt(200,4))), sld);
1431: dwds = bb1->width * Dy(bb1->r);
1432: swds = bb2->width * Dy(bb2->r);
1433: dst = bb1->base;
1434: src = bb2->base;
1435: xdst = malloc(dwds * sizeof(long));
1436: xans = malloc(dwds * sizeof(long));
1437: for(i=0; i<swds; i++)
1438: src[i] = lrand();
1439: for(i=0; i<dwds; i++)
1440: xdst[i] = lrand();
1441: loop:
1442: print("Op %d\n", op);
1443: if(!timeit) {
1444: print("one pixel\n");
1445: ps = bs->base;
1446: pd = bd->base;
1447: FORCEFORW = 1;
1448: FORCEBAKW = 0;
1449: for(i=0; i<1000; i++, FORCEFORW = !FORCEFORW, FORCEBAKW = !FORCEBAKW) {
1450: f = nrand(32 >> sld);
1451: t = nrand(32 >> dld);
1452: s = lrand();
1453: d = lrand();
1454: ps[0] = s;
1455: pd[0] = d;
1456: #ifdef BYTEREV
1457: spix = scrpix(byterev(s),f,sld);
1458: dpix = scrpix(byterev(d),t,dld);
1459: #else
1460: spix = scrpix(s,f,sld);
1461: dpix = scrpix(d,t,dld);
1462: #endif
1463: apix = func(op, spix, sld, dpix, dld);
1464: gbitblt(bd, Pt(t,0), bs, Rect(f,0,f+1,1), op);
1465: if(ps[0] != s) {
1466: print("bb src %.8lux %.8lux %d %d\n", ps[0], s, f, t);
1467: exits("error");
1468: }
1469: m = scrmask(t,dld);
1470: #ifdef BYTEREV
1471: m = byterev(m);
1472: #endif
1473: if((pd[0] & ~m) != (d & ~m)) {
1474: print("bb dst1 %.8lux %.8lux\n",
1475: s, d);
1476: print("bb %.8lux %.8lux %d %d\n",
1477: ps[0], pd[0], f, t);
1478: prprog();
1479: exits("error");
1480: }
1481: #ifdef BYTEREV
1482: fpix = scrpix(byterev(pd[0]),t,dld);
1483: #else
1484: fpix = scrpix(pd[0],t,dld);
1485: #endif
1486: if(apix != fpix) {
1487: print("bb dst2 %.8lux %.8lux\n",
1488: s, d);
1489: print("bb %.8lux %.8lux %d %d\n",
1490: ps[0], pd[0], f, t);
1491: print("bb %.8lux %.8lux %.8lux %.8lux\n",
1492: spix, dpix, apix, fpix);
1493: prprog();
1494: exits("error");
1495: }
1496: }
1497: }
1498:
1499: print("for\n");
1500: FORCEFORW = 1;
1501: FORCEBAKW = 0;
1502:
1503: for(i=0; i<iters; i++) {
1504: f = nrand(64);
1505: t = nrand(64);
1506: w = nrand(130);
1507: run(f, t, w, op);
1508: }
1509:
1510: if(sld == dld) {
1511: print("bak\n");
1512: FORCEFORW = 0;
1513: FORCEBAKW = 1;
1514:
1515: for(i=0; i<iters; i++) {
1516: f = nrand(64);
1517: t = nrand(64);
1518: w = nrand(130);
1519: run(f, t, w, op);
1520: }
1521: }
1522:
1523: if(op < F) {
1524: op++;
1525: goto loop;
1526: }
1527: if(timeit)
1528: print("time: %d ticks\n", ticks);
1529: exits(0);
1530: }
1531:
1532:
1533: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.