|
|
1.1 root 1: //
2: // d_draw.s
3: // x86 assembly-language horizontal 8-bpp span-drawing code.
4: //
5:
6: #include "asm_i386.h"
7: #include "quakeasm.h"
8: #include "asm_draw.h"
9: #include "d_ifacea.h"
10:
1.1.1.2 ! root 11: #if id386
1.1 root 12:
13: //----------------------------------------------------------------------
14: // 8-bpp horizontal span drawing code for polygons, with no transparency.
15: //
16: // Assumes there is at least one span in pspans, and that every span
17: // contains at least one pixel
18: //----------------------------------------------------------------------
19:
1.1.1.2 ! root 20: .data
! 21:
! 22: //-------------------------------------------------------
! 23: // global refresh variables
! 24: //-------------------------------------------------------
//
// Every symbol in this group is exported under its C() decorated name and
// starts out as zero.  The code visible in this file only ever reads these
// locations; whatever fills them in lives elsewhere.
//
! 25:
! 26: // FIXME: put all refresh variables into one contiguous block. Make into one
! 27: // big structure, like cl or sv?
! 28:
// Texture gradients as single-precision floats: the per-pixel steps of s/z,
// t/z and 1/z in u and in v, followed by their origins.  D_DrawSpans8 reads
// all nine; D_DrawZSpans reads only the three zi values.
! 29: .align 4
! 30: .globl C(d_sdivzstepu)
! 31: .globl C(d_tdivzstepu)
! 32: .globl C(d_zistepu)
! 33: .globl C(d_sdivzstepv)
! 34: .globl C(d_tdivzstepv)
! 35: .globl C(d_zistepv)
! 36: .globl C(d_sdivzorigin)
! 37: .globl C(d_tdivzorigin)
! 38: .globl C(d_ziorigin)
! 39: C(d_sdivzstepu): .single 0
! 40: C(d_tdivzstepu): .single 0
! 41: C(d_zistepu): .single 0
! 42: C(d_sdivzstepv): .single 0
! 43: C(d_tdivzstepv): .single 0
! 44: C(d_zistepv): .single 0
! 45: C(d_sdivzorigin): .single 0
! 46: C(d_tdivzorigin): .single 0
! 47: C(d_ziorigin): .single 0
! 48:
// 32-bit integers used by D_DrawSpans8: sadjust/tadjust are added to the
// freshly computed s and t, and bbextents/bbextentt are the upper limits the
// results are clamped against.
! 49: .globl C(sadjust)
! 50: .globl C(tadjust)
! 51: .globl C(bbextents)
! 52: .globl C(bbextentt)
! 53: C(sadjust): .long 0
! 54: C(tadjust): .long 0
! 55: C(bbextents): .long 0
! 56: C(bbextentt): .long 0
! 57:
// Buffer descriptors.  D_DrawSpans8 reads cacheblock (texel source base),
// cachewidth (texel row pitch) and d_viewbuffer (destination base);
// D_DrawZSpans reads d_pzbuffer and d_zrowbytes.  d_zwidth is not referenced
// by the code in this file.  Note that the definition order below differs
// from the .globl order; the definitions are what fix the memory layout.
! 58: .globl C(cacheblock)
! 59: .globl C(d_viewbuffer)
! 60: .globl C(cachewidth)
! 61: .globl C(d_pzbuffer)
! 62: .globl C(d_zrowbytes)
! 63: .globl C(d_zwidth)
! 64: C(cacheblock): .long 0
! 65: C(cachewidth): .long 0
! 66: C(d_viewbuffer): .long 0
! 67: C(d_pzbuffer): .long 0
! 68: C(d_zrowbytes): .long 0
! 69: C(d_zwidth): .long 0
! 70:
! 71:
! 72: //-------------------------------------------------------
! 73: // ASM-only variables
! 74: //-------------------------------------------------------
//
// Scratch cells and floating-point constants shared by the assembly
// routines.  These names are not wrapped in C(), so they are exported
// undecorated.
//
// NOTE(review): cw and full_cw are declared .globl below but no definition
// for either appears in the visible source -- confirm they are defined in
// another object (or drop them from the list).
! 75: .globl cw, full_cw, izi, float_1, float_particle_z_clip, float_point5
! 76: .globl float_minus_1, float_0
// izi: integer 1/z of the span being drawn (written by D_DrawZSpans)
! 77: izi: .long 0
! 78: float_0: .single 0
! 79: float_1: .single 1
! 80: float_minus_1: .single -1
! 81: float_particle_z_clip: .single PARTICLE_Z_CLIP
// float_point5: the 1/z threshold D_DrawZSpans compares against
! 82: float_point5: .single 0.5
! 83:
! 84: .globl fp_16, fp_64k, pbase, s, t, sfracf, tfracf, snext, tnext, fp_1m
! 85: .globl spancountminus1, zi16stepu, sdivz16stepu, tdivz16stepu, fp_1m_minus_1
! 86: .globl FloatZero
// s/t and snext/tnext hold the texture coordinates at the start and end of
// the segment D_DrawSpans8 is working on; sfracf/tfracf hold their
// fractional parts shifted up into the top 16 bits.
! 87: s: .long 0
! 88: t: .long 0
! 89: snext: .long 0
! 90: tnext: .long 0
! 91: sfracf: .long 0
! 92: tfracf: .long 0
// pbase: copy of cacheblock taken on entry to D_DrawSpans8
! 93: pbase: .long 0
// pdestspan and the *8stepu cells are not in the .globl lists above, so they
// stay local to this object.  The *8stepu values are the u steps multiplied
// by fp_8 on entry to D_DrawSpans8; the *16stepu cells are exported but not
// touched by the code in this file.
! 94: pdestspan: .long 0
! 95: zi8stepu: .long 0
! 96: sdivz8stepu: .long 0
! 97: tdivz8stepu: .long 0
! 98: zi16stepu: .long 0
! 99: sdivz16stepu: .long 0
! 100: tdivz16stepu: .long 0
! 101: fp_1m: .single 1048576.0
! 102: fp_1m_minus_1: .single 1048575.0
! 103: fp_64k: .single 65536.0
! 104: fp_8: .single 8.0
! 105: fp_16: .single 16.0
// spancountminus1: integer staging cell so the count can be loaded with fildl
! 106: spancountminus1: .long 0
! 107:
// The next two constants are IEEE single-precision bit patterns:
// 0x4f000000 is +2**31 and 0xcf000000 is -2**31.  D_DrawZSpans multiplies by
// them to turn 1/z values into 32-bit fixed point.
! 108: FloatZero: .long 0
! 109: Float2ToThe31nd: .long 0x4f000000
! 110: FloatMinus2ToThe31nd: .long 0xcf000000
// izistep: integer 1/z step per pixel, computed once per D_DrawZSpans call
! 111: izistep: .long 0
! 112:
! 113: //-------------------------------------------------------
! 114: // local variables for d_draw16.s
! 115: //-------------------------------------------------------
//
// Neither table is used by the code in this file.  The Entry*_16 labels that
// entryvec_table_16 points at are not defined here either; they have to be
// supplied by the object that consumes these tables.
//
! 116:
! 117: .globl reciprocal_table_16, entryvec_table_16
! 118: // 1/2, 1/3, 1/4, 1/5, 1/6, 1/7, 1/8, 1/9, 1/10, 1/11, 1/12, 1/13,
! 119: // 1/14, and 1/15 in 1.31 fixed point (0x40000000 == 0.5)
! 120: reciprocal_table_16: .long 0x40000000, 0x2aaaaaaa, 0x20000000
! 121: .long 0x19999999, 0x15555555, 0x12492492
! 122: .long 0x10000000, 0xe38e38e, 0xccccccc, 0xba2e8ba
! 123: .long 0xaaaaaaa, 0x9d89d89, 0x9249249, 0x8888888
! 124:
// sixteen code addresses; slot 0 is a null placeholder
! 125: entryvec_table_16: .long 0, Entry2_16, Entry3_16, Entry4_16
! 126: .long Entry5_16, Entry6_16, Entry7_16, Entry8_16
! 127: .long Entry9_16, Entry10_16, Entry11_16, Entry12_16
! 128: .long Entry13_16, Entry14_16, Entry15_16, Entry16_16
! 129:
! 130: //-------------------------------------------------------
! 131: // local variables for d_parta.s
! 132: //-------------------------------------------------------
//
// The DP_* cells are not referenced by the code in this file, and the
// DP_1x1 .. DP_4x4 labels stored in DP_EntryTable are defined elsewhere.
//
! 133: .globl DP_Count, DP_u, DP_v, DP_0x8000, DP_Color, DP_Pix, DP_EntryTable
! 134: DP_Count: .long 0
! 135: DP_u: .long 0
! 136: DP_v: .long 0
! 137: DP_0x8000: .single 32768.0
! 138: DP_Color: .long 0
! 139: DP_Pix: .long 0
! 140: DP_EntryTable: .long DP_1x1, DP_2x2, DP_3x3, DP_4x4
! 141:
! 142: //
! 143: // advancetable is 8 bytes, but points to the middle of that range so negative
! 144: // offsets will work
! 145: //
// In D_DrawSpans8 the table is always addressed as advancetable+4(,%ecx,4)
// with %ecx equal to -1 or 0: the first dword is the source advance used
// when the t fraction wrapped (one extra texel row), the second dword is the
// ordinary advance.
! 146: .globl advancetable, sstep, tstep, pspantemp, counttemp, jumptemp
! 147: advancetable: .long 0, 0
! 148: sstep: .long 0
! 149: tstep: .long 0
! 150:
// pspantemp: span pointer saved while %ebx is reused inside a span
// counttemp: pixels still to draw in the current span
// jumptemp:  entry address picked from entryvec_table for the last segment
! 151: pspantemp: .long 0
! 152: counttemp: .long 0
! 153: jumptemp: .long 0
! 154:
! 155: // 1/2, 1/3, 1/4, 1/5, 1/6, and 1/7 in 1.31 fixed point (0x40000000 == 0.5)
// D_DrawSpans8 indexes this as reciprocal_table-8(,%ecx,4) with %ecx holding
// the number of steps (2..7), so a step count of 2 selects the first entry.
! 156: reciprocal_table: .long 0x40000000, 0x2aaaaaaa, 0x20000000
! 157: .long 0x19999999, 0x15555555, 0x12492492
! 158:
// Indexed by the same step count; slot 0 is never read because a zero step
// count is handled by LNoSteps before the table lookup.
! 159: entryvec_table: .long 0, LEntry2_8, LEntry3_8, LEntry4_8
! 160: .long LEntry5_8, LEntry6_8, LEntry7_8, LEntry8_8
! 161:
1.1 root 162: .text
163:
164: // out-of-line, rarely-needed clamping code
//
// Each stub is entered by a conditional jump from D_DrawSpans8, fixes up one
// register, and jumps back to the matching LClampReentryN label.  Stubs 0, 2
// and 4 clamp an s coordinate against bbextents; stubs 1, 3 and 5 clamp a t
// coordinate against bbextentt.
//
165:
// Stubs 0 and 1 are reached through an unsigned "ja" that follows a cmpl
// against the extent, so the flags from that cmpl are still live here.  The
// signed "jg" then separates a value that really is above the extent (clamp
// to the extent) from a negative one that only looked large when treated as
// unsigned (clamp to 0).
166: LClampHigh0:
167: movl C(bbextents),%esi
168: jmp LClampReentry0
169: LClampHighOrLow0:
170: jg LClampHigh0
171: xorl %esi,%esi
172: jmp LClampReentry0
173:
174: LClampHigh1:
175: movl C(bbextentt),%edx
176: jmp LClampReentry1
177: LClampHighOrLow1:
178: jg LClampHigh1
179: xorl %edx,%edx
180: jmp LClampReentry1
181:
// Stubs 2..5 serve the end-of-segment coordinates.  The caller tests the low
// bound (signed, against 2048) and the high bound separately, so each side
// has its own entry; the low side clamps to 2048 rather than to 0.
182: LClampLow2:
183: movl $2048,%ebp
184: jmp LClampReentry2
185: LClampHigh2:
186: movl C(bbextents),%ebp
187: jmp LClampReentry2
188:
189: LClampLow3:
190: movl $2048,%ecx
191: jmp LClampReentry3
192: LClampHigh3:
193: movl C(bbextentt),%ecx
194: jmp LClampReentry3
195:
196: LClampLow4:
197: movl $2048,%eax
198: jmp LClampReentry4
199: LClampHigh4:
200: movl C(bbextents),%eax
201: jmp LClampReentry4
202:
203: LClampLow5:
204: movl $2048,%ebx
205: jmp LClampReentry5
206: LClampHigh5:
207: movl C(bbextentt),%ebx
208: jmp LClampReentry5
209:
210:
//
// D_DrawSpans8
//
// Walks a linked list of spans and texture-maps each one into the 8-bpp
// destination buffer.  Texture coordinates are computed exactly (with a
// divide) only at the ends of 8-pixel segments and stepped linearly in
// between; the divide for the next segment is started early so that it can
// run on the FPU while the integer unit draws pixels.
//
// In:   one argument on the stack, the address of the first span.  After the
//       four register pushes below it sits at 4+16(%esp).  Span fields are
//       reached through espan_t_u / espan_t_v / espan_t_count /
//       espan_t_pnext; those offsets, and C(d_scantable), are not defined in
//       this file (presumably they come from the included headers).
// Out:  nothing.  %ebp, %edi, %esi and %ebx are saved and restored; %eax,
//       %ecx, %edx and the flags are overwritten.  Each span removes the
//       three FPU values it leaves behind (at LEndSpan) before moving on.
// Uses: the static cells in .data (s, t, snext, tnext, sfracf, tfracf,
//       pbase, advancetable, tstep, pspantemp, counttemp, jumptemp, ...), so
//       the routine keeps state outside its stack frame.
//
211: #define pspans 4+16
212:
213: .align 4
214: .globl C(D_DrawSpans8)
215: C(D_DrawSpans8):
216: pushl %ebp // preserve caller's stack frame
217: pushl %edi
218: pushl %esi // preserve register variables
219: pushl %ebx
220:
221: //
222: // set up scaled-by-8 steps, for 8-long segments; also set up cacheblock
223: // and span list pointers
224: //
225: // TODO: any overlap from rearranging?
226: flds C(d_sdivzstepu)
227: fmuls fp_8
228: movl C(cacheblock),%edx
229: flds C(d_tdivzstepu)
230: fmuls fp_8
231: movl pspans(%esp),%ebx // point to the first span descriptor
232: flds C(d_zistepu)
233: fmuls fp_8
234: movl %edx,pbase // pbase = cacheblock
// the three products are popped in the reverse of the order they were pushed
235: fstps zi8stepu
236: fstps tdivz8stepu
237: fstps sdivz8stepu
238:
// ---- per-span loop: %ebx = address of the span to draw ----
239: LSpanLoop:
240: //
241: // set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
242: // initial s and t values
243: //
244: // FIXME: pipeline FILD?
245: fildl espan_t_v(%ebx)
246: fildl espan_t_u(%ebx)
247:
248: fld %st(1) // dv | du | dv
249: fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv
250: fld %st(1) // du | dv*d_sdivzstepv | du | dv
251: fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
252: fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
253: fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu |
254: // dv*d_sdivzstepv | du | dv
255: fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu |
256: // dv*d_sdivzstepv | du | dv
257: faddp %st(0),%st(2) // du*d_tdivzstepu |
258: // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
259: fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
260: // du*d_tdivzstepu | du | dv
261: fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv |
262: // du*d_tdivzstepu | du | dv
263: fmuls C(d_tdivzstepv) // dv*d_tdivzstepv |
264: // du*d_sdivzstepu + dv*d_sdivzstepv |
265: // du*d_tdivzstepu | du | dv
266: fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
267: // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
268: fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +
269: // du*d_sdivzstepu; stays in %st(2) at end
270: fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
271: // s/z
272: fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv |
273: // du*d_tdivzstepu | du | s/z
274: fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv |
275: // du*d_tdivzstepu | du | s/z
276: faddp %st(0),%st(2) // dv*d_zistepv |
277: // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
278: fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu |
279: // dv*d_zistepv | s/z
280: fmuls C(d_zistepu) // du*d_zistepu |
281: // dv*d_tdivzstepv + du*d_tdivzstepu |
282: // dv*d_zistepv | s/z
283: fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu |
284: // du*d_zistepu | dv*d_zistepv | s/z
285: fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv +
286: // du*d_tdivzstepu; stays in %st(1) at end
287: fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z
288: faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z
289:
290: flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
291: fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
292: fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv +
293: // du*d_zistepu; stays in %st(0) at end
294: // 1/z | fp_64k | t/z | s/z
295: //
296: // calculate and clamp s & t
297: //
// the divide below is left running while the integer code that follows
// works out the destination address
298: fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z
299:
300: //
301: // point %edi to the first pixel in the span
302: //
303: movl C(d_viewbuffer),%ecx
304: movl espan_t_v(%ebx),%eax
305: movl %ebx,pspantemp // preserve spans pointer
306:
307: movl C(tadjust),%edx
308: movl C(sadjust),%esi
309: movl C(d_scantable)(,%eax,4),%edi // v * screenwidth
310: addl %ecx,%edi
311: movl espan_t_u(%ebx),%ecx
312: addl %ecx,%edi // pdest = &pdestspan[scans->u];
313: movl espan_t_count(%ebx),%ecx
314:
315: //
316: // now start the FDIV for the end of the span
317: //
// three cases, chosen by the pixel count in %ecx:
//   more than 8 pixels -> LSetupNotLast1 (advance by a full 8-pixel step)
//   2..8 pixels        -> fall through   (advance by count-1 pixels)
//   exactly 1 pixel    -> LCleanup1      (no second divide is needed)
318: cmpl $8,%ecx
319: ja LSetupNotLast1
320:
321: decl %ecx
322: jz LCleanup1 // if only one pixel, no need to start an FDIV
323: movl %ecx,spancountminus1
324:
325: // finish up the s and t calcs
326: fxch %st(1) // z*64k | 1/z | t/z | s/z
327:
328: fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
1.1.1.2 ! root 329: fmul %st(4),%st // s | z*64k | 1/z | t/z | s/z
1.1 root 330: fxch %st(1) // z*64k | s | 1/z | t/z | s/z
1.1.1.2 ! root 331: fmul %st(3),%st // t | s | 1/z | t/z | s/z
1.1 root 332: fxch %st(1) // s | t | 1/z | t/z | s/z
333: fistpl s // 1/z | t | t/z | s/z
334: fistpl t // 1/z | t/z | s/z
335:
336: fildl spancountminus1
337:
338: flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1
339: flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
340: fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
341: fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
342: fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
343: fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
344: fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
345: // C(d_tdivzstepu)*scm1
346: fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
347: // C(d_tdivzstepu)*scm1
348: faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
349: fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
350: faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
351: faddp %st(0),%st(3)
352:
353: flds fp_64k
1.1.1.2 ! root 354: fdiv %st(1),%st // this is what we've gone to all this trouble to
1.1 root 355: // overlap
356: jmp LFDIVInFlight1
357:
358: LCleanup1:
359: // finish up the s and t calcs
360: fxch %st(1) // z*64k | 1/z | t/z | s/z
361:
362: fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
1.1.1.2 ! root 363: fmul %st(4),%st // s | z*64k | 1/z | t/z | s/z
1.1 root 364: fxch %st(1) // z*64k | s | 1/z | t/z | s/z
1.1.1.2 ! root 365: fmul %st(3),%st // t | s | 1/z | t/z | s/z
1.1 root 366: fxch %st(1) // s | t | 1/z | t/z | s/z
367: fistpl s // 1/z | t | t/z | s/z
368: fistpl t // 1/z | t/z | s/z
369: jmp LFDIVInFlight1
370:
371: .align 4
372: LSetupNotLast1:
373: // finish up the s and t calcs
374: fxch %st(1) // z*64k | 1/z | t/z | s/z
375:
376: fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
1.1.1.2 ! root 377: fmul %st(4),%st // s | z*64k | 1/z | t/z | s/z
1.1 root 378: fxch %st(1) // z*64k | s | 1/z | t/z | s/z
1.1.1.2 ! root 379: fmul %st(3),%st // t | s | 1/z | t/z | s/z
1.1 root 380: fxch %st(1) // s | t | 1/z | t/z | s/z
381: fistpl s // 1/z | t | t/z | s/z
382: fistpl t // 1/z | t/z | s/z
383:
// advance 1/z, s/z and t/z by the precomputed 8-pixel steps
384: fadds zi8stepu
385: fxch %st(2)
386: fadds sdivz8stepu
387: fxch %st(2)
388: flds tdivz8stepu
389: faddp %st(0),%st(2)
390: flds fp_64k
1.1.1.2 ! root 391: fdiv %st(1),%st // z = 1/1/z
! 392: // this is what we've gone to all this trouble to
! 393: // overlap
// ---- common path; here %esi = sadjust, %edx = tadjust, %ecx = pixel count
// (already decremented if the span had at most 8 pixels) ----
1.1 root 394: LFDIVInFlight1:
395:
396: addl s,%esi
397: addl t,%edx
398: movl C(bbextents),%ebx
399: movl C(bbextentt),%ebp
// unsigned compare: a negative coordinate also shows up as "above", and the
// out-of-line stub sorts out which bound was crossed
400: cmpl %ebx,%esi
401: ja LClampHighOrLow0
402: LClampReentry0:
403: movl %esi,s
404: movl pbase,%ebx
405: shll $16,%esi
406: cmpl %ebp,%edx
407: movl %esi,sfracf
408: ja LClampHighOrLow1
409: LClampReentry1:
410: movl %edx,t
411: movl s,%esi // sfrac = scans->sfrac;
412: shll $16,%edx
413: movl t,%eax // tfrac = scans->tfrac;
414: sarl $16,%esi
415: movl %edx,tfracf
416:
417: //
418: // calculate the texture starting address
419: //
420: sarl $16,%eax
421: movl C(cachewidth),%edx
1.1.1.2 ! root 422: imul %edx,%eax // (tfrac >> 16) * cachewidth
1.1 root 423: addl %ebx,%esi
424: addl %eax,%esi // psource = pbase + (sfrac >> 16) +
425: // ((tfrac >> 16) * cachewidth);
426:
427: //
428: // determine whether last span or not
429: //
430: cmpl $8,%ecx
431: jna LLastSegment
432:
433: //
434: // not the last segment; do full 8-wide segment
435: //
436: LNotLastSegment:
437:
438: //
439: // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
440: // get there
441: //
442:
443: // pick up after the FDIV that was left in flight previously
444:
1.1.1.2 ! root 445: fld %st(0) // duplicate it
! 446: fmul %st(4),%st // s = s/z * z
1.1 root 447: fxch %st(1)
1.1.1.2 ! root 448: fmul %st(3),%st // t = t/z * z
1.1 root 449: fxch %st(1)
450: fistpl snext
451: fistpl tnext
452: movl snext,%eax
453: movl tnext,%edx
454:
455: movb (%esi),%bl // get first source texel
456: subl $8,%ecx // count off this segment's pixels
457: movl C(sadjust),%ebp
458: movl %ecx,counttemp // remember count of remaining pixels
459:
460: movl C(tadjust),%ecx
461: movb %bl,(%edi) // store first dest pixel
462:
463: addl %eax,%ebp
464: addl %edx,%ecx
465:
466: movl C(bbextents),%eax
467: movl C(bbextentt),%edx
468:
// end-of-segment s (%ebp) and t (%ecx) are clamped to [2048, extent]
469: cmpl $2048,%ebp
470: jl LClampLow2
471: cmpl %eax,%ebp
472: ja LClampHigh2
473: LClampReentry2:
474:
475: cmpl $2048,%ecx
476: jl LClampLow3
477: cmpl %edx,%ecx
478: ja LClampHigh3
479: LClampReentry3:
480:
481: movl %ebp,snext
482: movl %ecx,tnext
483:
484: subl s,%ebp
485: subl t,%ecx
486:
487: //
488: // set up advancetable
489: //
// %ebp and %ecx now hold the s and t change across the 8-pixel segment.  The
// shift counts used below are 19 (= 16 + 3) and 13 (= 16 - 3), so the divide
// by 8 is folded into the whole/fraction split.
490: movl %ecx,%eax
491: movl %ebp,%edx
492: sarl $19,%eax // tstep >>= 16;
493: jz LZero
494: sarl $19,%edx // sstep >>= 16;
495: movl C(cachewidth),%ebx
1.1.1.2 ! root 496: imul %ebx,%eax
1.1 root 497: jmp LSetUp1
498:
// whole-texel t step is zero, so the multiply by cachewidth is skipped
499: LZero:
500: sarl $19,%edx // sstep >>= 16;
501: movl C(cachewidth),%ebx
502:
503: LSetUp1:
504:
505: addl %edx,%eax // add in sstep
506: // (tstep >> 16) * cachewidth + (sstep >> 16);
507: movl tfracf,%edx
508: movl %eax,advancetable+4 // advance base in t
509: addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth +
510: // (sstep >> 16);
511: shll $13,%ebp // left-justify sstep fractional part
512: movl sfracf,%ebx
513: shll $13,%ecx // left-justify tstep fractional part
514: movl %eax,advancetable // advance extra in t
515:
// Per-pixel stepping idiom used from here to the end of the segment:
//   addl tstep -> carry set when the t fraction wraps
//   sbbl       -> %ecx = -1 on wrap, 0 otherwise
//   addl %ebp  -> carry set when the s fraction wraps
//   adcl       -> %esi += advancetable[%ecx + 1] + s carry
// The texel loaded into %al is stored one step later than it is fetched.
516: movl %ecx,tstep
517: addl %ecx,%edx // advance tfrac fractional part by tstep frac
518:
519: sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none)
520: addl %ebp,%ebx // advance sfrac fractional part by sstep frac
521: adcl advancetable+4(,%ecx,4),%esi // point to next source texel
522:
523: addl tstep,%edx
524: sbbl %ecx,%ecx
525: movb (%esi),%al
526: addl %ebp,%ebx
527: movb %al,1(%edi)
528: adcl advancetable+4(,%ecx,4),%esi
529:
530: addl tstep,%edx
531: sbbl %ecx,%ecx
532: addl %ebp,%ebx
533: movb (%esi),%al
534: adcl advancetable+4(,%ecx,4),%esi
535:
536: addl tstep,%edx
537: sbbl %ecx,%ecx
538: movb %al,2(%edi)
539: addl %ebp,%ebx
540: movb (%esi),%al
541: adcl advancetable+4(,%ecx,4),%esi
542:
543: addl tstep,%edx
544: sbbl %ecx,%ecx
545: movb %al,3(%edi)
546: addl %ebp,%ebx
547: movb (%esi),%al
548: adcl advancetable+4(,%ecx,4),%esi
549:
550:
551: //
552: // start FDIV for end of next segment in flight, so it can overlap
553: //
554: movl counttemp,%ecx
555: cmpl $8,%ecx // more than one segment after this?
556: ja LSetupNotLast2 // yes
557:
558: decl %ecx
559: jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV
560: movl %ecx,spancountminus1
561: fildl spancountminus1
562:
563: flds C(d_zistepu) // C(d_zistepu) | spancountminus1
564: fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1
565: flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
566: fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
567: fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
568: faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1
569: fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1
570: fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
571: fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
572: faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
573: flds fp_64k // 64k | C(d_sdivzstepu)*scm1
574: fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k
575: faddp %st(0),%st(4) // 64k
576:
1.1.1.2 ! root 577: fdiv %st(1),%st // this is what we've gone to all this trouble to
1.1 root 578: // overlap
579: jmp LFDIVInFlight2
580:
581: .align 4
582: LSetupNotLast2:
583: fadds zi8stepu
584: fxch %st(2)
585: fadds sdivz8stepu
586: fxch %st(2)
587: flds tdivz8stepu
588: faddp %st(0),%st(2)
589: flds fp_64k
1.1.1.2 ! root 590: fdiv %st(1),%st // z = 1/1/z
! 591: // this is what we've gone to all this trouble to
! 592: // overlap
// second half of the segment; %ecx is the remaining count, already
// decremented unless more than 8 pixels are left
1.1 root 593: LFDIVInFlight2:
594: movl %ecx,counttemp
595:
596: addl tstep,%edx
597: sbbl %ecx,%ecx
598: movb %al,4(%edi)
599: addl %ebp,%ebx
600: movb (%esi),%al
601: adcl advancetable+4(,%ecx,4),%esi
602:
603: addl tstep,%edx
604: sbbl %ecx,%ecx
605: movb %al,5(%edi)
606: addl %ebp,%ebx
607: movb (%esi),%al
608: adcl advancetable+4(,%ecx,4),%esi
609:
610: addl tstep,%edx
611: sbbl %ecx,%ecx
612: movb %al,6(%edi)
613: addl %ebp,%ebx
614: movb (%esi),%al
615: adcl advancetable+4(,%ecx,4),%esi
616:
// the segment's end point becomes the next segment's start point
617: addl $8,%edi
618: movl %edx,tfracf
619: movl snext,%edx
620: movl %ebx,sfracf
621: movl tnext,%ebx
622: movl %edx,s
623: movl %ebx,t
624:
625: movl counttemp,%ecx // retrieve count
626:
627: //
628: // determine whether last span or not
629: //
630: cmpl $8,%ecx // are there multiple segments remaining?
// eighth pixel of the segment just finished (%edi has already moved on by 8);
// movb leaves the flags from the cmpl intact for the branch below
631: movb %al,-1(%edi)
632: ja LNotLastSegment // yes
633:
634: //
635: // last segment of scan
636: //
637: LLastSegment:
638:
639: //
640: // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
641: // get there. The number of pixels left is variable, and we want to land on the
642: // last pixel, not step one past it, so we can't run into arithmetic problems
643: //
// on every path that reaches this label %ecx holds the pixel count minus one,
// i.e. the number of steps still to take
644: testl %ecx,%ecx
645: jz LNoSteps // just draw the last pixel and we're done
646:
647: // pick up after the FDIV that was left in flight previously
648:
649:
1.1.1.2 ! root 650: fld %st(0) // duplicate it
! 651: fmul %st(4),%st // s = s/z * z
1.1 root 652: fxch %st(1)
1.1.1.2 ! root 653: fmul %st(3),%st // t = t/z * z
1.1 root 654: fxch %st(1)
655: fistpl snext
656: fistpl tnext
657:
658: movb (%esi),%al // load first texel in segment
659: movl C(tadjust),%ebx
660: movb %al,(%edi) // store first pixel in segment
661: movl C(sadjust),%eax
662:
663: addl snext,%eax
664: addl tnext,%ebx
665:
666: movl C(bbextents),%ebp
667: movl C(bbextentt),%edx
668:
669: cmpl $2048,%eax
670: jl LClampLow4
671: cmpl %ebp,%eax
672: ja LClampHigh4
673: LClampReentry4:
674: movl %eax,snext
675:
676: cmpl $2048,%ebx
677: jl LClampLow5
678: cmpl %edx,%ebx
679: ja LClampHigh5
680: LClampReentry5:
681:
682: cmpl $1,%ecx // don't bother
683: je LOnlyOneStep // if two pixels in segment, there's only one step,
684: // of the segment length
685: subl s,%eax
686: subl t,%ebx
687:
688: addl %eax,%eax // convert to 15.17 format so multiply by 1.31
689: addl %ebx,%ebx // reciprocal yields 16.48
690:
// one-operand imull: %edx:%eax = %eax * table entry; only the high half in
// %edx is kept
691: imull reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)
692: movl %edx,%ebp
693:
694: movl %ebx,%eax
695: imull reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)
696:
// here %ebp = sstep and %edx = tstep, both with 16 fractional bits
697: LSetEntryvec:
698: //
699: // set up advancetable
700: //
701: movl entryvec_table(,%ecx,4),%ebx
702: movl %edx,%eax
703: movl %ebx,jumptemp // entry point into code for RET later
704: movl %ebp,%ecx
705: sarl $16,%edx // tstep >>= 16;
706: movl C(cachewidth),%ebx
707: sarl $16,%ecx // sstep >>= 16;
1.1.1.2 ! root 708: imul %ebx,%edx
1.1 root 709:
710: addl %ecx,%edx // add in sstep
711: // (tstep >> 16) * cachewidth + (sstep >> 16);
712: movl tfracf,%ecx
713: movl %edx,advancetable+4 // advance base in t
714: addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth +
715: // (sstep >> 16);
716: shll $16,%ebp // left-justify sstep fractional part
717: movl sfracf,%ebx
718: shll $16,%eax // left-justify tstep fractional part
719: movl %edx,advancetable // advance extra in t
720:
// take the first step here so every LEntryN_8 handler starts with %esi
// already on the second texel
721: movl %eax,tstep
722: movl %ecx,%edx
723: addl %eax,%edx
724: sbbl %ecx,%ecx
725: addl %ebp,%ebx
726: adcl advancetable+4(,%ecx,4),%esi
727:
728: jmp *jumptemp // jump to the number-of-pixels handler
729:
730: //----------------------------------------
731:
// single-pixel tail: bias %edi so the shared store at 7(%edi) after LEndSpan
// lands on the current pixel
732: LNoSteps:
733: movb (%esi),%al // load first texel in segment
734: subl $7,%edi // adjust for hardwired offset
735: jmp LEndSpan
736:
737:
// one step only: the step equals the whole delta, so no reciprocal multiply
738: LOnlyOneStep:
739: subl s,%eax
740: subl t,%ebx
741: movl %eax,%ebp
742: movl %ebx,%edx
743: jmp LSetEntryvec
744:
745: //----------------------------------------
746:
// LEntryN_8 handles a final segment of N pixels.  The unrolled tail below
// stores to fixed offsets 1..7 from %edi, so each handler first moves %edi
// back by 8-N to make the last pixel land on offset 7, then joins the
// unrolled code at the matching LLEntryN_8 label.
1.1.1.2 ! root 747: LEntry2_8:
1.1 root 748: subl $6,%edi // adjust for hardwired offsets
749: movb (%esi),%al
750: jmp LLEntry2_8
751:
752: //----------------------------------------
753:
1.1.1.2 ! root 754: LEntry3_8:
1.1 root 755: subl $5,%edi // adjust for hardwired offsets
756: addl %eax,%edx
757: movb (%esi),%al
758: sbbl %ecx,%ecx
759: addl %ebp,%ebx
760: adcl advancetable+4(,%ecx,4),%esi
761: jmp LLEntry3_8
762:
763: //----------------------------------------
764:
1.1.1.2 ! root 765: LEntry4_8:
1.1 root 766: subl $4,%edi // adjust for hardwired offsets
767: addl %eax,%edx
768: movb (%esi),%al
769: sbbl %ecx,%ecx
770: addl %ebp,%ebx
771: adcl advancetable+4(,%ecx,4),%esi
772: addl tstep,%edx
773: jmp LLEntry4_8
774:
775: //----------------------------------------
776:
1.1.1.2 ! root 777: LEntry5_8:
1.1 root 778: subl $3,%edi // adjust for hardwired offsets
779: addl %eax,%edx
780: movb (%esi),%al
781: sbbl %ecx,%ecx
782: addl %ebp,%ebx
783: adcl advancetable+4(,%ecx,4),%esi
784: addl tstep,%edx
785: jmp LLEntry5_8
786:
787: //----------------------------------------
788:
1.1.1.2 ! root 789: LEntry6_8:
1.1 root 790: subl $2,%edi // adjust for hardwired offsets
791: addl %eax,%edx
792: movb (%esi),%al
793: sbbl %ecx,%ecx
794: addl %ebp,%ebx
795: adcl advancetable+4(,%ecx,4),%esi
796: addl tstep,%edx
797: jmp LLEntry6_8
798:
799: //----------------------------------------
800:
1.1.1.2 ! root 801: LEntry7_8:
1.1 root 802: decl %edi // adjust for hardwired offsets
803: addl %eax,%edx
804: movb (%esi),%al
805: sbbl %ecx,%ecx
806: addl %ebp,%ebx
807: adcl advancetable+4(,%ecx,4),%esi
808: addl tstep,%edx
809: jmp LLEntry7_8
810:
811: //----------------------------------------
812:
// full 8-pixel final segment: no %edi adjustment, falls straight through the
// whole unrolled tail
1.1.1.2 ! root 813: LEntry8_8:
1.1 root 814: addl %eax,%edx
815: movb (%esi),%al
816: sbbl %ecx,%ecx
817: addl %ebp,%ebx
818: adcl advancetable+4(,%ecx,4),%esi
819:
820: addl tstep,%edx
821: sbbl %ecx,%ecx
822: movb %al,1(%edi)
823: addl %ebp,%ebx
824: movb (%esi),%al
825: adcl advancetable+4(,%ecx,4),%esi
826: addl tstep,%edx
// each LLEntryN_8 join point expects the carry flag from a just-executed
// "addl tstep,%edx" (or "addl %eax,%edx"), which is why every handler above
// ends with one before jumping here
827: LLEntry7_8:
828: sbbl %ecx,%ecx
829: movb %al,2(%edi)
830: addl %ebp,%ebx
831: movb (%esi),%al
832: adcl advancetable+4(,%ecx,4),%esi
833: addl tstep,%edx
834: LLEntry6_8:
835: sbbl %ecx,%ecx
836: movb %al,3(%edi)
837: addl %ebp,%ebx
838: movb (%esi),%al
839: adcl advancetable+4(,%ecx,4),%esi
840: addl tstep,%edx
841: LLEntry5_8:
842: sbbl %ecx,%ecx
843: movb %al,4(%edi)
844: addl %ebp,%ebx
845: movb (%esi),%al
846: adcl advancetable+4(,%ecx,4),%esi
847: addl tstep,%edx
848: LLEntry4_8:
849: sbbl %ecx,%ecx
850: movb %al,5(%edi)
851: addl %ebp,%ebx
852: movb (%esi),%al
853: adcl advancetable+4(,%ecx,4),%esi
854: LLEntry3_8:
855: movb %al,6(%edi)
856: movb (%esi),%al
857: LLEntry2_8:
858:
// %al = last texel of the span; it is stored at 7(%edi) a few lines below
859: LEndSpan:
860:
861: //
862: // clear s/z, t/z, 1/z from FP stack
863: //
864: fstp %st(0)
865: fstp %st(0)
866: fstp %st(0)
867:
868: movl pspantemp,%ebx // restore spans pointer
869: movl espan_t_pnext(%ebx),%ebx // point to next span
870: testl %ebx,%ebx // any more spans?
871: movb %al,7(%edi)
872: jnz LSpanLoop // more spans
873:
874: popl %ebx // restore register variables
875: popl %esi
876: popl %edi
877: popl %ebp // restore the caller's stack frame
878: ret
879:
880: //----------------------------------------------------------------------
881: // 8-bpp horizontal span z drawing code for polygons, with no transparency.
882: //
883: // Assumes there is at least one span in pzspans, and that every span
884: // contains at least one pixel
885: //----------------------------------------------------------------------
886:
.text

//
// Out-of-line z-clamp stubs for D_DrawZSpans.  They are entered when the
// span's starting 1/z compares greater than 0.5 (z nearer than 2).  The
// whole span is then written with one constant depth: %edx is forced to 0.5
// in 1.31 fixed point and the per-pixel step in %ebx is forced to 0.
//
// Zeroing %ebx only applies to the span being drawn; D_DrawZSpans reloads
// %ebx from izistep at the top of every span iteration.
//

// z-clamp on a non-negative gradient span
LClamp:
	movl	$0x40000000,%edx	// izi = 0.5 * 2**31
	xorl	%ebx,%ebx		// no step across this span
	fstp	%st(0)			// discard the unconverted 1/z
	jmp	LZDraw

// z-clamp on a negative gradient span
LClampNeg:
	movl	$0x40000000,%edx	// izi = 0.5 * 2**31
	xorl	%ebx,%ebx		// no step across this span
	fstp	%st(0)			// discard the unconverted 1/z
	jmp	LZDrawNeg


//----------------------------------------------------------------------
// D_DrawZSpans
//
// Walks a linked list of spans and fills the 16-bit z-buffer with the top
// 16 bits of a 32-bit fixed-point 1/z that is stepped once per pixel.
//
// In:   one argument on the stack, the address of the first span.  After the
//       four register pushes it sits at 4+16(%esp).  Span fields are reached
//       through espan_t_u / espan_t_v / espan_t_count / espan_t_pnext, whose
//       offsets are not defined in this file.
// Out:  nothing.  %ebp, %edi, %esi and %ebx are saved and restored; %eax,
//       %ecx, %edx and the flags are overwritten.  The FPU stack is left
//       empty.
// Uses: izistep and izi in .data as integer staging cells.
//
// Inside either span loop:
//       %ebx = izistep (0 for a clamped span)   %ecx = pixel count
//       %edx = current izi                      %edi = destination
//       %esi / %ebp = scratch while drawing; the span pointer is parked on
//       the stack meanwhile
//
// Fix relative to the previous revision: izistep used to be loaded into
// %ebx once, before the loop.  LClamp zeroes %ebx, so after the first
// clamped span every later span in the list was drawn with a step of 0.
// %ebx is now reloaded from izistep for every span.
//----------------------------------------------------------------------

#define pzspans	4+16

.globl C(D_DrawZSpans)
C(D_DrawZSpans):
	pushl	%ebp			// preserve caller's stack frame
	pushl	%edi
	pushl	%esi			// preserve register variables
	pushl	%ebx

	flds	C(d_zistepu)
	movl	C(d_zistepu),%eax
	movl	pzspans(%esp),%esi

// NOTE(review): this selects the "Neg" loop only when the bit pattern of
// d_zistepu is exactly zero, not when the gradient is negative.  Negative
// gradients go through the loop below, where the signed izistep is simply
// added.  Left as found.
	test	%eax,%eax
	jz	LFNegSpan

	fmuls	Float2ToThe31nd
	fistpl	izistep			// note: we are relying on FP exceptions being turned
					// off here to avoid range problems

LFSpanLoop:
	movl	izistep,%ebx		// reload per span: LClamp zeroes %ebx

// set up the initial 1/z value
	fildl	espan_t_v(%esi)
	fildl	espan_t_u(%esi)
	movl	espan_t_v(%esi),%ecx
	movl	C(d_pzbuffer),%edi
	fmuls	C(d_zistepu)
	fxch	%st(1)
	fmuls	C(d_zistepv)
	fxch	%st(1)
	fadds	C(d_ziorigin)
	imull	C(d_zrowbytes),%ecx
	faddp	%st(0),%st(1)

// clamp if z is nearer than 2 (1/z > 0.5)
	fcoms	float_point5
	addl	%ecx,%edi
	movl	espan_t_u(%esi),%edx
	addl	%edx,%edx		// word count
	mov	espan_t_count(%esi),%ecx
	addl	%edx,%edi		// pdest = &pdestspan[scans->u];
	pushl	%esi			// preserve spans pointer
	fnstsw				// FPU status word -> %ax
	testb	$0x45,%ah		// C3, C2 and C0 all clear means 1/z > 0.5
	jz	LClamp

	fmuls	Float2ToThe31nd
	fistpl	izi			// note: we are relying on FP exceptions being turned
					// off here to avoid problems when the span is closer
					// than 1/(2**31)
	movl	izi,%edx

// at this point:
//		%ebx = izistep (0 if the span was clamped)
//		%ecx = count
//		%edx = izi
//		%edi = pdest

LZDraw:

// do a single pixel up front, if necessary to dword align the destination
	testl	$2,%edi
	jz	LFMiddle
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	decl	%ecx
	movw	%ax,(%edi)
	addl	$2,%edi

// do middle a pair of aligned dwords at a time
LFMiddle:
	pushl	%ecx			// keep the count; its low bit is needed at LFLast
	shrl	$1,%ecx			// count / 2
	jz	LFLast			// no aligned dwords to do
	shrl	$1,%ecx			// (count / 2) / 2
	jnc	LFMiddleLoop		// even number of aligned dwords to do

// odd number of dwords: write one now so the loop can work in pairs
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl	%esi,%eax
	movl	%eax,(%edi)
	addl	$4,%edi
	andl	%ecx,%ecx
	jz	LFLast

// four pixels per iteration, packed into two dwords
LFMiddleLoop:
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl	%esi,%eax
	movl	%edx,%ebp
	movl	%eax,(%edi)
	addl	%ebx,%edx
	shrl	$16,%ebp
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl	%esi,%ebp
	movl	%ebp,4(%edi)		// FIXME: eliminate register contention
	addl	$8,%edi

	decl	%ecx
	jnz	LFMiddleLoop

LFLast:
	popl	%ecx			// retrieve count
	popl	%esi			// retrieve span pointer

// do the last, unaligned pixel, if there is one
	andl	$1,%ecx			// is there an odd pixel left to do?
	jz	LFSpanDone		// no
	shrl	$16,%edx
	movw	%dx,(%edi)		// do the final pixel's z

LFSpanDone:
	movl	espan_t_pnext(%esi),%esi
	testl	%esi,%esi
	jnz	LFSpanLoop

	jmp	LFDone

LFNegSpan:
	fmuls	FloatMinus2ToThe31nd
	fistpl	izistep			// note: we are relying on FP exceptions being turned
					// off here to avoid range problems

LFNegSpanLoop:
	movl	izistep,%ebx		// reload per span: LClampNeg zeroes %ebx

// set up the initial 1/z value
	fildl	espan_t_v(%esi)
	fildl	espan_t_u(%esi)
	movl	espan_t_v(%esi),%ecx
	movl	C(d_pzbuffer),%edi
	fmuls	C(d_zistepu)
	fxch	%st(1)
	fmuls	C(d_zistepv)
	fxch	%st(1)
	fadds	C(d_ziorigin)
	imull	C(d_zrowbytes),%ecx
	faddp	%st(0),%st(1)

// clamp if z is nearer than 2 (1/z > 0.5)
	fcoms	float_point5
	addl	%ecx,%edi
	movl	espan_t_u(%esi),%edx
	addl	%edx,%edx		// word count
	mov	espan_t_count(%esi),%ecx
	addl	%edx,%edi		// pdest = &pdestspan[scans->u];
	pushl	%esi			// preserve spans pointer
	fnstsw				// FPU status word -> %ax
	testb	$0x45,%ah		// C3, C2 and C0 all clear means 1/z > 0.5
	jz	LClampNeg

	fmuls	Float2ToThe31nd
	fistpl	izi			// note: we are relying on FP exceptions being turned
					// off here to avoid problems when the span is closer
					// than 1/(2**31)
	movl	izi,%edx

// at this point:
//		%ebx = izistep (0 if the span was clamped), subtracted per pixel
//		%ecx = count
//		%edx = izi
//		%edi = pdest

LZDrawNeg:

// do a single pixel up front, if necessary to dword align the destination
	testl	$2,%edi
	jz	LFNegMiddle
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	decl	%ecx
	movw	%ax,(%edi)
	addl	$2,%edi

// do middle a pair of aligned dwords at a time
LFNegMiddle:
	pushl	%ecx			// keep the count; its low bit is needed at LFNegLast
	shrl	$1,%ecx			// count / 2
	jz	LFNegLast		// no aligned dwords to do
	shrl	$1,%ecx			// (count / 2) / 2
	jnc	LFNegMiddleLoop		// even number of aligned dwords to do

// odd number of dwords: write one now so the loop can work in pairs
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl	%esi,%eax
	movl	%eax,(%edi)
	addl	$4,%edi
	andl	%ecx,%ecx
	jz	LFNegLast

// four pixels per iteration, packed into two dwords
LFNegMiddleLoop:
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl	%esi,%eax
	movl	%edx,%ebp
	movl	%eax,(%edi)
	subl	%ebx,%edx
	shrl	$16,%ebp
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl	%esi,%ebp
	movl	%ebp,4(%edi)		// FIXME: eliminate register contention
	addl	$8,%edi

	decl	%ecx
	jnz	LFNegMiddleLoop

LFNegLast:
	popl	%ecx			// retrieve count
	popl	%esi			// retrieve span pointer

// do the last, unaligned pixel, if there is one
	andl	$1,%ecx			// is there an odd pixel left to do?
	jz	LFNegSpanDone		// no
	shrl	$16,%edx
	movw	%dx,(%edi)		// do the final pixel's z

LFNegSpanDone:
	movl	espan_t_pnext(%esi),%esi
	testl	%esi,%esi
	jnz	LFNegSpanLoop

LFDone:
	popl	%ebx			// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp			// restore the caller's stack frame
	ret
1152:
1153: #endif // id386
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.