|
|
1.1 root 1: .data
2: .asciz "@(#)Fmuld.s 1.1 86/02/03 Copyr 1985 Sun Micro"
3: .even
4: .text
5:
6: | Copyright (c) 1985 by Sun Microsystems, Inc.
7:
8: #include "fpcrtdefs.h"
9:
10: /*
11: * double-precision floating math run-time support
12: *
13: * copyright 1981, 1982 Richard E. James III
14: * translated to SUN idiom 10/11 March 1983 rt
15: * parameter passing re-done 22 July 1983 rt
16: */
17:
18: ARG2PTR = a0
19:
20: /*
21: * ieee double floating divide
22: * copyright 1981, Richard E. James III
23: * translated to SUN idiom 10 March 1983 rt
24: */
25:
26: /*
27: * entry conditions:
28: * first argument in d0/d1
29: * second argument on stack
30: * exit conditions:
31: * result (8 bytes) in d0/d1
32: *
33: * register conventions:
34: * d0/d1 divisor
35: * d2/d3 dividend
36: * d4 count
37: * d5 sign an exponent
38: * d6 result exponent
39: * d6/d7 quotient
40: */
41: SAVEMASK = 0x3f00 | registers d2-d7
42: RESTMASK = 0xfc
43: NSAVED = 6*4 | 6 registers * sizeof(register)
44:
45: RTENTRY(Fdivd)
46: | save registers and load operands into registers
47: moveml #0x3f00,sp@- | registers d2-d7
48: movl d0,d2
49: movl d1,d3
50: movl ARG2PTR@+,d0
51: movl ARG2PTR@ ,d1
52: | save sign of result
53: movl d0,d5
54: eorl d2,d5 | sign of result
55: clrl d4 | flag for divide
56: bsrs extrem
57: | compute resulting exponent
58: movw d6,d5
59: subw d7,d5
60: | do top 30-31 bits of divide (d_rcp will post normalize)
61: movw #30,d4 | count 30..0
62: bsr shsub | shift and subtract loop
63: movl d7,d6 | top of answer
64: | do next 22 bits of divide
65: movw #23,d4 | count 23..0 (total = 54-55 bits)
66: bsr shsub
67: | put together answer
68: lsll #8,d7 | line up bottom
69: orl d3,d2
70: beqs 1$
71: bset #1,d7 | sticky bit on if remainder <> 0
72: 1$: lsll #1,d7
73: roxll #1,d6
74: | normalize (once), round, check result exp, pack
75: movl d6,d2 | upper
76: movl d7,d3 | lower
77: movw d5,d6 | exponent
78: addw #1023,d6 | re-bias
79: jbsr d_nrcp | norm once, rnd, ck, pack
80: bra dmsign
81:
82:
83: /*
84: * round, check for over/underflow, and pack in the exponent.
85: * d_nrcp does one normalize and then calls d_rcp.
86: * d_rcp rounds the double value and packs the exponent in,
87: * catching infinity, zero, and denormalized numbers.
88: * d_usel puts together the larger argument.
89: *
90: * input:
91: * d2/d3 mantissa (- if norm)
92: * d6 biased exponent
93: * (need sign, sticky)
94: * output:
95: * d2/d3 most of number, no sign or hidden bit,
96: * waiting to shift sign in.
97: * other:
98: * d4 lost
99: * d5 unchanged
100: */
101:
102: d_nrcp:
103: tstl d2
104: jmi d_rcp | already normalized
105: subqw #1,d6
106: lsll #1,d3 | do extra normalize (for mul/div)
107: roxll #1,d2
108: jra d_rcp
109:
110: /*
111: * subroutine for unpacking one operand, and normalizing a denormalized number
112: * input:
113: * d0/d1 number
114: * output:
115: * d0/d1 mantissa
116: * d7.w exponent
117: * z on iff mantissa is zero_
118: *
119: * unchanged:
120: * d4 bottom = 0xf77
121: */
122:
123: unp: movl d0,d7 | start getting exp
124: andl #0xfffff,d0 | clear out sign and exp
125: swap d7
126: lsrw #(16-1-11),d7
127: andw d4,d7 | expondnt
128: bnes 3$ | normal number
129: | denormalized number or zero:
130: tstl d0 | upper
131: bnes 1$
132: tstl d1 | lower
133: beqs 3$ |zero
134: 1$: addqw #1,d7
135: 2$: subql #1,d7
136: lsll #1,d1
137: roxll #1,d0 | normalize
138: btst #20,d0
139: beqs 2$ | loop until normalized
140: 3$: rts
141:
142:
143: /*
144: * subroutine for extracting and taking care of 0/GU/INF/NAN.
145: * input:
146: * d0/d1, d2/d3
147: * d4 + for div; - for mod
148: *
149: * output:
150: * d0/d1 (bottom) and d2/d3 (top) converted to
151: * 000XXXXX/XXXXXXXX
152: * d6 exponent of top
153: * d7 exponent of bottom
154: *
155: * unchanged:
156: * d5 sign
157: */
158:
159: extrem:
160: movw #0x7ff,d4 | mask, sign.l untouched
161: exg d2,d0
162: exg d3,d1
163: bsrs unp | unpack top
164: exg d0,d2
165: exg d1,d3
166: exg d7,d6
167: beqs topzero | top is zero
168: cmpw d4,d6
169: beqs topbig | top is inf or nan
170: bset #20,d2 | set hidden bit
171: topzero:bsrs unp | unpack bottom
172: beqs botzero | bottom is 0
173: cmpw d4,d7
174: beqs botbig | bottom is inf or nan
175: bset #20,d0
176: lsll #1,d1 | left shift bottom
177: roxll #1,d0
178: rts
179:
180: /*
181: * EXCEPTION HANDLING
182: */
183:
184: topbig: | top is inf or nan
185: bsrs unp | unpack bottom
186: tstl d4
187: bmis isnan | inf or nan / ...
188: cmpw d6,d7
189: beqs isnan | both inf/nan
190: tstl d2 | top
191: beqs geninf | inf / ... = inf
192: bras isnan | nan / ... = nan
193:
194: botzero:| bottom is 0
195: tstl d4
196: bmis gennan | dmod(... 0) = nan
197: orl d2,d3 | top
198: beqs gennan | 0/0 = nan
199: | nonzero/0 = inf
200: | generate infinity for answer
201: geninf: movl #0x7ff00000*2,d2 | infinity
202: bras clrbot
203:
204: botbig: tstl d0 | bottom = nan, result = nan
205: bnes isnan
206: tstl d4
207: bpls genzero | ... / inf = 0
208: addqw #4,sp
209: addw #11,d6 | append sign
210: jbsr d_norm | normalize
211: jbsr d_rcp | adjust exp, ck extremes, pack
212: dmsign: roxll #1,d5 | sign -> x
213: roxrl #1, d2 | rotate in sign
214: | answer is now in:
215: | d2 most significant 32 bits
216: | d3 least significant 32 bits
217: dmexit: movl d2,d0
218: movl d3,d1
219: | restore registers and split
220: moveml sp@+,#RESTMASK
221: RET
222:
223:
224: | invalid operand/operation
225: isnan: cmpw d7,d6
226: bnes 1$ | exponents equal
227: cmpl d0, d2
228: 1$: bges 2$
229: movw d7,d6
230: movl d0,d2
231: 2$: swap d2
232: lslw #(16-1-11),d6
233: orw d6,d2 | put back together
234: swap d2
235: lsll #1,d2
236: cmpl #0x7ff00000*2, d2
237: bhis gotnan | use larger nan
238:
239: gennan: movl #0x7ff00004*2,d2
240: tstl d4
241: bpls gotnan | nan 4 for div
242: addql #2,d2 | nan 5 for mod
243: gotnan: clrl d5
244: bras clrbot
245:
246: genzero:clrl d2
247: clrbot: clrl d3
248: sign: addqw #4,sp
249: bra dmsign
250:
251: /*
252: * the shsub subroutine does a shift-subtract loop
253: * that is the heart of divide and mod.
254: * the algorithm is a simple shift and subtract loop,
255: * but it adds when it overshoots.
256: * why not use the divs/divu instructions? That approach is slower!
257: *
258: * registers:
259: * d2/d3 current dividend (updated)
260: * d0/d1 divisor (unchanged)
261: * d4.w (inpout) number of interations -1, and bit number
262: * d5/d6 -untouched-
263: * d7 quotient being devloped (ignored by mod)
264: */
265:
266:
267: shsub:
268: clrl d7 | quotient
269: | shift once, see if subtract needed
270: 1$: addl d3,d3
271: addxl d2,d2 |(64-bit left shift)
272: cmpl d0,d2
273: dbge d4,1$ | loop while divident is small
274: | tally quotient and subtract
275: bset d4,d7 | quotient bit
276: subl d1,d3
277: subxl d0,d2 | 64-bit subtract
278: dbmi d4,1$ | loop (d4) times
279: | now one of three things has happeded:
280: | 1. count exhausted and extra subtract done (first DB hit count)
281: | 2. count exhausted in second DB
282: | 3. overshot because compare didn't check lower parts
283: bpls 2$ | case 2
284: addl d1,d3 | take care of overshoot
285: addxl d0,d2
286: bclr d4,d7
287: tstw d4
288: dblt d4,1$ | case 3
289: | case 1
290: 2$: rts
291:
292: /*
293: * ieee double floating multiply
294: * copyright 1981, Richard E. James III
295: * translated to SUN idiom 10 March 1983 rt
296: */
297:
298: /*
299: Revised to do correct IEEE rounding by dgh 24 Aug 84
300: * Conventions in the main multiplication section are as follows:
301: * r is the argument passed in the registers d0/d1
302: * s is the argument passed on the stack, saved in a0/a1
303: * while multiplying
304: * r1,r2,r3,r4 are the 16 bit components of r in descending order
305: * s1,s2,s3,s4 are the 16 bit components of s
306: * r is kept in d0 and d1, sometimes with words swapped
307: * s is kept in a0 and a1 unchanged
308: * the product is kept in d2/d3/d4/d5
309: * d6 contains a current partial product
310: * d7 contains a current partial product
311: * d7 contains 0 when needed for addxl
312: * a2 saves the sign and exponent of the result
313: *
314: * At the end, d4 and d5 if nonzero are jammed into lsb of d3.
315: */
316: /*
317: * entry conditions:
318: * first argument in d0/d1
319: * second argument on stack
320: * exit conditions:
321: * result (8 bytes) in d0/d1
322: *
323: * register conventions:
324: * d0-d3 operands or pieces of result
325: * d5 exponent of larger
326: * d5 temp for multiplying
327: * d6 sign and exponent
328: * d7 exponent of smaller
329: * d7 zero
330: */
331: SAVEMASK = 0x3fe0 | registers d2-d7,a0-a2
332: RESTMASK = 0x7fc
333: NSAVED = 6*4 | 6 registers * sizeof(register)
334:
335: SAVE03 = 0xf000 | registers d0-d3
336: FETCH03 = 0x000f
337:
338: RTENTRY(Fsqrd)
339: moveml #SAVEMASK,sp@- | registers d2-d7
340: movel d0,d2
341: movel d1,d3
342: bras Fmuld2
343: RTENTRY(Fmuld)
344: | save registers and load operands into registers
345: moveml #SAVEMASK,sp@- | registers d2-d7
346: movl ARG2PTR@+,d2
347: movl ARG2PTR@ ,d3
348: Fmuld2:
349: | save sign of result
350: movl d0,d5
351: eorl d2,d5 | sign of result
352: asll #1,d0 | toss sign
353: asll #1,d2 | EEmmmm0
354: cmpl d2,d0
355: | order operands (exponents at least)
356: blss eswap
357: exg d0,d2 | d2/d3 = larger
358: exg d1,d3
359: | extract and check exponents
360: eswap: jbsr d_exte
361: movw d6,d5 | larger exp
362: movl d5,d6
363: addw d7,d6 | result exp (and sign)
364: cmpw #0x7ff,d5 | check larger
365: jeq ovfl | inf or nan
366: tstw d7
367: jeq ufl | 0 or gradual underflow
368: | set hidden bits
369: bset #31,d0
370: back: bset #31,d2
371:
372: | Main multiply sequence.
373:
374: movl d2,a0 | a0 gets s1s2.
375: movl d3,a1 | a1 gets s3s4.
376: movl d6,a2 | a2 saves sign and exponent of result.
377:
378: movl a0,d3 | d3 gets s1s2.
379: swap d3 | d3 gets s2s1.
380: movw d3,d4 | d4 gets s1.
381: mulu d1,d4 | d4 gets r4*s1.
382: mulu d0,d3 | d3 gets r2*s1.
383: clrw d2 | d2 gets 0; only gets carries in this phase.
384:
385: movl a1,d6 | d6 gets s3s4.
386: swap d6 | d6 gets s4s3.
387: movw d6,d5 | d5 gets s3.
388: jeq s3zero | Branch if s3=0 to avoid following.
389: mulu d1,d5 | d5 gets r4*s3.
390: movw d6,d7 | d7 gets s3.
391: mulu d0,d7 | d7 gets r2*s3.
392: addl d7,d4
393: clrl d7 | Make a zero for addx.
394: addxl d7,d3
395: addxw d7,d2
396: phase3:
397: swap d0 | d0 gets r2r1.
398: swap d1 | d1 gets r4r3.
399: movw a0,d6 | d6 gets s2.
400: beqs 4$ | Skip following if s2=0.
401: movw d6,d7 | d7 gets s2.
402: mulu d0,d6 | d6 gets r1*s2.
403: mulu d1,d7 | d7 gets r3*s2.
404: addl d7,d4
405: addxl d6,d3
406: clrw d7
407: addxw d7,d2
408: 4$:
409: movw a1,d6 | d6 gets s4.
410: beqs 5$ | Skip if s4=0.
411: movw d6,d7 | d7 gets s4.
412: mulu d0,d6 | d6 gets r1*s4.
413: mulu d1,d7 | d7 gets r3*s4.
414: addl d7,d5
415: addxl d6,d4
416: clrl d7
417: addxl d7,d3
418: addxw d7,d2
419: 5$:
420: swap d2 | Exchange order of registers which contain
421: swap d3 | all the "odd" partial products.
422: swap d4
423: swap d5
424: movw d3,d2 | It's really a 16 bit shift!
425: movw d4,d3
426: movw d5,d4
427: clrw d5
428:
429: movl a1,d6 | d6 gets s3s4.
430: swap d6 | d6 gets s4s3.
431: movw d6,d7 | d7 gets s3.
432: beqs 6$
433: mulu d0,d6 | d6 gets r1*s3.
434: mulu d1,d7 | d7 gets r3*s3.
435: addl d7,d4
436: addxl d6,d3
437: clrl d7
438: addxl d7,d2
439:
440: 6$:
441: movl a0,d6 | d6 gets s1s2.
442: swap d6 | d6 gets s2s1.
443: movw d6,d7 | d7 gets s1.
444: mulu d0,d6 | d6 gets r1*s1.
445: mulu d1,d7 | d7 gets r3*s1.
446: addl d7,d3
447: addxl d6,d2
448:
449: swap d0 | d0 gets r1r2.
450: swap d1 | d1 gets r3r4.
451: movw a0,d6 | d6 gets s2.
452: beqs 8$
453: movw d6,d7 | d7 gets s2.
454: mulu d0,d6 | d6 gets r2*s2.
455: mulu d1,d7 | d7 gets r4*s2.
456: addl d7,d4
457: addxl d6,d3
458: clrl d7
459: addxl d7,d2
460: 8$:
461: movw a1,d6 | d6 gets s4.
462: beqs 9$
463: movw d6,d7 | d7 gets s4.
464: mulu d0,d6 | d6 gets r2*s4.
465: mulu d1,d7 | d7 gets r4*s4.
466: addl d7,d5
467: addxl d6,d4
468: clrl d7
469: addxl d7,d3
470: addxl d7,d2
471: 9$:
472: orl d5,d4
473: beqs 10$ | Branch if no sticky bits.
474: bset #0,d3 | Set that sticky!
475: 10$:
476: movl a2,d6 | Restore sign/exponent of result.
477:
478: subw #1022,d6 | toss xtra bias
479: jbsr d_nrcp | norm, rnd, ck, pack
480: msign: roxll #1,d6
481: roxrl #1,d2 | append sign
482: mexit: | answer is now in d2/d3: put in d0/d1
483: movl d2,d0
484: movl d3,d1
485: | restore registers and split
486: moveml sp@+,#RESTMASK
487: | slide down return address to pop off junk
488: RET
489:
490: s3zero:
491: clrl d5 | We do need to clear d5 sometime.
492: jra phase3
493:
494: | EXCEPTION CASES
495:
496: ovfl: movl d2,d5 | larger mantissa, if it is nan, use it
497: orw d7,d5 | smaller exponent
498: orl d0,d5
499: orl d1,d5 | smaller value
500: beqs m_gennan| inf * 0
501: | else nan*x or inf*nonzero:
502: movw #0x7ff,d6
503: jbsr d_usel
504: bras msign
505:
506: ufl: movl d0,d7 | mantissa of smaller
507: orl d1,d7
508: beqs signed0 | 0*number
509: normu: subqw #1,d6
510: lsll #1,d1 | adjust denormalized number
511: roxll #1,d0
512: bpls normu
513: addqw #1,d6
514: jra back
515: | (if both are denormalized, answer will be zero anyway)
516: m_gennan:movl #0x7ff00002,d2
517: bras mexit
518: signed0:clrl d2
519: clrl d3
520: bras msign
521:
522: SAVEMASK = 0x3f00
523: RESTMASK = 0x00fc
524:
525: EXP = d2
526: TYPE = d3
527: /* type values: */
528: ZERO = 1 | wonderful
529: GU = 2
530: PLAIN = 3
531: INF = 4
532: NAN = 5
533:
534: RTENTRY(Fscaleid)
535: moveml #SAVEMASK,sp@- | state save
536: jbsr d_unpk
537: cmpb #PLAIN,TYPE | is it a funny number?
538: bgts gohome | yes -- return argument
539: cmpb #ZERO,TYPE
540: beqs gohome | is zero -- return arg
541: | normal path through here
542: movw EXP,d7
543: extl d7 | d7 gets long exponent.
544: addl a0@,d7 | d7 gets modified exponent.
545: bvcs nooflo | Branch if no overflow.
546: bmis posov | Branch if positive overflow to -.
547: | Branch if negative overflow to +.
548: negov:
549: movw #-2000,EXP
550: bras gohome
551: posov:
552: movw #2000,EXP
553: bras gohome
554: nooflo:
555: cmpl #2000,d7
556: bges posov
557: cmpl #-2000,d7
558: bles negov
559: movw d7,EXP | Final OK exponent.
560: gohome:
561: jbsr d_pack
562: gone: moveml sp@+,#RESTMASK
563: RET
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.