|
|
1.1 root 1: | Fast assembly routines for MC68020 (Sun-3)
2: | Assumptions:
3: | Arguments start at sp@(0x4)
4: | Return value is in d0
5: | d0/d1/a0/a1 are scratch
6: | P_SMUL needs MULTUNIT set to "unsigned long" in mpilib.c
7: | P_DMUL replaces mp_smul and mp_dmul in mpilib.c
8: |
9: | 92.9.21 - Tsutomu Shimomura, [email protected]
10:
11: .text
12:
| P_SETP(p) -- set the current precision to p longwords.
|
| Deliberately empty: the argument is ignored and the routine returns at
| once.  The arithmetic routines in this file read _global_precision
| directly instead.
	.proc
	.globl	_P_SETP
_P_SETP:
| Disabled profiling hook (mcount), kept for reference:
|	movl	#L2000, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2000:	.skip	4		|%
|	.text			|%
	rts
24:
| P_ADDC(*a, *b, c) -- a += b + c, where c is the incoming carry.
|
| In:	sp@(0x4) = a, sp@(0x8) = b, sp@(0xc) = c (carry taken from bit 0)
| Out:	d0 = outgoing carry (for c = 0 or 1)
| Uses:	d0/d1/a0/a1 as scratch; d2 is saved on the stack and restored.
|
| Both pointers are advanced past global_precision longwords and the sum
| is then formed with predecrement addressing, i.e. from the highest
| address downwards.  The loop is unrolled eight times; the remainder
| (precision mod 8) is handled by jumping into the middle of the unrolled
| body on the first pass.
	.proc
	.globl	_P_ADDC
_P_ADDC:
| Disabled profiling hook (mcount), kept for reference:
|	movl	#L2001, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2001:	.skip	4		|%
|	.text			|%
	movl	sp@(0xc), d0		| d0 = c
	movl	sp@(0x8), a1		| a1 = b
	movl	sp@(0x4), a0		| a0 = a
	movl	d2, sp@-		| d2 is not a scratch register

	movw	_global_precision, d2	| longword count
	movw	d2, d1			| working copy for the offsets
	lsrw	#3, d2			| d2 = number of full 8-longword passes

	lslw	#2, d1			| d1 = array length in bytes
	addw	d1, a1			| step both pointers past the far end
	addw	d1, a0

	lsrw	#1, d1			| 2 bytes of code per addxl ...
	andw	#0xe, d1		| ... times (precision mod 8)
	negw	d1			| negative offset back from the dbf

	asrl	#1, d0			| X = incoming carry; must be last before jmp

	| 0x12 = distance from the jmp extension word to the dbf below
	| (8 addxl * 2 bytes + 2), so we land (precision mod 8) addxl
	| instructions ahead of the dbf.
	jmp	pc@(0x12,d1:w)
1:
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	dbf	d2, 1b			| dbf leaves X alone

	roxll	#1, d0			| carry out -> bit 0 of the result

	movl	sp@+, d2		| restore d2
	rts
71:
| P_SUBB(*a, *b, c) -- a -= b + c, where c is the incoming borrow.
|
| In:	sp@(0x4) = a, sp@(0x8) = b, sp@(0xc) = c (borrow taken from bit 0)
| Out:	d0 = outgoing borrow (for c = 0 or 1)
| Uses:	d0/d1/a0/a1 as scratch; d2 is saved on the stack and restored.
|
| Same structure as P_ADDC: the pointers are moved past global_precision
| longwords, the difference is formed from the highest address downwards
| with predecrement addressing, and the eight-way unrolled loop is entered
| part-way through to take care of (precision mod 8).
	.proc
	.globl	_P_SUBB
_P_SUBB:
| Disabled profiling hook (mcount), kept for reference:
|	movl	#L2002, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2002:	.skip	4		|%
|	.text			|%
	movl	sp@(0xc), d0		| d0 = c
	movl	sp@(0x8), a1		| a1 = b
	movl	sp@(0x4), a0		| a0 = a
	movl	d2, sp@-		| d2 is not a scratch register

	movw	_global_precision, d2	| longword count
	movw	d2, d1			| working copy for the offsets
	lsrw	#3, d2			| d2 = number of full 8-longword passes

	lslw	#2, d1			| d1 = array length in bytes
	addw	d1, a1			| step both pointers past the far end
	addw	d1, a0

	lsrw	#1, d1			| 2 bytes of code per subxl ...
	andw	#0xe, d1		| ... times (precision mod 8)
	negw	d1			| negative offset back from the dbf

	asrl	#1, d0			| X = incoming borrow; must be last before jmp

	| 0x12 = distance from the jmp extension word to the dbf below
	| (8 subxl * 2 bytes + 2), so we land (precision mod 8) subxl
	| instructions ahead of the dbf.
	jmp	pc@(0x12,d1:w)
1:
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	dbf	d2, 1b			| dbf leaves X alone

	roxll	#1, d0			| borrow out -> bit 0 of the result

	movl	sp@+, d2		| restore d2
	rts
118:
| P_ROTL(*a, c) -- a = (a << 1) | c, where c supplies the new low bit.
|
| In:	sp@(0x4) = a, sp@(0x8) = c (bit 0 is shifted in)
| Out:	d0 = the bit shifted out of the top (for c = 0 or 1)
| Uses:	d0/d1/a0/a1 as scratch; d2 is parked in a1 and restored.
|
| Memory rotates work on one 16-bit word at a time, so every longword
| takes two roxlw instructions.  The pointer is moved past
| global_precision longwords and the rotate runs from the highest address
| downwards.  The loop body covers eight longwords (16 roxlw) and is
| entered part-way through to take care of (precision mod 8).
	.proc
	.globl	_P_ROTL
_P_ROTL:
| Disabled profiling hook (mcount), kept for reference:
|	movl	#L2003, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2003:	.skip	4		|%
|	.text			|%
	movl	sp@(0x8), d0		| d0 = c
	movl	sp@(0x4), a0		| a0 = a
	movl	d2, a1			| a1 is free here: keep d2 in it

	movw	_global_precision, d2	| longword count
	movw	d2, d1			| working copy for the offsets
	lsrw	#3, d2			| d2 = number of full 8-longword passes

	lslw	#2, d1			| d1 = array length in bytes
	addw	d1, a0			| step the pointer past the far end

	andw	#0x1c, d1		| 4 bytes of code per longword * (precision mod 8)
	negw	d1			| negative offset back from the dbf

	asrl	#1, d0			| X = incoming bit; must be last before jmp

	| 0x22 = distance from the jmp extension word to the dbf below
	| (16 roxlw * 2 bytes + 2), so we land 2 * (precision mod 8) roxlw
	| instructions ahead of the dbf.
	jmp	pc@(0x22,d1:w)
1:
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	dbf	d2, 1b			| dbf leaves X alone

	roxll	#1, d0			| bit shifted out -> bit 0 of the result

	movl	a1, d2			| restore d2
	rts
170:
| P_SMUL(*a, *b, x) performs a += b * x.  Pointers are to the LSB.
|
| In:	sp@(0x4) = a (accumulator), sp@(0x8) = b, sp@(0xc) = x (one longword)
| Out:	nothing is returned; d0 is used as scratch.
| Uses:	d0/d1/a0/a1 as scratch; d2-d5 are saved on the stack and restored.
|
| a and b point at the least significant longword, which is the highest
| address; both are walked downwards with predecrement addressing.
| global_precision longwords of b are multiplied by x and accumulated
| into a.  The final carry is STORED (not added) into the next longword
| of a beyond those, so a must have room for global_precision + 1
| longwords.
|
| When x is 0 the routine returns at once and a is left untouched.
|
| global_precision must be at least 1.  A value of 1 is handled by the
| test in front of the loop; without it the dbf below would start from
| -1 and run 65536 times.
	.proc
	.globl	_P_SMUL
_P_SMUL:
|	movl	#L2004, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2004:	.skip	4		|%
|	.text			|%
	movl	sp@(0x4), a0	| claim arguments
	movl	sp@(0x8), a1
	movl	sp@(0xc), d1
	tstl	d1		| horrible kludge to speed multiply by 0
	beq	3f
	moveml	#0x3c00, sp@-	| d2/d3/d4/d5; Sun's optimizer is really *DUMB*
	movw	_global_precision, d5	| longword count; 0 will fail

	subqw	#2, d5		| first longword not handled in loop
	clrl	d4		| constant 0, used to fold X into the carry

	movl	a1@, d2
	mulul	d1, d3:d2	| d3 is carry
	addl	d2, a0@		| accumulate

	tstw	d5		| only one longword?  (tst/bcc leave X alone,
	blt	2f		| so the carry from the addl above survives)
1:
	movl	a0@-, d0
	addxl	d3, d0		| accumulate carry and X-bit
	movl	a1@-, d2
	mulul	d1, d3:d2	| d3 is carry
	addxl	d4, d3		| add X-bit to carry
	addl	d2, d0		| accumulate
	movl	d0, a0@
	dbf	d5, 1b
2:
	addxl	d4, d3		| add X-bit to carry
	movl	d3, a0@-	| store final carry one longword higher

	moveml	sp@+, #0x3c	| d2/d3/d4/d5; Sun's optimizer is really *DUMB*
3:
	rts
214:
| P_DMUL(*a, *b, *c) performs a = b * c.
|
| In:	sp@(0x4) = a (product), sp@(0x8) = b (multiplicand),
|	sp@(0xc) = c (multiplier)
| Out:	nothing is returned; d0 is used as scratch.
| Uses:	d0/d1/a0/a1 as scratch; d2-d7/a2-a4 are saved and restored.
|
| All three pointers address the lowest-addressed (most significant)
| longword.  b and c are global_precision longwords long; the product
| occupies 2 * global_precision longwords.  global_precision must be at
| least 2.
|
| Leading zero longwords of b and c are skipped; one longword of the
| product is cleared for every longword skipped.  The multiply loops
| always process at least two longwords of each operand: if an operand
| has fewer than two significant longwords the skip count is clamped so
| that exactly two are processed.  (Without the clamp the dbne scan ends
| with a count of -1, which makes the dbf loops below run 65536 times and
| leaves one product longword uncleared.)
|
| Register use in the multiply loops:
|	a0 = current least significant product longword for this pass
|	a1 = LSB of the multiplicand	a2 = current multiplier longword
|	a3/a4 = walking product / multiplicand pointers
|	d1 = multiplier longword	d3 = carry (high half of product)
|	d4 = 0 (to fold the X-bit into the carry with addxl)
|	d5 = inner loop count		d6 = outer loop count
|	d7 = multiplicand longwords to process - 2
	.proc
	.globl	_P_DMUL
_P_DMUL:
|	movl	#L2005, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2005:	.skip	4		|%
|	.text			|%
	moveml	#0x3f38, sp@-	| d2-d7/a2-a4; Sun's optimizer is really *DUMB*
	movl	sp@(0x28), a0	| claim arguments (0x24 bytes of saved registers)
	movl	sp@(0x2c), a1
	movl	sp@(0x30), a2
	movw	_global_precision, d0
	subqw	#2, d0		| global_precision - 2

	movl	a0, a4		| product

	movw	d0, d7		| count for multiplicand
	movl	a1, a3		| multiplicand
	bra	2f
1:
	clrl	a4@+
2:
	tstl	a3@+
	dbne	d7, 1b		| d7 contains effective size of the multiplicand-2
	tstw	d7		| count ran out (every longword tested was 0)?
	bge	3f
	clrw	d7		| then process two longwords; matches the number
3:				| of product longwords cleared above

	movl	d0, d6		| count for multiplier
	movl	a2, a3		| multiplier
	bra	2f
1:
	clrl	a4@+
2:
	tstl	a3@+
	dbne	d6, 1b		| d6 contains effective size of the multiplier-2
	tstw	d6		| same clamp as for the multiplicand
	bge	3f
	clrw	d6
3:

	addqw	#1, d0		| global_precision - 1
	lslw	#2, d0
	addw	d0, a1		| pointer to LSB of the multiplicand
	addw	d0, a2		| pointer to LSB of the multiplier
	addw	d0, a0
	addw	d0, a0
	addql	#4, a0		| pointer to LSB of product - KLUDGE!

| First partial product not handled in loop
| Assumes that the X-bit is clear from the above contortions.
| (tstw/clrw/bge above do not change X.)
	clrl	d4

	movl	a0, a3		| product
	movl	a1, a4		| multiplicand
	movl	a2@, d1		| one longword of the multiplier
	movw	d7, d5		| loop count

	movl	a4@, d2
	mulul	d1, d3:d2	| d3 is carry
	movl	d2, a3@		| store product
1:
	movl	a4@-, d2
	mulul	d1, d0:d2
	addxl	d3, d2		| add previous carry and X-bit
	movl	d0, d3		| new carry
	movl	d2, a3@-
	dbf	d5, 1b

	addxl	d4, d3		| add X-bit to carry
	movl	d3, a3@-

| The other partial products

2:
	movl	a1, a4		| multiplicand
	movl	a2@-, d1	| another longword of the multiplier
	movw	d7, d5		| loop count

	movl	a4@, d2
	mulul	d1, d3:d2	| d3 is carry
	addl	d2, a0@-	| accumulate

	movl	a0, a3		| product

1:
	movl	a3@-, d0
	addxl	d3, d0		| accumulate carry and X-bit
	movl	a4@-, d2
	mulul	d1, d3:d2	| d3 is carry
	addxl	d4, d3		| add X-bit to carry
	addl	d2, d0		| accumulate
	movl	d0, a3@
	dbf	d5, 1b

	addxl	d4, d3		| add X-bit to carry
	movl	d3, a3@-

	dbf	d6, 2b

	moveml	sp@+, #0x1cfc	| d2-d7/a2-a4; Sun's optimizer is really *DUMB*
	rts
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.