|
|
1.1 root 1: | Fast assembly routines for MC68020 (Sun-3)
2: | Assumptions:
3: | Arguments start at sp@(0x4)
4: | Return value is in d0
5: | d0/d1/a0/a1 are scratch
6: | P_SMUL needs MULTUNIT set to "unsigned long" in mpilib.c
7: | P_DMUL replaces mp_smul and mp_dmul in mpilib.c
8: |
9: | 92.9.21 - Tsutomu Shimomura, [email protected]
1.1.1.2 ! root 10: | 93.5.14 - Bug in P_DMUL fixed -- now works with small bignums
1.1 root 11:
12: .text
13:
14: | P_SETP(p) sets the current precision to be p longwords. No-op.
|
| The argument is never read: the only live instruction is the rts below.
| The routines in this file read the precision from _global_precision.
| Lines ending in |% are a disabled call to mcount with its 4-byte .bss
| slot (presumably a profiling hook); every routine in this file carries
| the same stub.
15: .proc
16: .globl _P_SETP
17: _P_SETP:
18: | movl #L2000, a0 |%
19: | jsr mcount |%
20: | .bss |%
21: | .even |%
22: |L2000: .skip 4 |%
23: | .text |%
24: rts
25:
26: | P_ADDC(*a, *b, c) performs a += b + c (carry). Carry is returned.
|
| In:   sp@(0x4) = a, sp@(0x8) = b, sp@(0xc) = c (carry in, bit 0 is used)
| Out:  d0 = carry out of the last longword added (assuming c was 0 or 1)
| Uses: d0/d1/a0/a1 as scratch; d2 is saved on the stack and restored.
|
| Both pointers are advanced by 4 * _global_precision bytes and the arrays
| are then walked downwards with predecrement addressing, so the longword at
| the highest address is added first (it is the least significant one).
| The add loop is unrolled 8 times.  The first pass is entered part-way
| through by a computed jump so that it handles (precision mod 8) longwords;
| d2 = precision / 8 counts the remaining full passes.
27: .proc
28: .globl _P_ADDC
29: _P_ADDC:
30: | movl #L2001, a0 |%
31: | jsr mcount |%
32: | .bss |%
33: | .even |%
34: |L2001: .skip 4 |%
35: | .text |%
36: movl sp@(0x4), a0 | claim arguments
37: movl sp@(0x8), a1
38: movl sp@(0xc), d0
39: movl d2, sp@- | preserve d2
40:
41: movw _global_precision, d1 | longword count
42: movw d1, d2 | save a copy
43:
| d1 = precision * 4 = array length in bytes (word arithmetic only)
44: lslw #2, d1
45: addw d1, a0 | adjust array pointers
46: addw d1, a1
47:
| Each addxl below occupies 2 bytes, so the entry point is
| (precision mod 8) * 2 bytes before the dbf.
48: lsrw #1, d1 | compute initial branch offset
49: andw #0xe, d1
50: negw d1 | branch offset in d1
51:
52: lsrw #3, d2 | 8 longwords/loop; count in d2
53:
| Bit 0 of c goes into X; the matching roxll after the loop moves the
| final X back into bit 0 of d0.
54: asrl #1, d0 | set X if necessary
55:
| With d1 = 0 the displacement 0x12 lands on the dbf (8 addxl = 0x10 bytes
| past label 1); a negative d1 backs up into the unrolled adds.
56: jmp pc@(0x12,d1:w)
57: 1:
58: addxl a1@-, a0@-
59: addxl a1@-, a0@-
60: addxl a1@-, a0@-
61: addxl a1@-, a0@-
62: addxl a1@-, a0@-
63: addxl a1@-, a0@-
64: addxl a1@-, a0@-
65: addxl a1@-, a0@-
66: dbf d2, 1b
67:
| carry out (X) -> bit 0 of the return value
68: roxll #1, d0
69:
70: movl sp@+, d2
71: rts
72:
73: | P_SUBB(*a, *b, c) performs a -= b + c (borrow). Borrow is returned.
|
| In:   sp@(0x4) = a, sp@(0x8) = b, sp@(0xc) = c (borrow in, bit 0 is used)
| Out:  d0 = borrow out of the last longword subtracted (assuming c was
|       0 or 1)
| Uses: d0/d1/a0/a1 as scratch; d2 is saved on the stack and restored.
|
| Same structure as P_ADDC with subxl in place of addxl: the pointers are
| moved 4 * _global_precision bytes up, the arrays are walked downwards with
| predecrement addressing, and the 8-way unrolled loop is entered part-way
| through so that the first pass handles (precision mod 8) longwords.
74: .proc
75: .globl _P_SUBB
76: _P_SUBB:
77: | movl #L2002, a0 |%
78: | jsr mcount |%
79: | .bss |%
80: | .even |%
81: |L2002: .skip 4 |%
82: | .text |%
83: movl sp@(0x4), a0 | claim arguments
84: movl sp@(0x8), a1
85: movl sp@(0xc), d0
86: movl d2, sp@- | preserve d2
87:
88: movw _global_precision, d1 | longword count
89: movw d1, d2 | save a copy
90:
| d1 = precision * 4 = array length in bytes (word arithmetic only)
91: lslw #2, d1
92: addw d1, a0 | adjust array pointers
93: addw d1, a1
94:
| Each subxl below occupies 2 bytes, so the entry point is
| (precision mod 8) * 2 bytes before the dbf.
95: lsrw #1, d1 | compute initial branch offset
96: andw #0xe, d1
97: negw d1 | branch offset in d1
98:
99: lsrw #3, d2 | 8 longwords/loop; count in d2
100:
| Bit 0 of c goes into X; the matching roxll after the loop moves the
| final X back into bit 0 of d0.
101: asrl #1, d0 | set X if necessary
102:
| With d1 = 0 the displacement 0x12 lands on the dbf (8 subxl = 0x10 bytes
| past label 1); a negative d1 backs up into the unrolled subtracts.
103: jmp pc@(0x12,d1:w)
104: 1:
105: subxl a1@-, a0@-
106: subxl a1@-, a0@-
107: subxl a1@-, a0@-
108: subxl a1@-, a0@-
109: subxl a1@-, a0@-
110: subxl a1@-, a0@-
111: subxl a1@-, a0@-
112: subxl a1@-, a0@-
113: dbf d2, 1b
114:
| borrow out (X) -> bit 0 of the return value
115: roxll #1, d0
116:
117: movl sp@+, d2
118: rts
119:
120: | P_ROTL(*a, c) performs a = (a<<1) | c (lo-bit). Hi-bit is returned.
|
| In:   sp@(0x4) = a, sp@(0x8) = c (bit shifted in, bit 0 is used)
| Out:  d0 = bit shifted out of the last word rotated (assuming c was 0 or 1)
| Uses: d0/d1/a0/a1 as scratch; d2 is parked in a1 instead of on the stack.
|
| The pointer is advanced by 4 * _global_precision bytes and the array is
| rotated one bit left through X, one 16-bit word at a time, walking
| downwards with predecrement addressing.  Two roxlw are needed per longword,
| so the 16 rotates below cover 8 longwords per pass.  The first pass is
| entered part-way through so that it handles (precision mod 8) longwords.
121: .proc
122: .globl _P_ROTL
123: _P_ROTL:
124: | movl #L2003, a0 |%
125: | jsr mcount |%
126: | .bss |%
127: | .even |%
128: |L2003: .skip 4 |%
129: | .text |%
130: movl sp@(0x4), a0 | claim arguments
131: movl sp@(0x8), d0
132: movl d2, a1 | preserve d2
133:
134: movw _global_precision, d1 | longword count
135: movw d1, d2 | save a copy
136:
| d1 = precision * 4 = array length in bytes (word arithmetic only)
137: lslw #2, d1
138: addw d1, a0 | adjust array pointer
139:
| Two 2-byte roxlw per longword = 4 bytes of code per longword, so
| (precision * 4) & 0x1c is already (precision mod 8) * 4, the distance
| of the entry point before the dbf.
140: andw #0x1c, d1
141: negw d1 | branch offset in d1
142:
143: lsrw #3, d2 | 8 longwords/loop; count in d2
144:
| Bit 0 of c goes into X; the matching roxll after the loop moves the
| final X back into bit 0 of d0.
145: asrl #1, d0 | set X if necessary
146:
| With d1 = 0 the displacement 0x22 lands on the dbf (16 roxlw = 0x20 bytes
| past label 1); a negative d1 backs up into the unrolled rotates.
147: jmp pc@(0x22,d1:w)
148: 1:
149: roxlw a0@-
150: roxlw a0@-
151: roxlw a0@-
152: roxlw a0@-
153: roxlw a0@-
154: roxlw a0@-
155: roxlw a0@-
156: roxlw a0@-
157: roxlw a0@-
158: roxlw a0@-
159: roxlw a0@-
160: roxlw a0@-
161: roxlw a0@-
162: roxlw a0@-
163: roxlw a0@-
164: roxlw a0@-
165: dbf d2, 1b
166:
| bit rotated out (X) -> bit 0 of the return value
167: roxll #1, d0
168:
| restore d2 from its parking place in a1
169: movl a1, d2
170: rts
171:
172: | P_SMUL(*a, *b, x) performs a += b * x. Pointers are to the LSB.
|
| In:   sp@(0x4) = a (accumulator), sp@(0x8) = b (multiplicand),
|       sp@(0xc) = x (32-bit multiplier)
| Out:  nothing is returned; d0 is used as scratch
| Uses: d0/d1/a0/a1 as scratch; d2-d5 are saved on the stack and restored.
|
| a and b point at their least significant longword (the highest address);
| the routine walks towards lower addresses.  _global_precision longwords of
| b are multiplied by x and accumulated into a, and the final carry is
| stored (not added) into the next longword of a below those, so a must
| have room for _global_precision + 1 longwords.
|
| Register roles inside the loop:
|   d1 = x, d2 = low half of the current product, d3 = high half (carry),
|   d4 = constant 0 (lets addxl fold the X bit into d3), d5 = dbf counter.
|
| If x is 0 the routine returns at once without touching a.
| A precision of 1 is handled by skipping the loop (the guard before
| label 1); a precision of 0 is still not supported.
173: .proc
174: .globl _P_SMUL
175: _P_SMUL:
176: | movl #L2004, a0 |%
177: | jsr mcount |%
178: | .bss |%
179: | .even |%
180: |L2004: .skip 4 |%
181: | .text |%
182: movl sp@(0x4), a0 | claim arguments
183: movl sp@(0x8), a1
184: movl sp@(0xc), d1
185: tstl d1 | horrible kludge to speed multiply by 0
186: beq 3f
187: moveml #0x3c00, sp@- | d2/d3/d4/d5; Sun's optimizer is really *DUMB*
188: movw _global_precision, d5 | longword count; 0 will fail
189:
190: subqw #2, d5 | first longword not handled in loop
191: clrl d4
192:
193: movl a1@, d2
194: mulul d1, d3:d2 | d3 is carry
195: addl d2, a0@ | accumulate
196:
| Guard for global_precision < 2: d5 is negative here and dbf would
| otherwise run the loop 65536 times.  tstw and blt leave the X bit from
| the addl above untouched, so the carry handling at 2: is still correct.
197: tstw d5 | negative when global_precision < 2
198: blt 2f | only one longword?
199: 1:
200: movl a0@-, d0
201: addxl d3, d0 | accumulate carry and X-bit
202: movl a1@-, d2
203: mulul d1, d3:d2 | d3 is carry
204: addxl d4, d3 | add X-bit to carry
205: addl d2, d0 | accumulate
206: movl d0, a0@
207: dbf d5, 1b
208: 2:
209: addxl d4, d3 | add X-bit to carry
210: movl d3, a0@-
211:
212: moveml sp@+, #0x3c | d2/d3/d4/d5; Sun's optimizer is really *DUMB*
213: 3:
214: rts
215:
216: | P_DMUL(*a, *b, *c) performs a = b * c.
|
| In:   sp@(0x4) = a (product), sp@(0x8) = b (multiplicand),
|       sp@(0xc) = c (multiplier)
| Out:  nothing is returned; d0 is used as scratch
| Uses: d0/d1/a0/a1 as scratch; d2-d7/a2-a4 are saved on the stack and
|       restored.
|
| Unlike P_SMUL, the three pointers are to the lowest address of each array.
| b and c are _global_precision longwords long; the pointer arithmetic below
| places the least significant product longword at a + 8*precision - 4, i.e.
| a is treated as 2 * _global_precision longwords.
|
| Outline:
|   1. Scan b, then c, from the lowest address for leading zero longwords,
|      clearing one product longword (via a4) per pass of each scan loop and
|      shrinking the loop counts d7 / d6 accordingly.
|   2. Move a0/a1/a2 to the least significant longword of each array.
|   3. First partial product: b * (lowest longword of c) is stored into a.
|   4. Remaining partial products: for each further longword of c, step one
|      longword up in significance and accumulate b * that longword into a.
|
| NOTE(review): looks like global_precision < 2 is not supported here (d0
| goes negative and is used as a dbf count) - confirm against callers.
| NOTE(review): with global_precision = 2 and a zero first longword, the
| subqw before the scan loop appears to leave the counter at -1, so the
| scan only stops when it meets a non-zero longword - verify.
217: .proc
218: .globl _P_DMUL
219: _P_DMUL:
220: | movl #L2005, a0 |%
221: | jsr mcount |%
222: | .bss |%
223: | .even |%
224: |L2005: .skip 4 |%
225: | .text |%
| Nine registers (36 bytes) are pushed, so the arguments that were at
| sp@(0x4)/(0x8)/(0xc) on entry are now at sp@(0x28)/(0x2c)/(0x30).
226: moveml #0x3f38, sp@- | d2-d7/a2-a4; Sun's optimizer is really *DUMB*
227: movl sp@(0x28), a0 | claim arguments
228: movl sp@(0x2c), a1
229: movl sp@(0x30), a2
230: movw _global_precision, d0
231: subqw #2, d0 | global_precision - 2
232:
| a4 walks the product upwards from its lowest address during both scans
233: movl a0, a4 | product
234:
| --- scan the multiplicand for leading zero longwords ---
235: movw d0, d7 | count for multiplicand
236: movl a1, a3 | multiplicand
1.1.1.2 ! root 237: tstl a3@+
! 238: bne 2f
! 239: subqw #1, d7
1.1 root 240: 1:
241: clrl a4@+
242: tstl a3@+
| dbne leaves the loop on the first non-zero longword or when d7 runs out
1.1.1.2 ! root 243: dbne d7, 1b
! 244: addqw #1, d7 | d7 contains effective size of the multiplicand-2
! 245: 2:
| --- same scan for the multiplier; a4 continues where it stopped ---
| Only the low word of d6 is used afterwards (subqw/dbne/addqw/dbf).
1.1 root 246: movl d0, d6 | count for multiplier
247: movl a2, a3 | multiplier
1.1.1.2 ! root 248: tstl a3@+
! 249: bne 2f
! 250: subqw #1, d6
1.1 root 251: 1:
252: clrl a4@+
253: tstl a3@+
1.1.1.2 ! root 254: dbne d6, 1b
! 255: addqw #1, d6 | d6 contains effective size of the multiplier-2
! 256: 2:
1.1 root 257:
| d0 = (global_precision - 1) * 4 = byte offset of the last longword
258: addqw #1, d0 | global_precision - 1
259: lslw #2, d0
260: addw d0, a1 | pointer to LSB of the multiplicand
261: addw d0, a2 | pointer to LSB of the multiplier
| a0 += 2 * d0 + 4 = (2 * global_precision - 1) * 4
262: addw d0, a0
263: addw d0, a0
264: addql #4, a0 | pointer to LSB of product - KLUDGE!
265:
266: | First partial product not handled in loop
267: | Assumes that the X-bit is clear from the above contortions.
| d4 = constant 0, used by addxl to fold the X bit into the carry
268: clrl d4
269:
270: movl a0, a3 | product
271: movl a1, a4 | multiplicand
272: movl a2@, d1 | one longword of the multiplier
273: movw d7, d5 | loop count
274:
275: movl a4@, d2
276: mulul d1, d3:d2 | d3 is carry
277: movl d2, a3@ | store product
278: 1:
| d0:d2 = next multiplicand longword * d1; the previous high half (d3)
| and X are added to the low half, and d0 becomes the new carry in d3
279: movl a4@-, d2
280: mulul d1, d0:d2
281: addxl d3, d2
282: movl d0, d3
283: movl d2, a3@-
284: dbf d5, 1b
285:
| store the final carry one longword further up
286: addxl d4, d3
287: movl d3, a3@-
288:
289: | The other partial products
290:
| Outer loop, counted by d6: a2 and a0 each step one longword towards
| lower addresses per pass.
291: 2:
292: movl a1, a4 | multiplicand
293: movl a2@-, d1 | another longword of the multiplier
294: movw d7, d5 | loop count
295:
296: movl a4@, d2
297: mulul d1, d3:d2 | d3 is carry
298: addl d2, a0@- | accumulate
299:
300: movl a0, a3 | product
301:
| Inner loop: same accumulate step as the loop in P_SMUL
302: 1:
303: movl a3@-, d0
304: addxl d3, d0 | accumulate carry and X-bit
305: movl a4@-, d2
306: mulul d1, d3:d2 | d3 is carry
307: addxl d4, d3 | add X-bit to carry
308: addl d2, d0 | accumulate
309: movl d0, a3@
310: dbf d5, 1b
311:
| the carry is stored (not added) into the next longword up
312: addxl d4, d3 | add X-bit to carry
313: movl d3, a3@-
314:
315: dbf d6, 2b
316:
317: moveml sp@+, #0x1cfc | d2-d7/a2-a4; Sun's optimizer is really *DUMB*
318: rts
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.