|
|
| Fast assembly routines for MC68020 (Sun-3)
| Assumptions:
|	Arguments start at sp@(0x4)
|	Return value is in d0
|	d0/d1/a0/a1 are scratch
| P_SMUL needs MULTUNIT set to "unsigned long" in mpilib.c
| P_DMUL replaces mp_smul and mp_dmul in mpilib.c
|
| 92.9.21 - Tsutomu Shimomura, [email protected]
| 93.5.14 - Bug in P_DMUL fixed -- now works with small bignums

	.text

| P_SETP(p) sets the current precision to be p longwords.  No-op.
| The argument is never read; the routine just returns.  The arithmetic
| routines below read _global_precision directly.
	.proc
	.globl	_P_SETP
_P_SETP:
| Disabled call to mcount (the lines tagged |%), left commented out.
|	movl	#L2000, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2000:	.skip	4		|%
|	.text			|%
	rts

| P_ADDC(*a, *b, c) performs a += b + c (carry).  Carry is returned.
|
| Both operands are _global_precision longwords long.  The pointers are
| first advanced by 4 * precision bytes and the sum is then formed with
| predecrement addressing, i.e. from the highest address downward.
|
| The add loop is unrolled eight times and entered through a computed
| jump, so the first pass performs (precision mod 8) adds and the dbf
| then runs (precision / 8) further full passes.
|
| Registers: a0 = a, a1 = b, d0 = carry in / carry out,
|            d1 = byte offset, then jump offset,
|            d2 = pass count (caller's d2 is saved on the stack).
	.proc
	.globl	_P_ADDC
_P_ADDC:
| Disabled call to mcount (the lines tagged |%), left commented out.
|	movl	#L2001, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2001:	.skip	4		|%
|	.text			|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), a1
	movl	sp@(0xc), d0
	movl	d2, sp@-		| preserve d2

	movw	_global_precision, d1	| longword count
	movw	d1, d2			| save a copy

	lslw	#2, d1			| count * 4 = operand size in bytes
	addw	d1, a0			| adjust array pointers
	addw	d1, a1

	lsrw	#1, d1			| compute initial branch offset
	andw	#0xe, d1		| 2 bytes per addxl * (count mod 8)
	negw	d1			| branch offset in d1

	lsrw	#3, d2			| 8 longwords/loop; count in d2

	asrl	#1, d0			| set X if necessary (bit 0 of c -> X)

| With d1 = 0 the displacement 0x12 lands on the dbf below.  Each addxl
| is 2 bytes, so the negative offset in d1 backs the entry point up by
| (count mod 8) instructions.  The constant must be revisited if anything
| between the jmp and the dbf is added or removed.
	jmp	pc@(0x12,d1:w)
1:
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	dbf	d2, 1b			| dbf leaves the condition codes (and X) alone

	roxll	#1, d0			| rotate the final X into bit 0 of d0

	movl	sp@+, d2		| restore d2
	rts

| P_SUBB(*a, *b, c) performs a -= b + c (borrow).  Borrow is returned.
|
| Same structure as P_ADDC with subxl in place of addxl: operands are
| _global_precision longwords, processed from the highest address
| downward, with an eight-way unrolled loop entered by a computed jump.
|
| Registers: a0 = a, a1 = b, d0 = borrow in / borrow out,
|            d1 = byte offset, then jump offset,
|            d2 = pass count (caller's d2 is saved on the stack).
	.proc
	.globl	_P_SUBB
_P_SUBB:
| Disabled call to mcount (the lines tagged |%), left commented out.
|	movl	#L2002, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2002:	.skip	4		|%
|	.text			|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), a1
	movl	sp@(0xc), d0
	movl	d2, sp@-		| preserve d2

	movw	_global_precision, d1	| longword count
	movw	d1, d2			| save a copy

	lslw	#2, d1			| count * 4 = operand size in bytes
	addw	d1, a0			| adjust array pointers
	addw	d1, a1

	lsrw	#1, d1			| compute initial branch offset
	andw	#0xe, d1		| 2 bytes per subxl * (count mod 8)
	negw	d1			| branch offset in d1

	lsrw	#3, d2			| 8 longwords/loop; count in d2

	asrl	#1, d0			| set X if necessary (bit 0 of c -> X)

| Displacement 0x12 reaches the dbf when d1 = 0; d1 backs the entry
| point up by one 2-byte subxl per remaining longword.  The constant must
| be revisited if anything between the jmp and the dbf changes.
	jmp	pc@(0x12,d1:w)
1:
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	dbf	d2, 1b			| dbf leaves the condition codes (and X) alone

	roxll	#1, d0			| rotate the final X into bit 0 of d0

	movl	sp@+, d2		| restore d2
	rts

| P_ROTL(*a, c) performs a = (a<<1) | c (lo-bit).  Hi-bit is returned.
|
| a is _global_precision longwords long.  The pointer is advanced past
| the end and the rotate walks downward one 16-bit word at a time:
| the memory form of roxl operates on a single word and shifts by one
| bit, so each longword takes two roxlw instructions.  The loop holds
| 16 of them (eight longwords) and is entered by a computed jump.
|
| Registers: a0 = a, d0 = bit in / bit out,
|            d1 = byte offset, then jump offset,
|            d2 = pass count (caller's d2 is parked in a1, not on the stack).
	.proc
	.globl	_P_ROTL
_P_ROTL:
| Disabled call to mcount (the lines tagged |%), left commented out.
|	movl	#L2003, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2003:	.skip	4		|%
|	.text			|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), d0
	movl	d2, a1			| preserve d2

	movw	_global_precision, d1	| longword count
	movw	d1, d2			| save a copy

	lslw	#2, d1			| count * 4 = operand size in bytes
	addw	d1, a0			| adjust array pointer

	andw	#0x1c, d1		| 4 bytes of code (two roxlw) * (count mod 8)
	negw	d1			| branch offset in d1

	lsrw	#3, d2			| 8 longwords/loop; count in d2

	asrl	#1, d0			| set X if necessary (bit 0 of c -> X)

| Displacement 0x22 reaches the dbf when d1 = 0 (16 roxlw * 2 bytes sit
| between label 1 and the dbf); d1 backs the entry point up by two
| instructions per remaining longword.  The constant must be revisited if
| anything between the jmp and the dbf changes.
	jmp	pc@(0x22,d1:w)
1:
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	dbf	d2, 1b			| dbf leaves the condition codes (and X) alone

	roxll	#1, d0			| rotate the final X into bit 0 of d0

	movl	a1, d2			| restore d2
	rts

| P_SMUL(*a, *b, x) performs a += b * x.  Pointers are to the LSB.
|
| a and b point at their least significant longwords and are walked
| toward lower addresses.  _global_precision longwords of b are
| multiplied by x and added into a; the carry out of the last one is
| then stored (not added) into the next longword of a, a[-precision].
| If x is 0 the routine returns at once and a is not touched at all.
|
| A precision of 1 is handled by skipping the loop (see the tstw/blt
| below).  A precision of 0 is still not meaningful: the first longword
| is always processed.
|
| Registers: a0 = a, a1 = b, d1 = x, d0 = current longword of a,
|            d3:d2 = 64-bit product (d3 carries into the next step),
|            d4 = constant 0, so "addxl d4, d3" adds only the X bit,
|            d5 = loop count.  d2-d5 are saved and restored.
	.proc
	.globl	_P_SMUL
_P_SMUL:
| Disabled call to mcount (the lines tagged |%), left commented out.
|	movl	#L2004, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2004:	.skip	4		|%
|	.text			|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), a1
	movl	sp@(0xc), d1
	tstl	d1			| horrible kludge to speed multiply by 0
	beq	3f
	moveml	#0x3c00, sp@-		| d2/d3/d4/d5; Sun's optimizer is really *DUMB*
	movw	_global_precision, d5	| longword count; 0 will fail

	subqw	#2, d5			| first longword not handled in loop
	clrl	d4

	movl	a1@, d2
	mulul	d1, d3:d2		| d3 is carry
	addl	d2, a0@			| accumulate

| Guard for global_precision < 2: without it d5 = -1 would send the dbf
| loop below round 65536 times.  tst does not modify X, so the carry
| produced by the addl above is still intact at label 2.
	tstw	d5
	blt	2f			| only one longword?
1:
	movl	a0@-, d0
	addxl	d3, d0			| accumulate carry and X-bit
	movl	a1@-, d2
	mulul	d1, d3:d2		| d3 is carry
	addxl	d4, d3			| add X-bit to carry
	addl	d2, d0			| accumulate
	movl	d0, a0@
	dbf	d5, 1b
2:
	addxl	d4, d3			| add X-bit to carry
	movl	d3, a0@-		| store final carry above the result

	moveml	sp@+, #0x3c		| d2/d3/d4/d5; Sun's optimizer is really *DUMB*
3:
	rts

| P_DMUL(*a, *b, *c) performs a = b * c.
|
| b (multiplicand) and c (multiplier) are _global_precision longwords
| each and are passed as pointers to their lowest-addressed (most
| significant) longword; a receives the product and is addressed as
| 2 * _global_precision longwords.
|
| Outline:
|   1. Scan b, then c, from the most significant end.  For every leading
|      zero longword skipped, one longword at the front of a is cleared;
|      d7 / d6 end up as the reduced loop counts.
|   2. Move a0/a1/a2 to the least significant longwords.
|   3. First partial product (b * lowest longword of c) is stored into a.
|   4. Each further longword of c yields a partial product that is
|      accumulated into a one longword higher, as in P_SMUL.
|
| NOTE(review): the loop counts are derived from global_precision - 2
| and fed to dbne/dbf, so global_precision < 2 looks unsupported here;
| confirm against the callers.
|
| Registers: a0 = product position, a1 = multiplicand LSB, a2 = current
|            multiplier longword, a3/a4 = working pointers,
|            d1 = multiplier longword, d3:d2 (or d0:d2) = 64-bit product,
|            d4 = constant 0 for addxl, d5 = inner count,
|            d6 = outer count, d7 = inner count reload value.
	.proc
	.globl	_P_DMUL
_P_DMUL:
| Disabled call to mcount (the lines tagged |%), left commented out.
|	movl	#L2005, a0	|%
|	jsr	mcount		|%
|	.bss			|%
|	.even			|%
|L2005:	.skip	4		|%
|	.text			|%
	moveml	#0x3f38, sp@-		| d2-d7/a2-a4; Sun's optimizer is really *DUMB*
| Nine registers (0x24 bytes) were just pushed, so the first argument is
| now at sp@(0x28) instead of sp@(0x4).
	movl	sp@(0x28), a0		| claim arguments
	movl	sp@(0x2c), a1
	movl	sp@(0x30), a2
	movw	_global_precision, d0
	subqw	#2, d0			| global_precision - 2

	movl	a0, a4			| product

	movw	d0, d7			| count for multiplicand
	movl	a1, a3			| multiplicand
	tstl	a3@+			| top longword non-zero?
	bne	2f			| yes: nothing to skip
	subqw	#1, d7
1:
	clrl	a4@+			| clear one leading longword of the product
	tstl	a3@+
	dbne	d7, 1b			| stop at first non-zero longword or when count runs out
	addqw	#1, d7			| d7 contains effective size of the multiplicand-2
2:
	movl	d0, d6			| count for multiplier (only the low word is used)
	movl	a2, a3			| multiplier
	tstl	a3@+			| top longword non-zero?
	bne	2f			| yes: nothing to skip
	subqw	#1, d6
1:
	clrl	a4@+			| clear one more leading longword of the product
	tstl	a3@+
	dbne	d6, 1b			| stop at first non-zero longword or when count runs out
	addqw	#1, d6			| d6 contains effective size of the multiplier-2
2:

	addqw	#1, d0			| global_precision - 1
	lslw	#2, d0			| ... as a byte offset
	addw	d0, a1			| pointer to LSB of the multiplicand
	addw	d0, a2			| pointer to LSB of the multiplier
	addw	d0, a0
	addw	d0, a0
	addql	#4, a0			| pointer to LSB of product - KLUDGE!

| First partial product not handled in loop
| Assumes that the X-bit is clear from the above contortions.
	clrl	d4			| zero operand for the addxl instructions

	movl	a0, a3			| product
	movl	a1, a4			| multiplicand
	movl	a2@, d1			| one longword of the multiplier
	movw	d7, d5			| loop count

	movl	a4@, d2
	mulul	d1, d3:d2		| d3 is carry
	movl	d2, a3@			| store product
1:
	movl	a4@-, d2		| next multiplicand longword
	mulul	d1, d0:d2		| d0 = high half, d2 = low half
	addxl	d3, d2			| low half + previous high half + X
	movl	d0, d3			| high half becomes the next carry
	movl	d2, a3@-
	dbf	d5, 1b

	addxl	d4, d3			| fold the last X into the carry
	movl	d3, a3@-		| store top longword of this partial product

| The other partial products

2:
	movl	a1, a4			| multiplicand
	movl	a2@-, d1		| another longword of the multiplier
	movw	d7, d5			| loop count

	movl	a4@, d2
	mulul	d1, d3:d2		| d3 is carry
	addl	d2, a0@-		| accumulate, one longword higher than last time

	movl	a0, a3			| product

1:
	movl	a3@-, d0
	addxl	d3, d0			| accumulate carry and X-bit
	movl	a4@-, d2
	mulul	d1, d3:d2		| d3 is carry
	addxl	d4, d3			| add X-bit to carry
	addl	d2, d0			| accumulate
	movl	d0, a3@
	dbf	d5, 1b

	addxl	d4, d3			| add X-bit to carry
	movl	d3, a3@-		| store top longword of this partial product

	dbf	d6, 2b

	moveml	sp@+, #0x1cfc		| d2-d7/a2-a4; Sun's optimizer is really *DUMB*
	rts
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.