|
|
| Fast assembly routines for MC68020 (Sun-3)
| Assumptions:
|	Arguments start at sp@(0x4)
|	Return value is in d0
|	d0/d1/a0/a1 are scratch
| P_SMUL needs MULTUNIT set to "unsigned long" in mpilib.c
| P_DMUL replaces mp_smul and mp_dmul in mpilib.c
|
| 92.9.21 - Tsutomu Shimomura, [email protected]
|
| Conventions shared by every routine below:
|	* The operand length is read as a 16-bit word from _global_precision
|	  and counts 32-bit longwords.
|	* Multi-longword values are walked from the highest address downward
|	  while carries propagate, i.e. the least significant longword sits
|	  at the highest address.
|	* Array offsets are computed in 16 bits and added to address
|	  registers with addw (sign-extended), so precision*4 must fit in a
|	  signed word.
|	* The X flag is the carry chain.  Only instructions that leave X
|	  untouched (mov, tst, clr, mulu, dbf, Bcc, adds to address
|	  registers) may sit between the addx/subx/roxl steps.

	.text

| P_SETP(p) sets the current precision to be p longwords.  No-op.
	.proc
	.globl	_P_SETP
_P_SETP:
|	movl	#L2000, a0		|%
|	jsr	mcount			|%
|	.bss				|%
|	.even				|%
|L2000:	.skip	4			|%
|	.text				|%
	rts

| P_ADDC(*a, *b, c) performs a += b + c (carry).  Carry is returned.
|	a, b	first longword of each operand
|	c	incoming carry, taken from bit 0
|	d0	outgoing carry
| The loop is unrolled eight times; the first pass is entered part-way
| through so that (precision mod 8) longwords are handled before the
| full passes begin.
	.proc
	.globl	_P_ADDC
_P_ADDC:
|	movl	#L2001, a0		|%
|	jsr	mcount			|%
|	.bss				|%
|	.even				|%
|L2001:	.skip	4			|%
|	.text				|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), a1
	movl	sp@(0xc), d0
	movl	d2, sp@-		| preserve d2

	movw	_global_precision, d1	| longword count
	movw	d1, d2			| save a copy

	lslw	#2, d1			| longwords -> bytes
	addw	d1, a0			| adjust array pointers (one past the end)
	addw	d1, a1

	lsrw	#1, d1			| compute initial branch offset
	andw	#0xe, d1		| 2 bytes of code per leftover longword
	negw	d1			| branch offset in d1

	lsrw	#3, d2			| 8 longwords/loop; count in d2

	asrl	#1, d0			| set X if necessary

	| 0x12 lands on the dbf when d1 is 0: 2 bytes for the rest of this
	| jmp plus eight 2-byte addxl.  Do not insert or resize any
	| instruction between here and the dbf.
	jmp	pc@(0x12,d1:w)
1:
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	addxl	a1@-, a0@-
	dbf	d2, 1b

	roxll	#1, d0			| X -> bit 0 of the return value

	movl	sp@+, d2
	rts

| P_SUBB(*a, *b, c) performs a -= b + c (borrow).  Borrow is returned.
|	a, b	first longword of each operand
|	c	incoming borrow, taken from bit 0
|	d0	outgoing borrow
| Same structure as P_ADDC with subxl in place of addxl.
	.proc
	.globl	_P_SUBB
_P_SUBB:
|	movl	#L2002, a0		|%
|	jsr	mcount			|%
|	.bss				|%
|	.even				|%
|L2002:	.skip	4			|%
|	.text				|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), a1
	movl	sp@(0xc), d0
	movl	d2, sp@-		| preserve d2

	movw	_global_precision, d1	| longword count
	movw	d1, d2			| save a copy

	lslw	#2, d1			| longwords -> bytes
	addw	d1, a0			| adjust array pointers (one past the end)
	addw	d1, a1

	lsrw	#1, d1			| compute initial branch offset
	andw	#0xe, d1		| 2 bytes of code per leftover longword
	negw	d1			| branch offset in d1

	lsrw	#3, d2			| 8 longwords/loop; count in d2

	asrl	#1, d0			| set X if necessary

	| 0x12 lands on the dbf when d1 is 0 (see P_ADDC); the instruction
	| sizes between here and the dbf must not change.
	jmp	pc@(0x12,d1:w)
1:
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	subxl	a1@-, a0@-
	dbf	d2, 1b

	roxll	#1, d0			| X -> bit 0 of the return value

	movl	sp@+, d2
	rts

| P_ROTL(*a, c) performs a = (a<<1) | c (lo-bit).  Hi-bit is returned.
|	a	first longword of the operand
|	c	bit shifted in at the bottom, taken from bit 0
|	d0	bit shifted out at the top
| The memory form of roxl works on one 16-bit word, so each longword
| takes two roxlw; the loop still covers eight longwords per pass.
	.proc
	.globl	_P_ROTL
_P_ROTL:
|	movl	#L2003, a0		|%
|	jsr	mcount			|%
|	.bss				|%
|	.even				|%
|L2003:	.skip	4			|%
|	.text				|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), d0
	movl	d2, a1			| preserve d2 (a1 is scratch and otherwise unused)

	movw	_global_precision, d1	| longword count
	movw	d1, d2			| save a copy

	lslw	#2, d1			| longwords -> bytes
	addw	d1, a0			| adjust array pointer (one past the end)

	andw	#0x1c, d1		| 4 bytes of code per leftover longword
	negw	d1			| branch offset in d1

	lsrw	#3, d2			| 8 longwords/loop; count in d2

	asrl	#1, d0			| set X if necessary

	| 0x22 lands on the dbf when d1 is 0: 2 bytes for the rest of this
	| jmp plus sixteen 2-byte roxlw.  Do not insert or resize any
	| instruction between here and the dbf.
	jmp	pc@(0x22,d1:w)
1:
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	roxlw	a0@-
	dbf	d2, 1b

	roxll	#1, d0			| X -> bit 0 of the return value

	movl	a1, d2			| restore d2
	rts

| P_SMUL(*a, *b, x) performs a += b * x.  Pointers are to the LSB.
|	a	least significant longword of the accumulator
|	b	least significant longword of the multiplicand
|	x	32-bit multiplier
| The product of each longword of b with x is added into a; the final
| carry is stored (not added) into the longword just above the
| precision-th longword of a.  Returns immediately when x is 0.
| d0 is used as scratch, so no meaningful value is returned.
	.proc
	.globl	_P_SMUL
_P_SMUL:
|	movl	#L2004, a0		|%
|	jsr	mcount			|%
|	.bss				|%
|	.even				|%
|L2004:	.skip	4			|%
|	.text				|%
	movl	sp@(0x4), a0		| claim arguments
	movl	sp@(0x8), a1
	movl	sp@(0xc), d1
	tstl	d1			| horrible kludge to speed multiply by 0
	beq	3f
	moveml	#0x3c00, sp@-		| d2/d3/d4/d5; Sun's optimizer is really *DUMB*
	movw	_global_precision, d5	| longword count; 0 is treated like 1

	subqw	#2, d5			| first longword not handled in loop
	clrl	d4			| constant 0, used to fold X into the carry

	movl	a1@, d2
	mulul	d1, d3:d2		| d3 is carry
	addl	d2, a0@			| accumulate

	| With a single longword d5 is -1 here, and dbf would then run the
	| loop 65536 times.  tstw/blt leave X alone, so the carry survives.
	tstw	d5
	blt	2f			| only one longword?
1:
	movl	a0@-, d0
	addxl	d3, d0			| accumulate carry and X-bit
	movl	a1@-, d2
	mulul	d1, d3:d2		| d3 is carry
	addxl	d4, d3			| add X-bit to carry
	addl	d2, d0			| accumulate
	movl	d0, a0@
	dbf	d5, 1b
2:
	addxl	d4, d3			| add X-bit to carry
	movl	d3, a0@-		| store the final carry above the result

	moveml	sp@+, #0x3c		| d2/d3/d4/d5; Sun's optimizer is really *DUMB*
3:
	rts

| P_DMUL(*a, *b, *c) performs a = b * c.
|	a	first longword of the product (2 * precision longwords)
|	b	first longword of the multiplicand (precision longwords)
|	c	first longword of the multiplier (precision longwords)
| Leading zero longwords of b and c are skipped; for each one skipped a
| leading longword of the product is cleared instead of being computed.
| The rest of the product is built by schoolbook multiplication, one
| longword of c at a time, starting from the least significant end.
| d0 is used as scratch, so no meaningful value is returned.
	.proc
	.globl	_P_DMUL
_P_DMUL:
|	movl	#L2005, a0		|%
|	jsr	mcount			|%
|	.bss				|%
|	.even				|%
|L2005:	.skip	4			|%
|	.text				|%
	moveml	#0x3f38, sp@-		| d2-d7/a2-a4; Sun's optimizer is really *DUMB*
	movl	sp@(0x28), a0		| claim arguments (0x24 bytes of saved registers)
	movl	sp@(0x2c), a1
	movl	sp@(0x30), a2
	movw	_global_precision, d0
	subqw	#2, d0			| global_precision - 2

	movl	a0, a4			| product

	movw	d0, d7			| count for multiplicand
	movw	d0, d6			| count for multiplier
	| With fewer than two longwords the counts are already negative and
	| the dbne scans below would never terminate properly; skip them.
	blt	3f

	movl	a1, a3			| multiplicand
	bra	2f
1:
	clrl	a4@+
2:
	tstl	a3@+
	dbne	d7, 1b			| d7 contains effective size of the multiplicand-2

	movl	a2, a3			| multiplier
	bra	2f
1:
	clrl	a4@+
2:
	tstl	a3@+
	dbne	d6, 1b			| d6 contains effective size of the multiplier-2
3:
	addqw	#1, d0			| global_precision - 1
	lslw	#2, d0			| longwords -> bytes (also leaves X clear)
	addw	d0, a1			| pointer to LSB of the multiplicand
	addw	d0, a2			| pointer to LSB of the multiplier
	addw	d0, a0
	addw	d0, a0
	addql	#4, a0			| pointer to LSB of product - KLUDGE!

| First partial product not handled in loop
| Assumes that the X-bit is clear from the above contortions.
	clrl	d4			| constant 0, used to fold X into the carry

	movl	a0, a3			| product
	movl	a1, a4			| multiplicand
	movl	a2@, d1			| one longword of the multiplier
	movw	d7, d5			| loop count

	movl	a4@, d2
	mulul	d1, d3:d2		| d3 is carry
	movl	d2, a3@			| store product

	| A multiplicand with one significant longword gives d5 = -1, which
	| dbf would turn into 65536 passes.  tstw/blt do not disturb X.
	tstw	d5
	blt	4f
1:
	movl	a4@-, d2
	mulul	d1, d0:d2
	addxl	d3, d2			| add previous high half and X-bit
	movl	d0, d3			| new high half becomes the carry
	movl	d2, a3@-
	dbf	d5, 1b
4:
	addxl	d4, d3			| add X-bit to carry
	movl	d3, a3@-

| The other partial products

	| A multiplier with one significant longword gives d6 = -1: there
	| are no further partial products to add.
	tstw	d6
	blt	6f
2:
	movl	a1, a4			| multiplicand
	movl	a2@-, d1		| another longword of the multiplier
	movw	d7, d5			| loop count

	movl	a4@, d2
	mulul	d1, d3:d2		| d3 is carry
	addl	d2, a0@-		| accumulate

	movl	a0, a3			| product

	tstw	d5			| single-longword multiplicand: skip the loop
	blt	5f
1:
	movl	a3@-, d0
	addxl	d3, d0			| accumulate carry and X-bit
	movl	a4@-, d2
	mulul	d1, d3:d2		| d3 is carry
	addxl	d4, d3			| add X-bit to carry
	addl	d2, d0			| accumulate
	movl	d0, a3@
	dbf	d5, 1b
5:
	addxl	d4, d3			| add X-bit to carry
	movl	d3, a3@-

	dbf	d6, 2b
6:
	moveml	sp@+, #0x1cfc		| d2-d7/a2-a4; Sun's optimizer is really *DUMB*
	rts
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.