|
|
1.1.1.7 ! root 1: /* C source code for multiprecision arithmetic library routines. ! 2: Implemented Nov 86 by Philip Zimmermann ! 3: Last revised 27 Nov 91 by PRZ ! 4: ! 5: Boulder Software Engineering ! 6: 3021 Eleventh Street ! 7: Boulder, CO 80304 ! 8: (303) 541-0140 ! 9: ! 10: (c) Copyright 1986-1994 by Philip Zimmermann. All rights reserved. ! 11: The author assumes no liability for damages resulting from the use ! 12: of this software, even if the damage results from defects in this ! 13: software. No warranty is expressed or implied. The use of this ! 14: cryptographic software for developing weapon systems is expressly ! 15: forbidden. ! 16: ! 17: ! 18: These routines implement all of the multiprecision arithmetic ! 19: necessary for number-theoretic cryptographic algorithms such as ! 20: ElGamal, Diffie-Hellman, Rabin, or factoring studies for large ! 21: composite numbers, as well as Rivest-Shamir-Adleman (RSA) public ! 22: key cryptography. ! 23: ! 24: Although originally developed in Microsoft C for the IBM PC, this code ! 25: contains few machine dependencies. It assumes 2's complement ! 26: arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines, ! 27: lowbyte-highbyte order or highbyte-lowbyte order. This version ! 28: has been converted to ANSI C. ! 29: ! 30: ! 31: The internal representation for these extended precision integer ! 32: "registers" is an array of "units". A unit is a machine word, which ! 33: is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit ! 34: unsigned integer, depending on the machine's word size. For example, ! 35: an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic ! 36: on these huge precision integers, we pass pointers to these unit ! 37: arrays to various subroutines. A pointer to an array of units is of ! 38: type unitptr. This is a pointer to a huge integer "register". ! 39: ! 40: When calling a subroutine, we always pass a pointer to the BEGINNING ! 41: of the array of units, regardless of the byte order of the machine. ! 42: On a lowbyte-first machine, such as the Intel 80x86, this unitptr ! 43: points to the LEAST significant unit, and the array of units increases ! 44: significance to the right. On a highbyte-first machine, such as the ! 45: Motorola 680x0, this unitptr points to the MOST significant unit, and ! 46: the array of units decreases significance to the right. ! 47: ! 48: Modified 8 Apr 92 - HAJK ! 49: Implement new VAX/VMS primitive support. 1.1.1.6 root 50: 1.1.1.7 ! root 51: Modified 30 Sep 92 -Castor Fu <[email protected]> ! 52: Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long) 1.1.1.6 root 53: 1.1.1.7 ! root 54: Modified 28 Nov 92 - Thad Smith ! 55: Added Smith modmult, generalized non-portable support. ! 56: */ 1.1.1.6 root 57: 1.1.1.7 ! root 58: /* #define COUNTMULTS *//* count modmults for performance studies */ 1.1.1.6 root 59: 60: #ifdef DEBUG 61: #ifdef MSDOS 1.1.1.7 ! root 62: #ifdef __GO32__ /* DJGPP */ 1.1.1.6 root 63: #include <pc.h> 64: #else 65: #include <conio.h> 1.1.1.7 ! root 66: #endif /* __GO32__ */ 1.1.1.6 root 67: #define poll_for_break() {while (kbhit()) getch();} 1.1.1.7 ! root 68: #endif /* MSDOS */ ! 69: #endif /* DEBUG */ 1.1.1.6 root 70: 71: #ifndef poll_for_break 1.1.1.7 ! root 72: #define poll_for_break() /* stub */ 1.1.1.6 root 73: #endif 74: 75: #include "mpilib.h" 76: 77: #ifdef mp_smula 78: #ifdef mp_smul 1.1.1.7 ! root 79: Error:Both mp_smula and mp_smul cannot be defined. 1.1.1.6 root 80: #else 81: #define mp_smul mp_smula 82: #endif 83: #endif 84: 85: /* set macros for MULTUNIT */ 86: #ifdef MUNIT8 87: #define MULTUNITSIZE 8 88: typedef unsigned char MULTUNIT; 89: #ifdef UNIT8 90: #define MULTUNIT_SIZE_SAME 91: #endif 1.1.1.7 ! root 92: #else /* not MUNIT8 */ 1.1.1.6 root 93: #ifdef MUNIT32 94: #define MULTUNITSIZE 32 95: typedef unsigned long MULTUNIT; 96: #ifdef UNIT32 97: #define MULTUNIT_SIZE_SAME 98: #else 99: /* #error is not portable, this has the same effect */ 100: #include "UNITSIZE cannot be smaller than MULTUNITSIZE" 101: #endif 1.1.1.7 ! root 102: #else /* assume MUNIT16 */ 1.1.1.6 root 103: #define MULTUNITSIZE 16 104: typedef unsigned short MULTUNIT; 105: #ifdef UNIT16 106: #define MULTUNIT_SIZE_SAME 1.1.1.7 ! root 107: #endif /* UNIT16 */ 1.1.1.6 root 108: #ifdef UNIT8 109: #include "UNITSIZE cannot be smaller than MULTUNITSIZE" 1.1.1.7 ! root 110: #endif /* UNIT8 */ ! 111: #endif /* MUNIT32 */ ! 112: #endif /* MUNIT8 */ 1.1.1.6 root 113: 1.1.1.7 ! root 114: #define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of ! 115: MULTUNIT */ 1.1.1.6 root 116: #define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1)) 117: #define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1)) 118: #define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE) 119: 120: 1.1.1.7 ! root 121: void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier); ! 122: void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier); 1.1.1.6 root 123: 1.1.1.7 ! root 124: short global_precision = 0; /* units of precision for all routines */ ! 125: /* global_precision is the unit precision last set by set_precision. ! 126: Initially, set_precision() should be called to define global_precision ! 127: before using any of these other multiprecision library routines. ! 128: i.e.: set_precision(MAX_UNIT_PRECISION); ! 129: */ 1.1.1.6 root 130: 131: /*************** multiprecision library primitives ****************/ 1.1.1.7 ! root 132: /* The following portable C primitives should be recoded in assembly. ! 133: The entry point name should be defined, in "mpilib.h" to the external ! 134: entry point name. If undefined, the C version will be used. ! 135: */ 1.1.1.6 root 136: 137: typedef unsigned long int ulint; 138: 139: #ifndef mp_addc 1.1.1.7 ! root 140: /* ! 141: multiprecision add with carry r2 to r1, result in r1 ! 142: carry is incoming carry flag-- value should be 0 or 1 ! 143: */ 1.1.1.6 root 144: boolean mp_addc 1.1.1.7 ! root 145: (register unitptr r1, register unitptr r2, register boolean carry) ! 146: { ! 147: register unit x; ! 148: short precision; /* number of units to add */ ! 149: precision = global_precision; ! 150: make_lsbptr(r1, precision); ! 151: make_lsbptr(r2, precision); ! 152: while (precision--) { ! 153: if (carry) { ! 154: x = *r1 + *r2 + 1; ! 155: carry = (*r2 >= (unit) (~*r1)); ! 156: } else { ! 157: x = *r1 + *r2; ! 158: carry = (x < *r1); 1.1.1.6 root 159: } 1.1.1.7 ! root 160: post_higherunit(r2); ! 161: *post_higherunit(r1) = x; ! 162: } ! 163: return carry; /* return the final carry flag bit */ ! 164: } /* mp_addc */ ! 165: #endif /* mp_addc */ 1.1.1.6 root 166: 167: #ifndef mp_subb 1.1.1.7 ! root 168: ! 169: /* ! 170: multiprecision subtract with borrow, r2 from r1, result in r1 ! 171: borrow is incoming borrow flag-- value should be 0 or 1 ! 172: */ 1.1.1.6 root 173: boolean mp_subb 1.1.1.7 ! root 174: (register unitptr r1, register unitptr r2, register boolean borrow) ! 175: { ! 176: register unit x; ! 177: short precision; /* number of units to subtract */ ! 178: precision = global_precision; ! 179: make_lsbptr(r1, precision); ! 180: make_lsbptr(r2, precision); ! 181: while (precision--) { ! 182: if (borrow) { ! 183: x = *r1 - *r2 - 1; ! 184: borrow = (*r1 <= *r2); ! 185: } else { ! 186: x = *r1 - *r2; ! 187: borrow = (*r1 < *r2); 1.1.1.6 root 188: } 1.1.1.7 ! root 189: post_higherunit(r2); ! 190: *post_higherunit(r1) = x; ! 191: } ! 192: return borrow; /* return the final carry/borrow flag bit */ ! 193: } /* mp_subb */ ! 194: #endif /* mp_subb */ 1.1.1.6 root 195: 196: #ifndef mp_rotate_left 1.1.1.7 ! root 197: ! 198: /* ! 199: multiprecision rotate left 1 bit with carry, result in r1. ! 200: carry is incoming carry flag-- value should be 0 or 1 ! 201: */ ! 202: boolean mp_rotate_left(register unitptr r1, register boolean carry) 1.1.1.6 root 203: { 1.1.1.7 ! root 204: register int precision; /* number of units to rotate */ ! 205: unsigned int mcarry = carry, nextcarry; /* int is supposed to be 1.1.1.6 root 206: * the efficient size for ops*/ 1.1.1.7 ! root 207: precision = global_precision; ! 208: make_lsbptr(r1, precision); ! 209: while (precision--) { ! 210: nextcarry = (((signedunit) * r1) < 0); ! 211: *r1 = (*r1 << 1) | mcarry; ! 212: mcarry = nextcarry; ! 213: pre_higherunit(r1); ! 214: } ! 215: return nextcarry; /* return the final carry flag bit */ ! 216: } /* mp_rotate_left */ ! 217: #endif /* mp_rotate_left */ 1.1.1.6 root 218: 219: /************** end of primitives that should be in assembly *************/ 220: 1.1.1.7 ! root 221: /* The mp_shift_right_bits function is not called in any time-critical ! 222: situations in public-key cryptographic functions, so it doesn't ! 223: need to be coded in assembly language. ! 224: */ 1.1.1.6 root 225: 1.1.1.7 ! root 226: /* ! 227: multiprecision shift right bits, result in r1. ! 228: bits is how many bits to shift, must be <= UNITSIZE. 1.1.1.6 root 229: */ 1.1.1.7 ! root 230: void mp_shift_right_bits(register unitptr r1, register short bits) ! 231: { ! 232: register short precision; /* number of units to shift */ ! 233: register unit carry, nextcarry, bitmask; ! 234: register short unbits; ! 235: if (bits == 0) ! 236: return; /* shift zero bits is a no-op */ ! 237: carry = 0; ! 238: bitmask = power_of_2(bits) - 1; ! 239: unbits = UNITSIZE - bits; /* shift bits must be <= UNITSIZE */ ! 240: precision = global_precision; ! 241: make_msbptr(r1, precision); ! 242: if (bits == UNITSIZE) { ! 243: while (precision--) { ! 244: nextcarry = *r1; ! 245: *r1 = carry; ! 246: carry = nextcarry; ! 247: pre_lowerunit(r1); 1.1.1.6 root 248: } 1.1.1.7 ! root 249: } else { ! 250: while (precision--) { ! 251: nextcarry = *r1 & bitmask; ! 252: *r1 >>= bits; ! 253: *r1 |= carry << unbits; ! 254: carry = nextcarry; ! 255: pre_lowerunit(r1); ! 256: } ! 257: } ! 258: } /* mp_shift_right_bits */ 1.1.1.6 root 259: 260: #ifndef mp_compare 1.1.1.7 ! root 261: 1.1.1.6 root 262: /* 263: * Compares multiprecision integers *r1, *r2, and returns: 264: * -1 iff *r1 < *r2 265: * 0 iff *r1 == *r2 266: * +1 iff *r1 > *r2 267: */ 1.1.1.7 ! root 268: short mp_compare(register unitptr r1, register unitptr r2) 1.1.1.6 root 269: { 1.1.1.7 ! root 270: register short precision; /* number of units to compare */ 1.1.1.6 root 271: 1.1.1.7 ! root 272: precision = global_precision; ! 273: make_msbptr(r1, precision); ! 274: make_msbptr(r2, precision); ! 275: do { ! 276: if (*r1 < *r2) ! 277: return -1; ! 278: if (*post_lowerunit(r1) > *post_lowerunit(r2)) ! 279: return 1; ! 280: } while (--precision); ! 281: return 0; /* *r1 == *r2 */ ! 282: } /* mp_compare */ ! 283: #endif /* mp_compare */ 1.1.1.6 root 284: 285: /* Increment multiprecision integer r. */ 1.1.1.7 ! root 286: boolean mp_inc(register unitptr r) 1.1.1.6 root 287: { 1.1.1.7 ! root 288: register short precision; ! 289: precision = global_precision; ! 290: make_lsbptr(r, precision); ! 291: do { ! 292: if (++(*r)) ! 293: return 0; /* no carry */ ! 294: post_higherunit(r); ! 295: } while (--precision); ! 296: return 1; /* return carry set */ ! 297: } /* mp_inc */ 1.1.1.6 root 298: 299: /* Decrement multiprecision integer r. */ 1.1.1.7 ! root 300: boolean mp_dec(register unitptr r) 1.1.1.6 root 301: { 1.1.1.7 ! root 302: register short precision; ! 303: precision = global_precision; ! 304: make_lsbptr(r, precision); ! 305: do { ! 306: if ((signedunit) (--(*r)) != (signedunit) - 1) ! 307: return 0; /* no borrow */ ! 308: post_higherunit(r); ! 309: } while (--precision); ! 310: return 1; /* return borrow set */ ! 311: } /* mp_dec */ 1.1.1.6 root 312: 1.1.1.7 ! root 313: /* Compute 2's complement, the arithmetic negative, of r */ 1.1.1.6 root 314: void mp_neg(register unitptr r) 315: { 1.1.1.7 ! root 316: register short precision; /* number of units to negate */ ! 317: precision = global_precision; ! 318: mp_dec(r); /* 2's complement is 1's complement plus 1 */ ! 319: do { /* now do 1's complement */ ! 320: *r = ~(*r); ! 321: r++; ! 322: } while (--precision); ! 323: } /* mp_neg */ 1.1.1.6 root 324: 325: #ifndef mp_move 1.1.1.7 ! root 326: ! 327: void mp_move(register unitptr dst, register unitptr src) 1.1.1.6 root 328: { 1.1.1.7 ! root 329: register short precision; /* number of units to move */ ! 330: precision = global_precision; ! 331: do { ! 332: *dst++ = *src++; ! 333: } while (--precision); ! 334: } /* mp_move */ ! 335: #endif /* mp_move */ 1.1.1.6 root 336: 1.1.1.7 ! root 337: /* Init multiprecision register r with short value. */ 1.1.1.6 root 338: void mp_init(register unitptr r, word16 value) 339: { /* Note that mp_init doesn't extend sign bit for >32767 */ 340: 1.1.1.7 ! root 341: unitfill0(r, global_precision); ! 342: make_lsbptr(r, global_precision); ! 343: *post_higherunit(r) = value; 1.1.1.6 root 344: #ifdef UNIT8 1.1.1.7 ! root 345: *post_higherunit(r) = value >> UNITSIZE; 1.1.1.6 root 346: #endif 1.1.1.7 ! root 347: } /* mp_init */ 1.1.1.6 root 348: 1.1.1.7 ! root 349: /* Returns number of significant units in r */ 1.1.1.6 root 350: short significance(register unitptr r) 351: { 1.1.1.7 ! root 352: register short precision; ! 353: precision = global_precision; ! 354: make_msbptr(r, precision); ! 355: do { ! 356: if (*post_lowerunit(r)) ! 357: return precision; ! 358: } while (--precision); ! 359: return precision; ! 360: } /* significance */ 1.1.1.6 root 361: 362: 363: #ifndef unitfill0 364: 1.1.1.7 ! root 365: /* Zero-fill the unit buffer r. */ ! 366: void unitfill0(unitptr r, word16 unitcount) ! 367: { ! 368: while (unitcount--) ! 369: *r++ = 0; ! 370: } /* unitfill0 */ ! 371: #endif /* unitfill0 */ 1.1.1.6 root 372: 1.1.1.7 ! root 373: /* Unsigned divide, treats both operands as positive. */ ! 374: int mp_udiv(register unitptr remainder, register unitptr quotient, ! 375: register unitptr dividend, register unitptr divisor) 1.1.1.6 root 376: { 1.1.1.7 ! root 377: int bits; ! 378: short dprec; ! 379: register unit bitmask; ! 380: ! 381: if (testeq(divisor, 0)) ! 382: return -1; /* zero divisor means divide error */ ! 383: mp_init0(remainder); ! 384: mp_init0(quotient); ! 385: /* normalize and compute number of bits in dividend first */ ! 386: init_bitsniffer(dividend, bitmask, dprec, bits); ! 387: /* rescale quotient to same precision (dprec) as dividend */ ! 388: rescale(quotient, global_precision, dprec); ! 389: make_msbptr(quotient, dprec); ! 390: ! 391: while (bits--) { ! 392: mp_rotate_left(remainder, ! 393: (boolean) (sniff_bit(dividend, bitmask) != 0)); ! 394: if (mp_compare(remainder, divisor) >= 0) { ! 395: mp_sub(remainder, divisor); ! 396: stuff_bit(quotient, bitmask); ! 397: } ! 398: bump_2bitsniffers(dividend, quotient, bitmask); ! 399: } ! 400: return 0; ! 401: } /* mp_udiv */ 1.1.1.6 root 402: 1.1.1.7 ! root 403: #ifdef UPTON_OR_SMITH 1.1.1.6 root 404: 1.1.1.7 ! root 405: #define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */ 1.1.1.6 root 406: 1.1.1.7 ! root 407: /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */ ! 408: int mp_recip(register unitptr quotient, register unitptr divisor) ! 409: { ! 410: int bits; ! 411: short qprec; ! 412: register unit bitmask; ! 413: unit remainder[MAX_UNIT_PRECISION]; ! 414: if (testeq(divisor, 0)) ! 415: return -1; /* zero divisor means divide error */ ! 416: mp_init0(remainder); ! 417: mp_init0(quotient); ! 418: ! 419: /* normalize and compute number of bits in quotient first */ ! 420: bits = countbits(divisor) + RECIPMARGIN; ! 421: bitmask = bitmsk(bits); /* bitmask within a single unit */ ! 422: qprec = bits2units(bits + 1); ! 423: mp_setbit(remainder, (bits - RECIPMARGIN) - 1); ! 424: /* rescale quotient to precision of divisor + RECIPMARGIN bits */ ! 425: rescale(quotient, global_precision, qprec); ! 426: make_msbptr(quotient, qprec); ! 427: ! 428: while (bits--) { ! 429: mp_shift_left(remainder); ! 430: if (mp_compare(remainder, divisor) >= 0) { ! 431: mp_sub(remainder, divisor); ! 432: stuff_bit(quotient, bitmask); ! 433: } ! 434: bump_bitsniffer(quotient, bitmask); ! 435: } ! 436: mp_init0(remainder); /* burn sensitive data left on stack */ ! 437: return 0; ! 438: } /* mp_recip */ ! 439: #endif /* UPTON_OR_SMITH */ ! 440: ! 441: /* Signed divide, either or both operands may be negative. */ ! 442: int mp_div(register unitptr remainder, register unitptr quotient, ! 443: register unitptr dividend, register unitptr divisor) ! 444: { ! 445: boolean dvdsign, dsign; ! 446: int status; ! 447: dvdsign = (boolean) (mp_tstminus(dividend) != 0); ! 448: dsign = (boolean) (mp_tstminus(divisor) != 0); ! 449: if (dvdsign) ! 450: mp_neg(dividend); ! 451: if (dsign) ! 452: mp_neg(divisor); ! 453: status = mp_udiv(remainder, quotient, dividend, divisor); ! 454: if (dvdsign) ! 455: mp_neg(dividend); /* repair caller's dividend */ ! 456: if (dsign) ! 457: mp_neg(divisor); /* repair caller's divisor */ ! 458: if (status < 0) ! 459: return status; /* divide error? */ ! 460: if (dvdsign) ! 461: mp_neg(remainder); ! 462: if (dvdsign ^ dsign) ! 463: mp_neg(quotient); ! 464: return status; ! 465: } /* mp_div */ 1.1.1.6 root 466: 467: /* 468: * This function does a fast divide and mod on a multiprecision dividend 469: * using a short integer divisor returning a short integer remainder. 470: * This is an unsigned divide. It treats both operands as positive. 471: * It is used mainly for faster printing of large numbers in base 10. 472: */ 1.1.1.7 ! root 473: word16 mp_shortdiv(register unitptr quotient, ! 474: register unitptr dividend, register word16 divisor) 1.1.1.6 root 475: { 1.1.1.7 ! root 476: int bits; ! 477: short dprec; ! 478: register unit bitmask; ! 479: register word16 remainder; ! 480: if (!divisor) /* if divisor == 0 */ ! 481: return -1; /* zero divisor means divide error */ ! 482: remainder = 0; ! 483: mp_init0(quotient); ! 484: /* normalize and compute number of bits in dividend first */ ! 485: init_bitsniffer(dividend, bitmask, dprec, bits); ! 486: /* rescale quotient to same precision (dprec) as dividend */ ! 487: rescale(quotient, global_precision, dprec); ! 488: make_msbptr(quotient, dprec); ! 489: ! 490: while (bits--) { ! 491: remainder <<= 1; ! 492: if (sniff_bit(dividend, bitmask)) ! 493: remainder++; ! 494: if (remainder >= divisor) { ! 495: remainder -= divisor; ! 496: stuff_bit(quotient, bitmask); ! 497: } ! 498: bump_2bitsniffers(dividend, quotient, bitmask); ! 499: } ! 500: return remainder; ! 501: } /* mp_shortdiv */ 1.1.1.6 root 502: 503: /* Unsigned divide, treats both operands as positive. */ 1.1.1.7 ! root 504: int mp_mod(register unitptr remainder, ! 505: register unitptr dividend, register unitptr divisor) 1.1.1.6 root 506: { 1.1.1.7 ! root 507: int bits; ! 508: short dprec; ! 509: register unit bitmask; ! 510: if (testeq(divisor, 0)) ! 511: return -1; /* zero divisor means divide error */ ! 512: mp_init0(remainder); ! 513: /* normalize and compute number of bits in dividend first */ ! 514: init_bitsniffer(dividend, bitmask, dprec, bits); ! 515: ! 516: while (bits--) { ! 517: mp_rotate_left(remainder, ! 518: (boolean) (sniff_bit(dividend, bitmask) != 0)); ! 519: msub(remainder, divisor); ! 520: bump_bitsniffer(dividend, bitmask); ! 521: } ! 522: return 0; ! 523: } /* mp_mod */ 1.1.1.6 root 524: 525: /* 526: * This function does a fast mod operation on a multiprecision dividend 527: * using a short integer modulus returning a short integer remainder. 528: * This is an unsigned divide. It treats both operands as positive. 529: * It is used mainly for fast sieve searches for large primes. 530: */ 1.1.1.7 ! root 531: word16 mp_shortmod(register unitptr dividend, register word16 divisor) ! 532: { ! 533: int bits; ! 534: short dprec; ! 535: register unit bitmask; ! 536: register word16 remainder; ! 537: if (!divisor) /* if divisor == 0 */ ! 538: return -1; /* zero divisor means divide error */ ! 539: remainder = 0; ! 540: /* normalize and compute number of bits in dividend first */ ! 541: init_bitsniffer(dividend, bitmask, dprec, bits); ! 542: ! 543: while (bits--) { ! 544: remainder <<= 1; ! 545: if (sniff_bit(dividend, bitmask)) ! 546: remainder++; ! 547: if (remainder >= divisor) ! 548: remainder -= divisor; ! 549: bump_bitsniffer(dividend, bitmask); ! 550: } ! 551: return remainder; ! 552: } /* mp_shortmod */ 1.1.1.6 root 553: 554: 555: 1.1.1.7 ! root 556: #ifdef COMB_MULT /* use faster "comb" multiply algorithm */ ! 557: /* We are skipping this code because it has a bug... */ 1.1.1.6 root 558: 559: /* 560: * Computes multiprecision prod = multiplicand * multiplier 561: * 562: * Uses interleaved comb multiply algorithm. 563: * This improved multiply more than twice as fast as a Russian 564: * peasant multiply, because it does a lot fewer shifts. 565: * Must have global_precision set to the size of the multiplicand 566: * plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum 567: * of the lengths of the multiplier and multiplicand. 568: * 569: * BUG ALERT: Unfortunately, this code has a bug. It fails for 570: * some numbers. One such example: 571: * x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 572: * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D 573: * 52C8 CDC7 AFB3 61C8 243C 741B 574: * --which is obviously wrong. The answer should be: 575: * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A 576: * C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 577: * We'll have to fix this some day. In the meantime, we'll 578: * just have the compiler skip over this code. 579: * 580: * BUG NOTE: Possibly fixed. Needs testing. 581: */ 1.1.1.7 ! root 582: int mp_mult(register unitptr prod, ! 583: register unitptr multiplicand, register unitptr multiplier) 1.1.1.6 root 584: { 1.1.1.7 ! root 585: int bits; ! 586: register unit bitmask; ! 587: unitptr product, mplier, temp; ! 588: short mprec, mprec2; ! 589: unit mplicand[MAX_UNIT_PRECISION]; 1.1.1.6 root 590: 1.1.1.7 ! root 591: /* better clear full width--double precision */ ! 592: mp_init(prod + tohigher(global_precision), 0); 1.1.1.6 root 593: 1.1.1.7 ! root 594: if (testeq(multiplicand, 0)) ! 595: return 0; /* zero multiplicand means zero product */ 1.1.1.6 root 596: 1.1.1.7 ! root 597: mp_move(mplicand, multiplicand); /* save it from damage */ 1.1.1.6 root 598: 1.1.1.7 ! root 599: normalize(multiplier, mprec); ! 600: if (!mprec) ! 601: return 0; 1.1.1.6 root 602: 1.1.1.7 ! root 603: make_lsbptr(multiplier, mprec); ! 604: bitmask = 1; /* start scan at LSB of multiplier */ 1.1.1.6 root 605: 1.1.1.7 ! root 606: do { /* UNITSIZE times */ ! 607: /* do for bits 0-15 */ ! 608: product = prod; ! 609: mplier = multiplier; ! 610: mprec2 = mprec; ! 611: while (mprec2--) { /* do for each word in multiplier */ ! 612: ! 613: if (sniff_bit(mplier, bitmask)) { ! 614: if (mp_addc(product, multiplicand, 0)) { /* ripple carry */ ! 615: /* After 1st time thru, this is rarely encountered. */ ! 616: temp = msbptr(product, global_precision); ! 617: pre_higherunit(temp); ! 618: /* temp now points to LSU of carry region. */ ! 619: unmake_lsbptr(temp, global_precision); ! 620: mp_inc(temp); ! 621: } /* ripple carry */ ! 622: } ! 623: pre_higherunit(mplier); ! 624: pre_higherunit(product); ! 625: } ! 626: if (!(bitmask <<= 1)) ! 627: break; ! 628: mp_shift_left(multiplicand); ! 629: ! 630: } while (TRUE); ! 631: ! 632: mp_move(multiplicand, mplicand); /* recover */ ! 633: ! 634: return 0; /* normal return */ ! 635: } /* mp_mult */ ! 636: ! 637: #endif /* COMB_MULT */ ! 638: ! 639: ! 640: /* Because the "comb" multiply has a bug, we will use the slower ! 641: Russian peasant multiply instead. Fortunately, the mp_mult ! 642: function is not called from any time-critical code. ! 643: */ 1.1.1.6 root 644: 645: /* 646: * Computes multiprecision prod = multiplicand * multiplier 647: * Uses "Russian peasant" multiply algorithm. 648: */ 1.1.1.7 ! root 649: int mp_mult(register unitptr prod, ! 650: register unitptr multiplicand, register unitptr multiplier) 1.1.1.6 root 651: { 1.1.1.7 ! root 652: int bits; ! 653: register unit bitmask; ! 654: short mprec; ! 655: mp_init(prod, 0); ! 656: if (testeq(multiplicand, 0)) ! 657: return 0; /* zero multiplicand means zero product */ ! 658: /* normalize and compute number of bits in multiplier first */ ! 659: init_bitsniffer(multiplier, bitmask, mprec, bits); ! 660: ! 661: while (bits--) { ! 662: mp_shift_left(prod); ! 663: if (sniff_bit(multiplier, bitmask)) ! 664: mp_add(prod, multiplicand); ! 665: bump_bitsniffer(multiplier, bitmask); ! 666: } ! 667: return 0; ! 668: } /* mp_mult */ 1.1.1.6 root 669: 670: /* 671: * mp_modmult computes a multiprecision multiply combined with a 672: * modulo operation. This is the most time-critical function in 673: * this multiprecision arithmetic library for performing modulo 674: * exponentiation. We experimented with different versions of modmult, 675: * depending on the machine architecture and performance requirements. 676: * We will either use a Russian Peasant modmult (peasant_modmult), 677: * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's 678: * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult). 679: * On machines with a hardware atomic multiply instruction, 680: * Smith's modmult is fastest. It can utilize assembly subroutines to 681: * speed up the hardware multiply logic and trial quotient calculation. 682: * If the machine lacks a fast hardware multiply, Merritt's modmult 683: * is preferred, which doesn't call any assembly multiply routine. 684: * We use the alias names mp_modmult, stage_modulus, and modmult_burn 685: * for the corresponding true names, which depend on what flavor of 686: * modmult we are using. 687: * 688: * Before making the first call to mp_modmult, you must set up the 689: * modulus-dependant precomputated tables by calling stage_modulus. 690: * After making all the calls to mp_modmult, you call modmult_burn to 691: * erase the tables created by stage_modulus that were left in memory. 692: */ 693: 694: #ifdef COUNTMULTS 695: /* "number of modmults" counters, used for performance studies. */ 696: static unsigned int tally_modmults = 0; 697: static unsigned int tally_modsquares = 0; 1.1.1.7 ! root 698: #endif /* COUNTMULTS */ 1.1.1.6 root 699: 700: 701: #ifdef PEASANT 702: /* Conventional Russian peasant multiply with modulo algorithm. */ 703: 704: static unitptr pmodulus = 0; /* used only by mp_modmult */ 705: 706: /* 707: * Must pass modulus to stage_modulus before calling modmult. 708: * Assumes that global_precision has already been adjusted to the 709: * size of the modulus, plus SLOP_BITS. 710: */ 1.1.1.7 ! root 711: int stage_peasant_modulus(unitptr n) 1.1.1.6 root 712: { /* For this simple version of modmult, just copy unit pointer. */ 1.1.1.7 ! root 713: pmodulus = n; ! 714: return 0; /* normal return */ ! 715: } /* stage_peasant_modulus */ 1.1.1.6 root 716: 717: /* "Russian peasant" multiply algorithm, combined with a modulo 718: * operation. This is a simple naive replacement for Merritt's 719: * faster modmult algorithm. References global unitptr "modulus". 720: * Computes: prod = (multiplicand*multiplier) mod modulus 721: * WARNING: All the arguments must be less than the modulus! 722: */ 1.1.1.7 ! root 723: int peasant_modmult(register unitptr prod, ! 724: unitptr multiplicand, register unitptr multiplier) 1.1.1.6 root 725: { 1.1.1.7 ! root 726: int bits; ! 727: register unit bitmask; ! 728: short mprec; ! 729: mp_init(prod, 0); ! 730: /* if (testeq(multiplicand,0)) ! 731: return 0; *//* zero multiplicand means zero product */ ! 732: /* normalize and compute number of bits in multiplier first */ ! 733: init_bitsniffer(multiplier, bitmask, mprec, bits); ! 734: ! 735: while (bits--) { ! 736: mp_shift_left(prod); ! 737: msub(prod, pmodulus); /* turns mult into modmult */ ! 738: if (sniff_bit(multiplier, bitmask)) { ! 739: mp_add(prod, multiplicand); ! 740: msub(prod, pmodulus); /* turns mult into modmult */ ! 741: } ! 742: bump_bitsniffer(multiplier, bitmask); ! 743: } ! 744: return 0; ! 745: } /* peasant_modmult */ 1.1.1.6 root 746: 747: /* 748: * If we are using a version of mp_modmult that uses internal tables 749: * in memory, we have to call modmult_burn() at the end of mp_modexp. 750: * This is so that no sensitive data is left in memory after the program 751: * exits. The Russian peasant method doesn't use any such tables. 752: */ 1.1.1.7 ! root 753: 1.1.1.6 root 754: /* 755: * Alias for modmult_burn, called only from mp_modexp(). Destroys 756: * internal modmult tables. This version does nothing because no 757: * tables are used by the Russian peasant modmult. 758: */ 1.1.1.7 ! root 759: void peasant_burn(void) ! 760: { ! 761: } /* peasant_burn */ 1.1.1.6 root 762: 1.1.1.7 ! root 763: #endif /* PEASANT */ 1.1.1.6 root 764: 765: 766: #ifdef MERRITT 767: /*=========================================================================*/ 768: /* 1.1.1.7 ! root 769: This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ. ! 770: Also refined by Zhahai Stewart to reduce the number of subtracts. ! 771: Modified by Raymond Brand to reduce the number of SLOP_BITS by 1. ! 772: It performs a multiply combined with a modulo operation, without ! 773: going into "double precision". It is faster than the Russian peasant ! 774: method, and still works well on machines that lack a fast hardware ! 775: multiply instruction. ! 776: */ 1.1.1.6 root 777: 1.1.1.7 ! root 778: /* The following support functions, macros, and data structures ! 779: are used only by Merritt's modmult algorithm... */ 1.1.1.6 root 780: 1.1.1.7 ! root 781: /* Shift r1 1 whole unit to the left. Used only by modmult function. */ 1.1.1.6 root 782: static void mp_lshift_unit(register unitptr r1) 783: { 1.1.1.7 ! root 784: register short precision; ! 785: register unitptr r2; ! 786: precision = global_precision; ! 787: make_msbptr(r1, precision); ! 788: r2 = r1; ! 789: while (--precision) ! 790: *post_lowerunit(r1) = *pre_lowerunit(r2); ! 791: *r1 = 0; ! 792: } /* mp_lshift_unit */ 1.1.1.6 root 793: 794: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */ 1.1.1.7 ! root 795: static unit moduli_buf[UNITSIZE - 1][MAX_UNIT_PRECISION] = ! 796: {0}; ! 797: static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */ ! 798: {0, moduli_buf[0], moduli_buf[1], moduli_buf[2], ! 799: moduli_buf[3], moduli_buf[4], moduli_buf[5], moduli_buf[6] 1.1.1.6 root 800: #ifndef UNIT8 1.1.1.7 ! root 801: ,moduli_buf[7], moduli_buf[8], moduli_buf[9], moduli_buf[10], ! 802: moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14] ! 803: #ifndef UNIT16 /* and not UNIT8 */ ! 804: ,moduli_buf[15], moduli_buf[16], moduli_buf[17], moduli_buf[18], ! 805: moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22], ! 806: moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26], ! 807: moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30] ! 808: #endif /* UNIT16 and UNIT8 not defined */ ! 809: #endif /* UNIT8 not defined */ 1.1.1.6 root 810: }; 811: 812: /* 813: * To optimize msubs, need following 2 unit arrays, each filled 814: * with the most significant unit and next-to-most significant unit 815: * of the preshifted images of the modulus. 816: */ 1.1.1.7 ! root 817: static unit msu_moduli[UNITSIZE] = ! 818: {0}; /* most signif. unit */ ! 819: static unit nmsu_moduli[UNITSIZE] = ! 820: {0}; /* next-most signif. unit */ 1.1.1.6 root 821: 822: /* 823: * mpdbuf contains preshifted images of the multiplicand, mod n. 824: * It is used only by mp_modmult. It could be staticly declared 825: * inside of mp_modmult, but we put it outside mp_modmult so that 826: * it can be wiped clean by modmult_burn(), which is called at the 827: * end of mp_modexp. This is so that no sensitive data is left in 828: * memory after the program exits. 829: */ 1.1.1.7 ! root 830: static unit mpdbuf[UNITSIZE - 1][MAX_UNIT_PRECISION] = ! 831: {0}; 1.1.1.6 root 832: 833: /* 834: * Computes UNITSIZE images of r, each shifted left 1 more bit. 835: * Used only by modmult function. 836: */ 1.1.1.7 ! root 837: static void stage_mp_images(unitptr images[UNITSIZE], unitptr r) 1.1.1.6 root 838: { 1.1.1.7 ! root 839: short int i; 1.1.1.6 root 840: 1.1.1.7 ! root 841: images[0] = r; /* no need to move the first image, just copy ptr */ ! 842: for (i = 1; i < UNITSIZE; i++) { ! 843: mp_move(images[i], images[i - 1]); ! 844: mp_shift_left(images[i]); ! 845: msub(images[i], moduli[0]); ! 846: } ! 847: } /* stage_mp_images */ 1.1.1.6 root 848: 849: /* 850: * Computes UNITSIZE images of modulus, each shifted left 1 more bit. 851: * Before calling mp_modmult, you must first stage the moduli images by 852: * calling stage_modulus. n is the pointer to the modulus. 853: * Assumes that global_precision has already been adjusted to the 854: * size of the modulus, plus SLOP_BITS. 855: */ 1.1.1.7 ! root 856: int stage_merritt_modulus(unitptr n) 1.1.1.6 root 857: { 1.1.1.7 ! root 858: short int i; ! 859: unitptr msu; /* ptr to most significant unit, for faster msubs */ 1.1.1.6 root 860: 1.1.1.7 ! root 861: moduli[0] = n; /* no need to move the first image, just copy ptr */ 1.1.1.6 root 862: 1.1.1.7 ! root 863: /* used by optimized msubs macro... */ ! 864: msu = msbptr(n, global_precision); /* needed by msubs */ ! 865: msu_moduli[0] = *post_lowerunit(msu); ! 866: nmsu_moduli[0] = *msu; ! 867: ! 868: for (i = 1; i < UNITSIZE; i++) { ! 869: mp_move(moduli[i], moduli[i - 1]); ! 870: mp_shift_left(moduli[i]); 1.1.1.6 root 871: 1.1.1.7 ! root 872: /* used by optimized msubs macro... */ ! 873: msu = msbptr(moduli[i], global_precision); /* needed by msubs */ ! 874: msu_moduli[i] = *post_lowerunit(msu); /* for faster msubs */ ! 875: nmsu_moduli[i] = *msu; ! 876: } ! 877: return 0; /* normal return */ ! 878: } /* stage_merritt_modulus */ 1.1.1.6 root 879: 880: /* The following macros, sniffadd and msubs, are used by modmult... */ 881: #define sniffadd(i) if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i]) 882: 883: /* Unoptimized msubs macro (msubs0) follows... */ 884: /* #define msubs0(i) msub(prod,moduli[i]) 1.1.1.7 ! root 885: */ 1.1.1.6 root 886: 887: /* 888: * To optimize msubs, compare the most significant units of the 889: * product and the shifted modulus before deciding to call the full 890: * mp_compare routine. Better still, compare the upper two units 891: * before deciding to call mp_compare. 892: * Optimization of msubs relies heavily on standard C left-to-right 893: * parsimonious evaluation of logical expressions. 894: */ 895: 896: /* Partially-optimized msubs macro (msubs1) follows... */ 897: /* #define msubs1(i) if ( \ 1.1.1.7 ! root 898: ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \ ! 899: (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \ ! 900: ) mp_sub(prod,moduli[i]) ! 901: */ 1.1.1.6 root 902: 903: /* Fully-optimized msubs macro (msubs2) follows... */ 904: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \ 905: (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \ 906: (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \ 907: mp_sub(prod,moduli[i]) 908: 909: /* 910: * Performs combined multiply/modulo operation. 911: * Computes: prod = (multiplicand*multiplier) mod modulus 912: * WARNING: All the arguments must be less than the modulus! 913: * Assumes the modulus has been predefined by first calling 914: * stage_modulus. 915: */ 1.1.1.7 ! root 916: int merritt_modmult(register unitptr prod, ! 917: unitptr multiplicand, register unitptr multiplier) 1.1.1.6 root 918: { 1.1.1.7 ! root 919: /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro... */ ! 920: register signedunit p_m; ! 921: register unitptr msu_prod; /* ptr to most significant unit of product */ ! 922: register unitptr nmsu_prod; /* next-most signif. unit of product */ ! 923: short mprec; /* precision of multiplier, in units */ ! 924: /* Array mpd contains a list of pointers to preshifted images of ! 925: the multiplicand: */ ! 926: static unitptr mpd[UNITSIZE] = ! 927: { ! 928: 0, mpdbuf[0], mpdbuf[1], mpdbuf[2], ! 929: mpdbuf[3], mpdbuf[4], mpdbuf[5], mpdbuf[6] 1.1.1.6 root 930: #ifndef UNIT8 1.1.1.7 ! root 931: ,mpdbuf[7], mpdbuf[8], mpdbuf[9], mpdbuf[10], ! 932: mpdbuf[11], mpdbuf[12], mpdbuf[13], mpdbuf[14] ! 933: #ifndef UNIT16 /* and not UNIT8 */ ! 934: ,mpdbuf[15], mpdbuf[16], mpdbuf[17], mpdbuf[18], ! 935: mpdbuf[19], mpdbuf[20], mpdbuf[21], mpdbuf[22], ! 936: mpdbuf[23], mpdbuf[24], mpdbuf[25], mpdbuf[26], ! 937: mpdbuf[27], mpdbuf[28], mpdbuf[29], mpdbuf[30] ! 938: #endif /* UNIT16 and UNIT8 not defined */ ! 939: #endif /* UNIT8 not defined */ ! 940: }; ! 941: ! 942: /* Compute preshifted images of multiplicand, mod n: */ ! 943: stage_mp_images(mpd, multiplicand); ! 944: ! 945: /* To optimize msubs, set up msu_prod and nmsu_prod: */ ! 946: msu_prod = msbptr(prod, global_precision); /* Get ptr to MSU of prod */ ! 947: nmsu_prod = msu_prod; ! 948: post_lowerunit(nmsu_prod); /* Get next-MSU of prod */ ! 949: ! 950: /* ! 951: * To understand this algorithm, it would be helpful to first ! 952: * study the conventional Russian peasant modmult algorithm. ! 953: * This one does about the same thing as Russian peasant, but ! 954: * is organized differently to save some steps. It loops ! 955: * through the multiplier a word (unit) at a time, instead of ! 956: * a bit at a time. It word-shifts the product instead of ! 957: * bit-shifting it, so it should be faster. It also does about ! 958: * half as many subtracts as Russian peasant. ! 959: */ ! 960: ! 961: mp_init(prod, 0); /* Initialize product to 0. */ ! 962: ! 963: /* ! 964: * The way mp_modmult is actually used in cryptographic ! 965: * applications, there will NEVER be a zero multiplier or ! 966: * multiplicand. So there is no need to optimize for that ! 967: * condition. ! 968: */ ! 969: /* if (testeq(multiplicand,0)) ! 970: return 0; *//* zero multiplicand means zero product */ ! 971: /* Normalize and compute number of units in multiplier first: */ ! 972: normalize(multiplier, mprec); ! 973: if (mprec == 0) /* if precision of multiplier is 0 */ ! 974: return 0; /* zero multiplier means zero product */ ! 975: make_msbptr(multiplier, mprec); /* start at MSU of multiplier */ 1.1.1.6 root 976: 1.1.1.7 ! root 977: while (mprec--) { /* Loop for the number of units in the multiplier */ 1.1.1.6 root 978: /* 1.1.1.7 ! root 979: * Shift the product one whole unit to the left. ! 980: * This is part of the multiply phase of modmult. 1.1.1.6 root 981: */ 982: 1.1.1.7 ! root 983: mp_lshift_unit(prod); 1.1.1.6 root 984: 1.1.1.7 ! root 985: /* ! 986: * The product may have grown by as many as UNITSIZE ! 987: * bits. That's why we have global_precision set to the ! 988: * size of the modulus plus UNITSIZE slop bits. ! 989: * Now reduce the product back down by conditionally ! 990: * subtracting the preshifted images of the modulus. ! 991: * This is part of the modulo reduction phase of modmult. ! 992: * The following loop is unrolled for speed, using macros... ! 993: ! 994: for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--) ! 995: if (mp_compare(prod,moduli[i]) >= 0) ! 996: mp_sub(prod,moduli[i]); 1.1.1.6 root 997: */ 998: 999: #ifndef UNIT8 1.1.1.7 ! root 1000: #ifndef UNIT16 /* and not UNIT8 */ ! 1001: msubs(31); ! 1002: msubs(30); ! 1003: msubs(29); ! 1004: msubs(28); ! 1005: msubs(27); ! 1006: msubs(26); ! 1007: msubs(25); ! 1008: msubs(24); ! 1009: msubs(23); ! 1010: msubs(22); ! 1011: msubs(21); ! 1012: msubs(20); ! 1013: msubs(19); ! 1014: msubs(18); ! 1015: msubs(17); ! 1016: msubs(16); ! 1017: #endif /* not UNIT16 and not UNIT8 */ ! 1018: msubs(15); ! 1019: msubs(14); ! 1020: msubs(13); ! 1021: msubs(12); ! 1022: msubs(11); ! 1023: msubs(10); ! 1024: msubs(9); ! 1025: msubs(8); ! 1026: #endif /* not UNIT8 */ ! 1027: msubs(7); ! 1028: msubs(6); ! 1029: msubs(5); 1.1.1.6 root 1030: #ifndef UNIT32 1.1.1.7 ! root 1031: msubs(4); 1.1.1.6 root 1032: #ifndef UNIT16 1.1.1.7 ! root 1033: msubs(3); 1.1.1.6 root 1034: #endif 1035: #endif 1036: 1.1.1.7 ! root 1037: /* ! 1038: * Sniff each bit in the current unit of the multiplier, ! 1039: * and conditionally add the corresponding preshifted ! 1040: * image of the multiplicand to the product. ! 1041: * This is also part of the multiply phase of modmult. ! 1042: * ! 1043: * The following loop is unrolled for speed, using macros... ! 1044: ! 1045: for (i=UNITSIZE-1; i>=0; i--) ! 1046: if (*multiplier & power_of_2(i)) ! 1047: mp_add(prod,mpd[i]); ! 1048: */ 1.1.1.6 root 1049: #ifndef UNIT8 1.1.1.7 ! root 1050: #ifndef UNIT16 /* and not UNIT8 */ ! 1051: sniffadd(31); ! 1052: sniffadd(30); ! 1053: sniffadd(29); ! 1054: sniffadd(28); ! 1055: sniffadd(27); ! 1056: sniffadd(26); ! 1057: sniffadd(25); ! 1058: sniffadd(24); ! 1059: sniffadd(23); ! 1060: sniffadd(22); ! 1061: sniffadd(21); ! 1062: sniffadd(20); ! 1063: sniffadd(19); ! 1064: sniffadd(18); ! 1065: sniffadd(17); ! 1066: sniffadd(16); ! 1067: #endif /* not UNIT16 and not UNIT8 */ ! 1068: sniffadd(15); ! 1069: sniffadd(14); ! 1070: sniffadd(13); ! 1071: sniffadd(12); ! 1072: sniffadd(11); ! 1073: sniffadd(10); ! 1074: sniffadd(9); ! 1075: sniffadd(8); ! 1076: #endif /* not UNIT8 */ ! 1077: sniffadd(7); ! 1078: sniffadd(6); ! 1079: sniffadd(5); ! 1080: sniffadd(4); ! 1081: sniffadd(3); ! 1082: sniffadd(2); ! 1083: sniffadd(1); ! 1084: sniffadd(0); ! 1085: ! 1086: /* ! 1087: * The product may have grown by as many as LOG_UNITSIZE+1 ! 1088: * bits. ! 1089: * Now reduce the product back down by conditionally ! 1090: * subtracting LOG_UNITSIZE+1 preshifted images of the ! 1091: * modulus. This is the modulo reduction phase of modmult. ! 1092: * ! 1093: * The following loop is unrolled for speed, using macros... ! 1094: ! 1095: for (i=LOG_UNITSIZE; i>=0; i--) ! 1096: if (mp_compare(prod,moduli[i]) >= 0) ! 1097: mp_sub(prod,moduli[i]); ! 1098: */ 1.1.1.6 root 1099: 1100: #ifndef UNIT8 1101: #ifndef UNIT16 1.1.1.7 ! root 1102: msubs(5); 1.1.1.6 root 1103: #endif 1.1.1.7 ! root 1104: msubs(4); 1.1.1.6 root 1105: #endif 1.1.1.7 ! root 1106: msubs(3); ! 1107: msubs(2); ! 1108: msubs(1); ! 1109: msubs(0); 1.1.1.6 root 1110: 1.1.1.7 ! root 1111: /* Bump pointer to next lower unit of multiplier: */ ! 1112: post_lowerunit(multiplier); 1.1.1.6 root 1113: 1.1.1.7 ! root 1114: } /* Loop for the number of units in the ! 1115: multiplier */ 1.1.1.6 root 1116: 1.1.1.7 ! root 1117: return 0; /* normal return */ 1.1.1.6 root 1118: 1.1.1.7 ! root 1119: } /* merritt_modmult */ 1.1.1.6 root 1120: 1121: #undef msubs 1122: #undef sniffadd 1123: 1124: /* 1125: * Merritt's mp_modmult function leaves some internal tables in memory, 1126: * so we have to call modmult_burn() at the end of mp_modexp. 1127: * This is so that no cryptographically sensitive data is left in memory 1128: * after the program exits. 1129: */ 1.1.1.7 ! root 1130: 1.1.1.6 root 1131: /* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */ 1.1.1.7 ! root 1132: void merritt_burn(void) 1.1.1.6 root 1133: { 1.1.1.7 ! root 1134: unitfill0(&(mpdbuf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION); ! 1135: unitfill0(&(moduli_buf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION); ! 1136: unitfill0(msu_moduli, UNITSIZE); ! 1137: unitfill0(nmsu_moduli, UNITSIZE); ! 1138: } /* merritt_burn() */ 1.1.1.6 root 1139: 1140: /******* end of Merritt's MODMULT stuff. *******/ 1141: /*=========================================================================*/ 1.1.1.7 ! root 1142: #endif /* MERRITT */ 1.1.1.6 root 1143: 1144: 1.1.1.7 ! root 1145: #ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's ! 1146: modmult algorithms */ 1.1.1.6 root 1147: 1148: /* 1149: * Jimmy Upton's multiprecision modmult algorithm in C. 1150: * Performs a multiply combined with a modulo operation. 1151: * 1152: * The following support functions and data structures 1153: * are used only by Upton's modmult algorithm... 1154: */ 1155: 1.1.1.7 ! root 1156: short munit_prec; /* global_precision expressed in MULTUNITs */ 1.1.1.6 root 1157: 1158: /* 1159: * Note that since the SPARC CPU has no hardware integer multiply 1160: * instruction, there is not that much advantage in having an 1161: * assembly version of mp_smul on that machine. It might be faster 1162: * to use Merritt's modmult instead of Upton's modmult on the SPARC. 1163: */ 1164: 1165: /* 1166: * Multiply the single-word multiplier times the multiprecision integer 1167: * in multiplicand, accumulating result in prod. The resulting 1168: * multiprecision prod will be 1 word longer than the multiplicand. 1169: * multiplicand is munit_prec words long. We add into prod, so caller 1170: * should zero it out first. For best results, this time-critical 1171: * function should be implemented in assembly. 1172: * NOTE: Unlike other functions in the multiprecision arithmetic 1173: * library, both multiplicand and prod are pointing at the LSB, 1174: * regardless of byte order of the machine. On an 80x86, this makes 1175: * no difference. But if this assembly function is implemented 1176: * on a 680x0, it becomes important. 1177: */ 1178: 1179: /* 1180: * Note that this has been modified from the previous version to allow 1181: * better support for Smith's modmult: 1.1.1.7 ! root 1182: * The final carry bit is added to the existing product ! 1183: * array, rather than simply stored. ! 1184: */ 1.1.1.6 root 1185: 1186: #ifndef mp_smul 1.1.1.7 ! root 1187: void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier) 1.1.1.6 root 1188: { 1.1.1.7 ! root 1189: short i; ! 1190: unsigned long p, carry; ! 1191: ! 1192: carry = 0; ! 1193: for (i = 0; i < munit_prec; ++i) { ! 1194: p = (unsigned long) multiplier **post_higherunit(multiplicand); ! 1195: p += *prod + carry; ! 1196: *post_higherunit(prod) = (MULTUNIT) p; ! 1197: carry = p >> MULTUNITSIZE; ! 1198: } ! 1199: /* Add carry to the next higher word of product / dividend */ ! 1200: *prod += (MULTUNIT) carry; ! 1201: } /* mp_smul */ 1.1.1.6 root 1202: 1203: #endif 1204: 1205: /* 1206: * mp_dmul is a double-precision multiply multiplicand times multiplier, 1207: * result into prod. prod must be pointing at a "double precision" 1208: * buffer. E.g. If global_precision is 10 words, prod must be 1209: * pointing at a 20-word buffer. 1210: */ 1211: #ifndef mp_dmul 1.1.1.7 ! root 1212: void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier) 1.1.1.6 root 1213: { 1.1.1.7 ! root 1214: register int i; ! 1215: register MULTUNIT *p_multiplicand, *p_multiplier; ! 1216: register MULTUNIT *prodp; ! 1217: ! 1218: ! 1219: unitfill0(prod, global_precision * 2); /* Pre-zero prod */ ! 1220: /* Calculate precision in units of MULTUNIT */ ! 1221: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; ! 1222: p_multiplicand = (MULTUNIT *) multiplicand; ! 1223: p_multiplier = (MULTUNIT *) multiplier; ! 1224: prodp = (MULTUNIT *) prod; ! 1225: make_lsbptr(p_multiplicand, munit_prec); ! 1226: make_lsbptr(p_multiplier, munit_prec); ! 1227: make_lsbptr(prodp, munit_prec * 2); ! 1228: /* Multiply multiplicand by each word in multiplier, accumulating prod: */ ! 1229: for (i = 0; i < munit_prec; ++i) ! 1230: mp_smul(post_higherunit(prodp), ! 1231: p_multiplicand, *post_higherunit(p_multiplier)); ! 1232: } /* mp_dmul */ ! 1233: #endif /* mp_dmul */ 1.1.1.6 root 1234: 1235: static unit ALIGN modulus[MAX_UNIT_PRECISION]; 1.1.1.7 ! root 1236: static short nbits; /* number of modulus significant bits */ ! 1237: #endif /* UPTON_OR_SMITH */ 1.1.1.6 root 1238: 1239: #ifdef UPTON 1240: 1241: /* 1242: * These scratchpad arrays are used only by upton_modmult (mp_modmult). 1243: * Some of them could be staticly declared inside of mp_modmult, but we 1244: * put them outside mp_modmult so that they can be wiped clean by 1245: * modmult_burn(), which is called at the end of mp_modexp. This is 1246: * so that no sensitive data is left in memory after the program exits. 1247: */ 1248: 1249: static unit ALIGN reciprocal[MAX_UNIT_PRECISION]; 1250: static unit ALIGN dhi[MAX_UNIT_PRECISION]; 1.1.1.7 ! root 1251: static unit ALIGN d_data[MAX_UNIT_PRECISION * 2]; /* double-wide ! 1252: register */ ! 1253: static unit ALIGN e_data[MAX_UNIT_PRECISION * 2]; /* double-wide ! 1254: register */ ! 1255: static unit ALIGN f_data[MAX_UNIT_PRECISION * 2]; /* double-wide ! 1256: register */ 1.1.1.6 root 1257: 1258: static short nbitsDivUNITSIZE; 1259: static short nbitsModUNITSIZE; 1260: 1261: /* 1262: * stage_upton_modulus() is aliased to stage_modulus(). 1263: * Prepare for an Upton modmult. Calculate the reciprocal of modulus, 1264: * and save both. Note that reciprocal will have one more bit than 1265: * modulus. 1266: * Assumes that global_precision has already been adjusted to the 1267: * size of the modulus, plus SLOP_BITS. 1268: */ 1269: int stage_upton_modulus(unitptr n) 1270: { 1.1.1.7 ! root 1271: mp_move(modulus, n); ! 1272: mp_recip(reciprocal, modulus); ! 1273: nbits = countbits(modulus); ! 1274: nbitsDivUNITSIZE = nbits / UNITSIZE; ! 1275: nbitsModUNITSIZE = nbits % UNITSIZE; ! 1276: return 0; /* normal return */ ! 1277: } /* stage_upton_modulus */ 1.1.1.6 root 1278: 1279: 1280: /* 1281: * Upton's algorithm performs a multiply combined with a modulo operation. 1282: * Computes: prod = (multiplicand*multiplier) mod modulus 1283: * WARNING: All the arguments must be less than the modulus! 1284: * References global unitptr modulus and reciprocal. 1285: * The reciprocal of modulus is 1 bit longer than the modulus. 1286: * upton_modmult() is aliased to mp_modmult(). 1287: */ 1.1.1.7 ! root 1288: int upton_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) 1.1.1.6 root 1289: { 1.1.1.7 ! root 1290: unitptr d = d_data; ! 1291: unitptr d1 = d_data; ! 1292: unitptr e = e_data; ! 1293: unitptr f = f_data; ! 1294: short orig_precision; 1.1.1.6 root 1295: 1.1.1.7 ! root 1296: orig_precision = global_precision; ! 1297: mp_dmul(d, multiplicand, multiplier); 1.1.1.6 root 1298: 1.1.1.7 ! root 1299: /* Throw off low nbits of d */ 1.1.1.6 root 1300: #ifndef HIGHFIRST 1.1.1.7 ! root 1301: d1 = d + nbitsDivUNITSIZE; 1.1.1.6 root 1302: #else 1.1.1.7 ! root 1303: d1 = d + orig_precision - nbitsDivUNITSIZE; 1.1.1.6 root 1304: #endif 1.1.1.7 ! root 1305: mp_move(dhi, d1); /* Don't screw up d, we need it later */ ! 1306: mp_shift_right_bits(dhi, nbitsModUNITSIZE); 1.1.1.6 root 1307: 1.1.1.7 ! root 1308: mp_dmul(e, dhi, reciprocal); /* Note - reciprocal has nbits+1 bits */ 1.1.1.6 root 1309: 1310: #ifndef HIGHFIRST 1.1.1.7 ! root 1311: e += nbitsDivUNITSIZE; 1.1.1.6 root 1312: #else 1.1.1.7 ! root 1313: e += orig_precision - nbitsDivUNITSIZE; 1.1.1.6 root 1314: #endif 1.1.1.7 ! root 1315: mp_shift_right_bits(e, nbitsModUNITSIZE); 1.1.1.6 root 1316: 1.1.1.7 ! root 1317: mp_dmul(f, e, modulus); 1.1.1.6 root 1318: 1.1.1.7 ! root 1319: /* Now for the only double-precision call to mpilib: */ ! 1320: set_precision(orig_precision * 2); ! 1321: mp_sub(d, f); ! 1322: ! 1323: /* d's precision should be <= orig_precision */ ! 1324: rescale(d, orig_precision * 2, orig_precision); ! 1325: set_precision(orig_precision); ! 1326: ! 1327: /* Should never have to do this final subtract more than twice: */ ! 1328: while (mp_compare(d, modulus) > 0) ! 1329: mp_sub(d, modulus); ! 1330: ! 1331: mp_move(prod, d); ! 1332: return 0; /* normal return */ ! 1333: } /* upton_modmult */ ! 1334: ! 1335: /* Upton's mp_modmult function leaves some internal arrays in memory, ! 1336: so we have to call modmult_burn() at the end of mp_modexp. ! 1337: This is so that no cryptographically sensitive data is left in memory ! 1338: after the program exits. ! 1339: upton_burn() is aliased to modmult_burn(). ! 1340: */ 1.1.1.6 root 1341: void upton_burn(void) 1342: { 1.1.1.7 ! root 1343: unitfill0(modulus, MAX_UNIT_PRECISION); ! 1344: unitfill0(reciprocal, MAX_UNIT_PRECISION); ! 1345: unitfill0(dhi, MAX_UNIT_PRECISION); ! 1346: unitfill0(d_data, MAX_UNIT_PRECISION * 2); ! 1347: unitfill0(e_data, MAX_UNIT_PRECISION * 2); ! 1348: unitfill0(f_data, MAX_UNIT_PRECISION * 2); ! 1349: nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0; ! 1350: } /* upton_burn */ 1.1.1.6 root 1351: 1352: /******* end of Upton's MODMULT stuff. *******/ 1353: /*=========================================================================*/ 1.1.1.7 ! root 1354: #endif /* UPTON */ 1.1.1.6 root 1355: 1.1.1.7 ! root 1356: #ifdef SMITH /* using Thad Smith's modmult algorithm */ 1.1.1.6 root 1357: 1358: /* 1359: * Thad Smith's implementation of multiprecision modmult algorithm in C. 1360: * Performs a multiply combined with a modulo operation. 1361: * The multiplication is done with mp_dmul, the same as for Upton's 1362: * modmult. The modulus reduction is done by long division, in 1363: * which a trial quotient "digit" is determined, then the product of 1364: * that digit and the divisor are subtracted from the dividend. 1365: * 1366: * In this case, the digit is MULTUNIT in size and the subtraction 1367: * is done by adding the product to the one's complement of the 1368: * dividend, which allows use of the existing mp_smul routine. 1369: * 1370: * The following support functions and data structures 1371: * are used only by Smith's modmult algorithm... 1372: */ 1373: 1374: /* 1375: * These scratchpad arrays are used only by smith_modmult (mp_modmult). 1376: * Some of them could be statically declared inside of mp_modmult, but we 1377: * put them outside mp_modmult so that they can be wiped clean by 1378: * modmult_burn(), which is called at the end of mp_modexp. This is 1379: * so that no sensitive data is left in memory after the program exits. 1380: */ 1381: 1.1.1.7 ! root 1382: static unit ALIGN ds_data[MAX_UNIT_PRECISION * 2 + 2]; 1.1.1.6 root 1383: 1.1.1.7 ! root 1384: static unit mod_quotient[4]; ! 1385: static unit mod_divisor[4]; /* 2 most signif. MULTUNITs of modulus */ 1.1.1.6 root 1386: 1.1.1.7 ! root 1387: static MULTUNIT *modmpl; /* ptr to modulus least significant ! 1388: ** MULTUNIT */ ! 1389: static int mshift; /* number of bits for ! 1390: ** recip scaling */ ! 1391: static MULTUNIT reciph; /* MSunit of scaled recip */ ! 1392: static MULTUNIT recipl; /* LSunit of scaled recip */ 1.1.1.6 root 1393: 1.1.1.7 ! root 1394: static short modlenMULTUNITS; /* length of modulus in MULTUNITs */ ! 1395: static MULTUNIT mutemp; /* temporary */ 1.1.1.6 root 1396: 1397: /* 1398: * The routines mp_smul and mp_dmul are the same as for UPTON and 1399: * should be coded in assembly. Note, however, that the previous 1400: * Upton's mp_smul version has been modified to compatible with 1401: * Smith's modmult. The new version also still works for Upton's 1402: * modmult. 1403: */ 1404: 1405: #ifndef mp_set_recip 1.1.1.7 ! root 1406: #define mp_set_recip(rh,rl,m) /* null */ 1.1.1.6 root 1407: #else 1408: /* setup routine for external mp_quo_digit */ 1409: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m); 1410: #endif 1.1.1.7 ! root 1411: MULTUNIT mp_quo_digit(MULTUNIT * dividend); 1.1.1.6 root 1412: 1413: #ifdef MULTUNIT_SIZE_SAME 1414: #define mp_musubb mp_subb /* use existing routine */ 1.1.1.7 ! root 1415: #else /* ! MULTUNIT_SIZE_SAME */ 1.1.1.6 root 1416: 1417: /* 1418: * This performs the same function as mp_subb, but with MULTUNITs. 1419: * Note: Processors without alignment requirements may be able to use 1420: * mp_subb, even though MULTUNITs are smaller than units. In that case, 1421: * use mp_subb, since it would be faster if coded in assembly. Note that 1422: * this implementation won't work for MULTUNITs which are long -- use 1423: * mp_subb in that case. 1424: */ 1425: #ifndef mp_musubb 1.1.1.7 ! root 1426: 1.1.1.6 root 1427: /* 1428: * multiprecision subtract of MULTUNITs with borrow, r2 from r1, result in r1 1429: * borrow is incoming borrow flag-- value should be 0 or 1 1430: */ 1.1.1.7 ! root 1431: boolean mp_musubb ! 1432: (register MULTUNIT * r1, register MULTUNIT * r2, register boolean borrow) 1.1.1.6 root 1433: { 1.1.1.7 ! root 1434: register ulint x; /* won't work if sizeof(MULTUNIT)== ! 1435: sizeof(long) */ ! 1436: short precision; /* number of MULTUNITs to subtract */ ! 1437: precision = global_precision * MULTUNITs_perunit; ! 1438: make_lsbptr(r1, precision); ! 1439: make_lsbptr(r2, precision); ! 1440: while (precision--) { ! 1441: x = (ulint) * r1 - (ulint) * post_higherunit(r2) - (ulint) borrow; ! 1442: *post_higherunit(r1) = x; ! 1443: borrow = (((1L << MULTUNITSIZE) & x) != 0L); ! 1444: } ! 1445: return (borrow); ! 1446: } /* mp_musubb */ ! 1447: #endif /* mp_musubb */ ! 1448: #endif /* MULTUNIT_SIZE_SAME */ 1.1.1.6 root 1449: 1450: /* 1451: * The function mp_quo_digit is the heart of Smith's modulo reduction, 1452: * which uses a form of long division. It computes a trial quotient 1453: * "digit" (MULTUNIT-sized digit) by multiplying the three most 1454: * significant MULTUNITs of the dividend by the two most significant 1455: * MULTUNITs of the reciprocal of the modulus. Note that this function 1456: * requires that MULTUNITSIZE * 2 <= sizeof(unsigned long). 1457: * 1458: * An important part of this technique is that the quotient never be 1459: * too small, although it may occasionally be too large. This was 1460: * done to eliminate the need to check and correct for a remainder 1.1.1.7 ! root 1461: * exceeding the divisor. It is easier to check for a negative 1.1.1.6 root 1462: * remainder. The following technique rarely needs correction for 1463: * MULTUNITs of at least 16 bits. 1464: * 1465: * The following routine has two implementations: 1466: * 1467: * ASM_PROTOTYPE defined: written to be an executable prototype for 1468: * an efficient assembly language implementation. Note that several 1469: * of the following masks and shifts can be done by directly 1470: * manipulating single precision registers on some architectures. 1471: * 1472: * ASM_PROTOTYPE undefined: a slightly more efficient implementation 1473: * in C. Although this version returns a result larger than the 1474: * optimum (which is corrected in smith_modmult) more often than the 1475: * prototype version, the faster execution in C more than makes up 1476: * for the difference. 1477: * 1478: * Parameter: dividend - points to the most significant MULTUNIT 1.1.1.7 ! root 1479: * of the dividend. Note that dividend actually contains the ! 1480: * one's complement of the actual dividend value (see comments for ! 1481: * smith_modmult). 1.1.1.6 root 1482: * 1483: * Return: the trial quotient digit resulting from dividing the first 1.1.1.7 ! root 1484: * three MULTUNITs at dividend by the upper two MULTUNITs of the ! 1485: * modulus. 1.1.1.6 root 1486: */ 1487: 1.1.1.7 ! root 1488: /* #define ASM_PROTOTYPE *//* undefined: use C-optimized version */ 1.1.1.6 root 1489: 1490: #ifndef mp_quo_digit 1.1.1.7 ! root 1491: MULTUNIT mp_quo_digit(MULTUNIT * dividend) ! 1492: { ! 1493: unsigned long q, q0, q1, q2; ! 1494: unsigned short lsb_factor; 1.1.1.6 root 1495: 1496: /* 1497: * Compute the least significant product group. 1.1.1.7 ! root 1498: * The last terms of q1 and q2 perform upward rounding, which is ! 1499: * needed to guarantee that the result not be too small. 1.1.1.6 root 1500: */ 1.1.1.7 ! root 1501: q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long) reciph ! 1502: + reciph; ! 1503: q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) recipl ! 1504: + (1L << MULTUNITSIZE); 1.1.1.6 root 1505: #ifdef ASM_PROTOTYPE 1.1.1.7 ! root 1506: lsb_factor = 1 & (q1 >> MULTUNITSIZE) & (q2 >> MULTUNITSIZE); ! 1507: q = q1 + q2; 1.1.1.6 root 1508: 1.1.1.7 ! root 1509: /* The following statement is equivalent to shifting the sum right ! 1510: one bit while shifting in the carry bit. ! 1511: */ ! 1512: q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); ! 1513: #else /* optimized C version */ ! 1514: q0 = (q1 >> 1) + (q2 >> 1) + 1; 1.1.1.6 root 1515: #endif 1516: 1.1.1.7 ! root 1517: /* Compute the middle significant product group. */ 1.1.1.6 root 1518: 1.1.1.7 ! root 1519: q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) reciph; ! 1520: q2 = (dividend[0] ^ MULTUNIT_mask) * (unsigned long) recipl; 1.1.1.6 root 1521: #ifdef ASM_PROTOTYPE 1.1.1.7 ! root 1522: q = q1 + q2; ! 1523: q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); 1.1.1.6 root 1524: 1.1.1.7 ! root 1525: /* Add in the most significant word of the first group. ! 1526: The last term takes care of the carry from adding the lsb's ! 1527: that were shifted out prior to addition. ! 1528: */ ! 1529: q = (q0 >> MULTUNITSIZE) + q + (lsb_factor & (q1 ^ q2)); ! 1530: #else /* optimized C version */ ! 1531: q = (q0 >> MULTUNITSIZE) + (q1 >> 1) + (q2 >> 1) + 1; 1.1.1.6 root 1532: #endif 1533: 1.1.1.7 ! root 1534: /* Compute the most significant term and add in the others */ 1.1.1.6 root 1535: 1.1.1.7 ! root 1536: q = (q >> (MULTUNITSIZE - 2)) + ! 1537: (((dividend[0] ^ MULTUNIT_mask) * (unsigned long) reciph) << 1); ! 1538: q >>= mshift; ! 1539: ! 1540: /* Prevent overflow and then wipe out the intermediate results. */ ! 1541: ! 1542: mutemp = (MULTUNIT) min(q, (1L << MULTUNITSIZE) - 1); ! 1543: q = q0 = q1 = q2 = 0; ! 1544: lsb_factor = 0; ! 1545: (void) lsb_factor; ! 1546: return mutemp; 1.1.1.6 root 1547: } 1.1.1.7 ! root 1548: #endif /* mp_quo_digit */ 1.1.1.6 root 1549: 1550: /* 1551: * stage_smith_modulus() - Prepare for a Smith modmult. 1552: * 1553: * Calculate the reciprocal of modulus with a precision of two MULTUNITs. 1554: * Assumes that global_precision has already been adjusted to the 1555: * size of the modulus, plus SLOP_BITS. 1556: * 1557: * Note: This routine was designed to work with large values and 1558: * doesn't have the necessary testing or handling to work with a 1559: * modulus having less than three significant units. For such cases, 1560: * the separate multiply and modulus routines can be used. 1561: * 1562: * stage_smith_modulus() is aliased to stage_modulus(). 1563: */ 1564: int stage_smith_modulus(unitptr n_modulus) 1565: { 1.1.1.7 ! root 1566: int original_precision; ! 1567: int sigmod; /* significant units in modulus */ ! 1568: unitptr mp; /* modulus most significant pointer */ ! 1569: MULTUNIT *mpm; /* reciprocal pointer */ ! 1570: int prec; /* precision of reciprocal calc in units */ ! 1571: ! 1572: mp_move(modulus, n_modulus); ! 1573: modmpl = (MULTUNIT *) modulus; ! 1574: modmpl = lsbptr(modmpl, global_precision * MULTUNITs_perunit); ! 1575: nbits = countbits(modulus); ! 1576: modlenMULTUNITS = (nbits + MULTUNITSIZE - 1) / MULTUNITSIZE; ! 1577: ! 1578: original_precision = global_precision; ! 1579: ! 1580: /* The following code copies the three most significant units of ! 1581: * modulus to mod_divisor. ! 1582: */ ! 1583: mp = modulus; ! 1584: sigmod = significance(modulus); ! 1585: rescale(mp, original_precision, sigmod); 1.1.1.6 root 1586: /* prec is the unit precision required for 3 MULTUNITs */ 1.1.1.7 ! root 1587: prec = (3 + (MULTUNITs_perunit - 1)) / MULTUNITs_perunit; ! 1588: set_precision(prec); 1.1.1.6 root 1589: 1.1.1.7 ! root 1590: /* set mp = ptr to most significant units of modulus, then move ! 1591: * the most significant units to mp_divisor ! 1592: */ ! 1593: mp = msbptr(mp, sigmod) - tohigher(prec - 1); ! 1594: unmake_lsbptr(mp, prec); ! 1595: mp_move(mod_divisor, mp); ! 1596: ! 1597: /* Keep 2*MULTUNITSIZE bits in mod_divisor. ! 1598: * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits. ! 1599: */ ! 1600: mshift = countbits(mod_divisor) - 2 * MULTUNITSIZE; ! 1601: mp_shift_right_bits(mod_divisor, mshift); ! 1602: mp_recip(mod_quotient, mod_divisor); ! 1603: mp_shift_right_bits(mod_quotient, 1); ! 1604: ! 1605: /* Reduce to: 0 < mshift <= MULTUNITSIZE */ ! 1606: mshift = ((mshift + (MULTUNITSIZE - 1)) % MULTUNITSIZE) + 1; ! 1607: /* round up, rescaling if necessary */ ! 1608: mp_inc(mod_quotient); ! 1609: if (countbits(mod_quotient) > 2 * MULTUNITSIZE) { ! 1610: mp_shift_right_bits(mod_quotient, 1); ! 1611: mshift--; /* now 0 <= mshift <= MULTUNITSIZE */ ! 1612: } ! 1613: mpm = lsbptr((MULTUNIT *) mod_quotient, prec * MULTUNITs_perunit); ! 1614: recipl = *post_higherunit(mpm); ! 1615: reciph = *mpm; ! 1616: mp_set_recip(reciph, recipl, mshift); ! 1617: set_precision(original_precision); ! 1618: return 0; /* normal return */ ! 1619: } /* stage_smith_modulus */ ! 1620: ! 1621: /* Smith's algorithm performs a multiply combined with a modulo operation. ! 1622: Computes: prod = (multiplicand*multiplier) mod modulus ! 1623: The modulus must first be set by stage_smith_modulus(). ! 1624: smith_modmult() is aliased to mp_modmult(). ! 1625: */ ! 1626: int smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) ! 1627: { ! 1628: unitptr d; /* ptr to product */ ! 1629: MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first) ! 1630: * aligned for subtraction */ ! 1631: /* Note that dmph points one MULTUNIT higher than indicated by ! 1632: global precision. This allows us to zero out a word one higher than ! 1633: the normal precision. ! 1634: */ ! 1635: short orig_precision; ! 1636: short nqd; /* number of quotient digits remaining to ! 1637: * be generated */ ! 1638: short dmi; /* number of significant MULTUNITs in ! 1639: product */ ! 1640: ! 1641: d = ds_data + 1; /* room for leading MSB if HIGHFIRST */ ! 1642: orig_precision = global_precision; ! 1643: mp_dmul(d, multiplicand, multiplier); ! 1644: ! 1645: rescale(d, orig_precision * 2, orig_precision * 2 + 1); ! 1646: set_precision(orig_precision * 2 + 1); ! 1647: *msbptr(d, global_precision) = 0; /* leading 0 unit */ ! 1648: ! 1649: /* We now start working with MULTUNITs. ! 1650: Determine the most significant MULTUNIT of the product so we don't ! 1651: have to process leading zeros in our divide loop. ! 1652: */ ! 1653: dmi = significance(d) * MULTUNITs_perunit; ! 1654: if (dmi >= modlenMULTUNITS) { ! 1655: /* Make dividend negative. This allows the use of mp_smul to ! 1656: * "subtract" the product of the modulus and the trial divisor ! 1657: * by actually adding to a negative dividend. ! 1658: * The one's complement of the dividend is used, since it causes ! 1659: * a zero value to be represented as all ones. This facilitates ! 1660: * testing the result for possible overflow, since a sign bit ! 1661: * indicates that no adjustment is necessary, and we should not ! 1662: * attempt to adjust if the result of the addition is zero. ! 1663: */ ! 1664: mp_inc(d); ! 1665: mp_neg(d); ! 1666: set_precision(orig_precision); ! 1667: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; ! 1668: ! 1669: /* Set msb, lsb, and normal ptrs of dividend */ ! 1670: dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) * ! 1671: MULTUNITs_perunit) + tohigher(dmi); ! 1672: nqd = dmi + 1 - modlenMULTUNITS; ! 1673: dmpl = dmph - tohigher(modlenMULTUNITS); 1.1.1.6 root 1674: 1.1.1.7 ! root 1675: /* 1.1.1.6 root 1676: * Divide loop. 1677: * Each iteration computes the next quotient MULTUNIT digit, then 1678: * multiplies the divisor (modulus) by the quotient digit and adds 1679: * it to the one's complement of the dividend (equivalent to 1680: * subtracting). If the product was greater than the remaining dividend, 1681: * we get a non-negative result, in which case we subtract off the 1682: * modulus to get the proper negative remainder. 1683: */ 1.1.1.7 ! root 1684: for (; nqd; nqd--) { ! 1685: MULTUNIT q; /* quotient trial digit */ 1.1.1.6 root 1686: 1.1.1.7 ! root 1687: q = mp_quo_digit(dmph); ! 1688: if (q > 0) { ! 1689: mp_smul(dmpl, modmpl, q); 1.1.1.6 root 1690: 1.1.1.7 ! root 1691: /* Perform correction if q too large. ! 1692: * This rarely occurs. ! 1693: */ ! 1694: if (!(*dmph & MULTUNIT_msb)) { ! 1695: dmp = dmpl; ! 1696: unmake_lsbptr(dmp, orig_precision * ! 1697: MULTUNITs_perunit); ! 1698: if (mp_musubb(dmp, ! 1699: (MULTUNIT *) modulus, 0)) ! 1700: (*dmph)--; ! 1701: } ! 1702: } ! 1703: pre_lowerunit(dmph); ! 1704: pre_lowerunit(dmpl); ! 1705: } ! 1706: /* d contains the one's complement of the remainder. */ ! 1707: rescale(d, orig_precision * 2 + 1, orig_precision); ! 1708: set_precision(orig_precision); ! 1709: mp_neg(d); ! 1710: mp_dec(d); ! 1711: } else { ! 1712: /* Product was less than modulus. Return it. */ ! 1713: rescale(d, orig_precision * 2 + 1, orig_precision); ! 1714: set_precision(orig_precision); ! 1715: } ! 1716: mp_move(prod, d); ! 1717: return (0); /* normal return */ ! 1718: } /* smith_modmult */ ! 1719: ! 1720: /* Smith's mp_modmult function leaves some internal arrays in memory, ! 1721: so we have to call modmult_burn() at the end of mp_modexp. ! 1722: This is so that no cryptographically sensitive data is left in memory ! 1723: after the program exits. ! 1724: smith_burn() is aliased to modmult_burn(). ! 1725: */ 1.1.1.6 root 1726: void smith_burn(void) 1727: { 1.1.1.7 ! root 1728: empty_array(modulus); ! 1729: empty_array(ds_data); ! 1730: empty_array(mod_quotient); ! 1731: empty_array(mod_divisor); ! 1732: modmpl = 0; ! 1733: mshift = nbits = 0; ! 1734: reciph = recipl = 0; ! 1735: modlenMULTUNITS = mutemp = 0; ! 1736: mp_set_recip(0, 0, 0); ! 1737: } /* smith_burn */ 1.1.1.6 root 1738: 1.1.1.7 ! root 1739: /* End of Thad Smith's implementation of modmult. */ 1.1.1.6 root 1740: 1.1.1.7 ! root 1741: #endif /* SMITH */ 1.1.1.6 root 1742: 1.1.1.7 ! root 1743: /* Returns number of significant bits in r */ 1.1.1.6 root 1744: int countbits(unitptr r) 1745: { 1.1.1.7 ! root 1746: int bits; ! 1747: short prec; ! 1748: register unit bitmask; ! 1749: init_bitsniffer(r, bitmask, prec, bits); ! 1750: return bits; ! 1751: } /* countbits */ 1.1.1.6 root 1752: 1753: char *copyright_notice(void) 1754: /* force linker to include copyright notice in the executable object image. */ 1.1.1.7 ! root 1755: { ! 1756: return ("(c)1986 Philip Zimmermann"); ! 1757: } /* copyright_notice */ 1.1.1.6 root 1758: 1759: /* 1760: * Russian peasant combined exponentiation/modulo algorithm. 1761: * Calls modmult instead of mult. 1762: * Computes: expout = (expin**exponent) mod modulus 1763: * WARNING: All the arguments must be less than the modulus! 1764: */ 1.1.1.7 ! root 1765: int mp_modexp(register unitptr expout, register unitptr expin, ! 1766: register unitptr exponent, register unitptr modulus) 1.1.1.6 root 1767: { 1.1.1.7 ! root 1768: int bits; ! 1769: short oldprecision; ! 1770: register unit bitmask; ! 1771: unit product[MAX_UNIT_PRECISION]; ! 1772: short eprec; 1.1.1.6 root 1773: 1774: #ifdef COUNTMULTS 1.1.1.7 ! root 1775: tally_modmults = 0; /* clear "number of modmults" counter */ ! 1776: tally_modsquares = 0; /* clear "number of modsquares" counter */ ! 1777: #endif /* COUNTMULTS */ ! 1778: mp_init(expout, 1); ! 1779: if (testeq(exponent, 0)) { ! 1780: if (testeq(expin, 0)) ! 1781: return -1; /* 0 to the 0th power means return error */ ! 1782: return 0; /* otherwise, zero exponent means expout ! 1783: is 1 */ ! 1784: } ! 1785: if (testeq(modulus, 0)) ! 1786: return -2; /* zero modulus means error */ ! 1787: #if SLOP_BITS > 0 /* if there's room for sign bits */ ! 1788: if (mp_tstminus(modulus)) ! 1789: return -2; /* negative modulus means error */ ! 1790: #endif /* SLOP_BITS > 0 */ ! 1791: if (mp_compare(expin, modulus) >= 0) ! 1792: return -3; /* if expin >= modulus, return error */ ! 1793: if (mp_compare(exponent, modulus) >= 0) ! 1794: return -4; /* if exponent >= modulus, return error */ ! 1795: ! 1796: oldprecision = global_precision; /* save global_precision */ ! 1797: /* set smallest optimum precision for this modulus */ ! 1798: set_precision(bits2units(countbits(modulus) + SLOP_BITS)); ! 1799: /* rescale all these registers to global_precision we just defined */ ! 1800: rescale(modulus, oldprecision, global_precision); ! 1801: rescale(expin, oldprecision, global_precision); ! 1802: rescale(exponent, oldprecision, global_precision); ! 1803: rescale(expout, oldprecision, global_precision); 1.1.1.6 root 1804: 1.1.1.7 ! root 1805: if (stage_modulus(modulus)) { ! 1806: set_precision(oldprecision); /* restore original precision */ ! 1807: return -5; /* unstageable modulus (STEWART algorithm) */ ! 1808: } ! 1809: /* normalize and compute number of bits in exponent first */ ! 1810: init_bitsniffer(exponent, bitmask, eprec, bits); ! 1811: ! 1812: /* We can "optimize out" the first modsquare and modmult: */ ! 1813: bits--; /* We know for sure at this point that ! 1814: bits>0 */ ! 1815: mp_move(expout, expin); /* expout = (1*1)*expin; */ ! 1816: bump_bitsniffer(exponent, bitmask); ! 1817: ! 1818: while (bits--) { ! 1819: poll_for_break(); /* polls keyboard, allows ctrl-C to ! 1820: abort program */ 1.1.1.6 root 1821: #ifdef COUNTMULTS 1.1.1.7 ! root 1822: tally_modsquares++; /* bump "number of modsquares" counter */ ! 1823: #endif /* COUNTMULTS */ ! 1824: mp_modsquare(product, expout); ! 1825: if (sniff_bit(exponent, bitmask)) { ! 1826: mp_modmult(expout, product, expin); 1.1.1.6 root 1827: #ifdef COUNTMULTS 1.1.1.7 ! root 1828: tally_modmults++; /* bump "number of modmults" counter */ ! 1829: #endif /* COUNTMULTS */ ! 1830: } else { ! 1831: mp_move(expout, product); 1.1.1.6 root 1832: } 1.1.1.7 ! root 1833: bump_bitsniffer(exponent, bitmask); ! 1834: } /* while bits-- */ ! 1835: mp_burn(product); /* burn the evidence on the stack */ ! 1836: modmult_burn(); /* ask mp_modmult to also burn its ! 1837: own evidence */ ! 1838: ! 1839: #ifdef COUNTMULTS /* diagnostic analysis */ ! 1840: { ! 1841: long atomic_mults; ! 1842: unsigned int unitcount, totalmults; ! 1843: unitcount = bits2units(countbits(modulus)); ! 1844: /* calculation assumes modsquare takes as long as a modmult: */ ! 1845: atomic_mults = (long) tally_modmults *(unitcount * unitcount); ! 1846: atomic_mults += (long) tally_modsquares *(unitcount * unitcount); ! 1847: printf("%ld atomic mults for ", atomic_mults); ! 1848: printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n", ! 1849: tally_modsquares, tally_modmults, ! 1850: tally_modsquares + tally_modmults, ! 1851: countbits(modulus), unitcount); ! 1852: } ! 1853: #endif /* COUNTMULTS */ ! 1854: ! 1855: set_precision(oldprecision); /* restore original precision */ ! 1856: ! 1857: /* Do an explicit reference to the copyright notice so that the linker ! 1858: will be forced to include it in the executable object image... */ ! 1859: copyright_notice(); /* has no real effect at run time */ 1.1.1.6 root 1860: 1.1.1.7 ! root 1861: return 0; /* normal return */ ! 1862: } /* mp_modexp */ 1.1.1.6 root 1863: 1864: /* 1865: * This is a faster modexp for moduli with a known factorisation into two 1866: * relatively prime factors p and q, and an input relatively prime to the 1867: * modulus, the Chinese Remainder Theorem to do the computation mod p and 1868: * mod q, and then combine the results. This relies on a number of 1869: * precomputed values, but does not actually require the modulus n or the 1870: * exponent e. 1871: * 1872: * expout = expin ^ e mod (p*q). 1873: * We form this by evaluating 1874: * p2 = (expin ^ e) mod p and 1875: * q2 = (expin ^ e) mod q 1876: * and then combining the two by the CRT. 1877: * 1878: * Two optimisations of this are possible. First, we can reduce expin 1879: * modulo p and q before starting. 1880: * 1881: * Second, since we know the factorisation of p and q (trivially derived 1882: * from the factorisation of n = p*q), and expin is relatively prime to 1883: * both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m), 1884: * to throw away multiples of phi(p) or phi(q) in e. 1885: * Letting ep = e mod phi(p) and 1886: * eq = e mod phi(q) 1887: * then combining these two speedups, we only need to evaluate 1888: * p2 = ((expin mod p) ^ ep) mod p and 1889: * q2 = ((expin mod q) ^ eq) mod q. 1890: * 1891: * Now we need to apply the CRT. Starting with 1892: * expout = p2 (mod p) and 1893: * expout = q2 (mod q) 1894: * we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p, 1895: * then 0 <= expout < p*q for some 0 <= k < q. Since we want expout = q2 1896: * (mod q), then p*k = q2-p2 (mod q). Since p and q are relatively prime, 1897: * p has a multiplicative inverse u mod q. In other words, u = 1/p (mod q). 1898: * 1899: * Multiplying by u on both sides gives k = u*(q2-p2) (mod q). 1900: * Since we want 0 <= k < q, we can thus find k as 1901: * k = (u * (q2-p2)) mod q. 1902: * 1903: * Once we have k, evaluating p2 + p * k is easy, and 1904: * that gives us the result. 1905: * 1906: * In the detailed implementation, there is a temporary, temp, used to 1907: * hold intermediate results, p2 is held in expout, and q2 is used as a 1908: * temporary in the final step when it is no longer needed. With that, 1909: * you should be able to understand the code below. 1910: */ 1.1.1.7 ! root 1911: int mp_modexp_crt(unitptr expout, unitptr expin, ! 1912: unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u) 1.1.1.6 root 1913: { 1.1.1.7 ! root 1914: unit q2[MAX_UNIT_PRECISION]; ! 1915: unit temp[MAX_UNIT_PRECISION]; ! 1916: int status; ! 1917: ! 1918: /* First, compiute p2 (physically held in M) */ ! 1919: ! 1920: /* p2 = [ (expin mod p) ^ ep ] mod p */ ! 1921: mp_mod(temp, expin, p); /* temp = expin mod p */ ! 1922: status = mp_modexp(expout, temp, ep, p); ! 1923: if (status < 0) { /* mp_modexp returned an error. */ ! 1924: mp_init(expout, 1); ! 1925: return status; /* error return */ ! 1926: } ! 1927: /* And the same thing for q2 */ ! 1928: ! 1929: /* q2 = [ (expin mod q) ^ eq ] mod q */ ! 1930: mp_mod(temp, expin, q); /* temp = expin mod q */ ! 1931: status = mp_modexp(q2, temp, eq, q); ! 1932: if (status < 0) { /* mp_modexp returned an error. */ ! 1933: mp_init(expout, 1); ! 1934: return status; /* error return */ ! 1935: } ! 1936: /* Now use the multiplicative inverse u to glue together the ! 1937: two halves. ! 1938: */ 1.1.1.6 root 1939: #if 0 1940: /* This optimisation is useful if you commonly get small results, 1941: but for random results and large q, the odds of (1/q) of it 1942: being useful do not warrant the test. 1.1.1.7 ! root 1943: */ ! 1944: if (mp_compare(expout, q2) != 0) { 1.1.1.6 root 1945: #endif 1.1.1.7 ! root 1946: /* Find q2-p2 mod q */ ! 1947: if (mp_sub(q2, expout)) /* if the result went negative */ ! 1948: mp_add(q2, q); /* add q to q2 */ ! 1949: ! 1950: /* expout = p2 + ( p * [(q2*u) mod q] ) */ ! 1951: mp_mult(temp, q2, u); /* q2*u */ ! 1952: mp_mod(q2, temp, q); /* (q2*u) mod q */ ! 1953: mp_mult(temp, p, q2); /* p * [(q2*u) mod q] */ ! 1954: mp_add(expout, temp); /* expout = p2 + p * [...] */ 1.1.1.6 root 1955: #if 0 1.1.1.7 ! root 1956: } 1.1.1.6 root 1957: #endif 1958: 1.1.1.7 ! root 1959: mp_burn(q2); /* burn the evidence on the stack... */ ! 1960: mp_burn(temp); ! 1961: return 0; /* normal return */ ! 1962: } /* mp_modexp_crt */ 1.1.1.6 root 1963: 1964: 1965: /****************** end of MPI library ****************************/
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.