|
|
1.1.1.8 ! root 1: /* C source code for multiprecision arithmetic library routines. ! 2: Implemented Nov 86 by Philip Zimmermann ! 3: Last revised 27 Nov 91 by PRZ ! 4: ! 5: Boulder Software Engineering ! 6: 3021 Eleventh Street ! 7: Boulder, CO 80304 ! 8: (303) 541-0140 ! 9: ! 10: (c) Copyright 1986-1996 by Philip Zimmermann. All rights reserved. ! 11: The author assumes no liability for damages resulting from the use ! 12: of this software, even if the damage results from defects in this ! 13: software. No warranty is expressed or implied. The use of this ! 14: cryptographic software for developing weapon systems is expressly ! 15: forbidden. ! 16: ! 17: ! 18: These routines implement all of the multiprecision arithmetic ! 19: necessary for number-theoretic cryptographic algorithms such as ! 20: ElGamal, Diffie-Hellman, Rabin, or factoring studies for large ! 21: composite numbers, as well as Rivest-Shamir-Adleman (RSA) public ! 22: key cryptography. ! 23: ! 24: Although originally developed in Microsoft C for the IBM PC, this code ! 25: contains few machine dependencies. It assumes 2's complement ! 26: arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines, ! 27: lowbyte-highbyte order or highbyte-lowbyte order. This version ! 28: has been converted to ANSI C. ! 29: ! 30: ! 31: The internal representation for these extended precision integer ! 32: "registers" is an array of "units". A unit is a machine word, which ! 33: is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit ! 34: unsigned integer, depending on the machine's word size. For example, ! 35: an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic ! 36: on these huge precision integers, we pass pointers to these unit ! 37: arrays to various subroutines. A pointer to an array of units is of ! 38: type unitptr. This is a pointer to a huge integer "register". ! 39: ! 40: When calling a subroutine, we always pass a pointer to the BEGINNING ! 41: of the array of units, regardless of the byte order of the machine. ! 42: On a lowbyte-first machine, such as the Intel 80x86, this unitptr ! 43: points to the LEAST significant unit, and the array of units increases ! 44: significance to the right. On a highbyte-first machine, such as the ! 45: Motorola 680x0, this unitptr points to the MOST significant unit, and ! 46: the array of units decreases significance to the right. ! 47: ! 48: Modified 8 Apr 92 - HAJK ! 49: Implement new VAX/VMS primitive support. ! 50: ! 51: Modified 30 Sep 92 -Castor Fu <[email protected]> ! 52: Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long) ! 53: ! 54: Modified 28 Nov 92 - Thad Smith ! 55: Added Smith modmult, generalized non-portable support. ! 56: */ ! 57: ! 58: /* #define COUNTMULTS *//* count modmults for performance studies */ ! 59: ! 60: #ifdef DEBUG ! 61: #if defined(MSDOS) || defined(WIN32) ! 62: #ifdef __GO32__ /* DJGPP */ ! 63: #include <pc.h> ! 64: #else ! 65: #include <conio.h> ! 66: #endif /* __GO32__ */ ! 67: #define poll_for_break() {while (kbhit()) getch();} ! 68: #endif /* MSDOS || WIN32 */ ! 69: #endif /* DEBUG */ ! 70: ! 71: #ifndef poll_for_break ! 72: #define poll_for_break() /* stub */ ! 73: #endif ! 74: ! 75: #include "mpilib.h" ! 76: ! 77: #ifdef MACTC5 ! 78: #include <stdio.h> ! 79: #include "Macutil3.h" ! 80: void upton_burn(void); ! 81: char *copyright_notice(void); ! 82: void merritt_burn(void); ! 83: #ifdef CODEWARRIOR ! 84: #define mp_modexp xxxx_mp_modexp ! 85: int mp_modexp(register unitptr expout, register unitptr expin, ! 86: register unitptr exponent, register unitptr modulus); ! 87: #endif ! 88: #endif ! 89: ! 90: #ifdef mp_smula ! 91: #ifdef mp_smul ! 92: Error:Both mp_smula and mp_smul cannot be defined. ! 93: #else ! 94: #define mp_smul mp_smula ! 95: #endif ! 96: #endif ! 97: ! 98: /* set macros for MULTUNIT */ ! 99: #ifdef MUNIT8 ! 100: #define MULTUNITSIZE 8 ! 101: typedef unsigned char MULTUNIT; ! 102: #ifdef UNIT8 ! 103: #define MULTUNIT_SIZE_SAME ! 104: #endif ! 105: #else /* not MUNIT8 */ ! 106: #ifdef MUNIT32 ! 107: #define MULTUNITSIZE 32 ! 108: typedef unsigned long MULTUNIT; ! 109: #ifdef UNIT32 ! 110: #define MULTUNIT_SIZE_SAME ! 111: #else ! 112: /* #error is not portable, this has the same effect */ ! 113: #include "UNITSIZE cannot be smaller than MULTUNITSIZE" ! 114: #endif ! 115: #else /* assume MUNIT16 */ ! 116: #define MULTUNITSIZE 16 ! 117: typedef unsigned short MULTUNIT; ! 118: #ifdef UNIT16 ! 119: #define MULTUNIT_SIZE_SAME ! 120: #endif /* UNIT16 */ ! 121: #ifdef UNIT8 ! 122: #include "UNITSIZE cannot be smaller than MULTUNITSIZE" ! 123: #endif /* UNIT8 */ ! 124: #endif /* MUNIT32 */ ! 125: #endif /* MUNIT8 */ ! 126: ! 127: #define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of ! 128: MULTUNIT */ ! 129: #define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1)) ! 130: #define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1)) ! 131: #define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE) ! 132: ! 133: ! 134: void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier); ! 135: void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier); ! 136: ! 137: #if defined(WIN32) ! 138: /* For Win32 we want this to be 32-bit, for compatibility with the assembler code */ ! 139: unsigned int global_precision = 0; /* units of precision for all routines */ ! 140: #else ! 141: short global_precision = 0; /* units of precision for all routines */ ! 142: #endif ! 143: /* global_precision is the unit precision last set by set_precision. ! 144: Initially, set_precision() should be called to define global_precision ! 145: before using any of these other multiprecision library routines. ! 146: i.e.: set_precision(MAX_UNIT_PRECISION); ! 147: */ ! 148: ! 149: /*************** multiprecision library primitives ****************/ ! 150: /* The following portable C primitives should be recoded in assembly. ! 151: The entry point name should be defined, in "mpilib.h" to the external ! 152: entry point name. If undefined, the C version will be used. ! 153: */ ! 154: ! 155: typedef unsigned long int ulint; ! 156: ! 157: #ifndef mp_addc ! 158: /* ! 159: multiprecision add with carry r2 to r1, result in r1 ! 160: carry is incoming carry flag-- value should be 0 or 1 ! 161: */ ! 162: boolean mp_addc ! 163: (register unitptr r1, register unitptr r2, register boolean carry) ! 164: { ! 165: register unit x; ! 166: short precision; /* number of units to add */ ! 167: precision = global_precision; ! 168: make_lsbptr(r1, precision); ! 169: make_lsbptr(r2, precision); ! 170: while (precision--) { ! 171: if (carry) { ! 172: x = *r1 + *r2 + 1; ! 173: carry = (*r2 >= (unit) (~*r1)); ! 174: } else { ! 175: x = *r1 + *r2; ! 176: carry = (x < *r1); ! 177: } ! 178: post_higherunit(r2); ! 179: *post_higherunit(r1) = x; ! 180: } ! 181: return carry; /* return the final carry flag bit */ ! 182: } /* mp_addc */ ! 183: #endif /* mp_addc */ ! 184: ! 185: #ifndef mp_subb ! 186: ! 187: /* ! 188: multiprecision subtract with borrow, r2 from r1, result in r1 ! 189: borrow is incoming borrow flag-- value should be 0 or 1 ! 190: */ ! 191: boolean mp_subb ! 192: (register unitptr r1, register unitptr r2, register boolean borrow) ! 193: { ! 194: register unit x; ! 195: short precision; /* number of units to subtract */ ! 196: precision = global_precision; ! 197: make_lsbptr(r1, precision); ! 198: make_lsbptr(r2, precision); ! 199: while (precision--) { ! 200: if (borrow) { ! 201: x = *r1 - *r2 - 1; ! 202: borrow = (*r1 <= *r2); ! 203: } else { ! 204: x = *r1 - *r2; ! 205: borrow = (*r1 < *r2); ! 206: } ! 207: post_higherunit(r2); ! 208: *post_higherunit(r1) = x; ! 209: } ! 210: return borrow; /* return the final carry/borrow flag bit */ ! 211: } /* mp_subb */ ! 212: #endif /* mp_subb */ ! 213: ! 214: #ifndef mp_rotate_left ! 215: ! 216: /* ! 217: multiprecision rotate left 1 bit with carry, result in r1. ! 218: carry is incoming carry flag-- value should be 0 or 1 ! 219: */ ! 220: boolean mp_rotate_left(register unitptr r1, register boolean carry) ! 221: { ! 222: register int precision; /* number of units to rotate */ ! 223: unsigned int mcarry = carry, nextcarry; /* int is supposed to be ! 224: * the efficient size for ops*/ ! 225: precision = global_precision; ! 226: make_lsbptr(r1, precision); ! 227: while (precision--) { ! 228: nextcarry = (((signedunit) * r1) < 0); ! 229: *r1 = (*r1 << 1) | mcarry; ! 230: mcarry = nextcarry; ! 231: pre_higherunit(r1); ! 232: } ! 233: return nextcarry; /* return the final carry flag bit */ ! 234: } /* mp_rotate_left */ ! 235: #endif /* mp_rotate_left */ ! 236: ! 237: /************** end of primitives that should be in assembly *************/ ! 238: ! 239: /* The mp_shift_right_bits function is not called in any time-critical ! 240: situations in public-key cryptographic functions, so it doesn't ! 241: need to be coded in assembly language. ! 242: */ ! 243: ! 244: /* ! 245: multiprecision shift right bits, result in r1. ! 246: bits is how many bits to shift, must be <= UNITSIZE. ! 247: */ ! 248: void mp_shift_right_bits(register unitptr r1, register short bits) ! 249: { ! 250: register short precision; /* number of units to shift */ ! 251: register unit carry, nextcarry, bitmask; ! 252: register short unbits; ! 253: if (bits == 0) ! 254: return; /* shift zero bits is a no-op */ ! 255: carry = 0; ! 256: bitmask = power_of_2(bits) - 1; ! 257: unbits = UNITSIZE - bits; /* shift bits must be <= UNITSIZE */ ! 258: precision = global_precision; ! 259: make_msbptr(r1, precision); ! 260: if (bits == UNITSIZE) { ! 261: while (precision--) { ! 262: nextcarry = *r1; ! 263: *r1 = carry; ! 264: carry = nextcarry; ! 265: pre_lowerunit(r1); ! 266: } ! 267: } else { ! 268: while (precision--) { ! 269: nextcarry = *r1 & bitmask; ! 270: *r1 >>= bits; ! 271: *r1 |= carry << unbits; ! 272: carry = nextcarry; ! 273: pre_lowerunit(r1); ! 274: } ! 275: } ! 276: } /* mp_shift_right_bits */ ! 277: ! 278: #ifndef mp_compare ! 279: ! 280: /* ! 281: * Compares multiprecision integers *r1, *r2, and returns: ! 282: * -1 iff *r1 < *r2 ! 283: * 0 iff *r1 == *r2 ! 284: * +1 iff *r1 > *r2 ! 285: */ ! 286: short mp_compare(register unitptr r1, register unitptr r2) ! 287: { ! 288: register short precision; /* number of units to compare */ ! 289: ! 290: precision = global_precision; ! 291: make_msbptr(r1, precision); ! 292: make_msbptr(r2, precision); ! 293: do { ! 294: if (*r1 < *r2) ! 295: return -1; ! 296: if (*post_lowerunit(r1) > *post_lowerunit(r2)) ! 297: return 1; ! 298: } while (--precision); ! 299: return 0; /* *r1 == *r2 */ ! 300: } /* mp_compare */ ! 301: #endif /* mp_compare */ ! 302: ! 303: /* Increment multiprecision integer r. */ ! 304: boolean mp_inc(register unitptr r) ! 305: { ! 306: register short precision; ! 307: precision = global_precision; ! 308: make_lsbptr(r, precision); ! 309: do { ! 310: if (++(*r)) ! 311: return 0; /* no carry */ ! 312: post_higherunit(r); ! 313: } while (--precision); ! 314: return 1; /* return carry set */ ! 315: } /* mp_inc */ ! 316: ! 317: /* Decrement multiprecision integer r. */ ! 318: boolean mp_dec(register unitptr r) ! 319: { ! 320: register short precision; ! 321: precision = global_precision; ! 322: make_lsbptr(r, precision); ! 323: do { ! 324: if ((signedunit) (--(*r)) != (signedunit) - 1) ! 325: return 0; /* no borrow */ ! 326: post_higherunit(r); ! 327: } while (--precision); ! 328: return 1; /* return borrow set */ ! 329: } /* mp_dec */ ! 330: ! 331: /* Compute 2's complement, the arithmetic negative, of r */ ! 332: void mp_neg(register unitptr r) ! 333: { ! 334: register short precision; /* number of units to negate */ ! 335: precision = global_precision; ! 336: mp_dec(r); /* 2's complement is 1's complement plus 1 */ ! 337: do { /* now do 1's complement */ ! 338: *r = ~(*r); ! 339: r++; ! 340: } while (--precision); ! 341: } /* mp_neg */ ! 342: ! 343: #ifndef mp_move ! 344: ! 345: void mp_move(register unitptr dst, register unitptr src) ! 346: { ! 347: register short precision; /* number of units to move */ ! 348: precision = global_precision; ! 349: do { ! 350: *dst++ = *src++; ! 351: } while (--precision); ! 352: } /* mp_move */ ! 353: #endif /* mp_move */ ! 354: ! 355: /* Init multiprecision register r with short value. */ ! 356: void mp_init(register unitptr r, word16 value) ! 357: { /* Note that mp_init doesn't extend sign bit for >32767 */ ! 358: ! 359: unitfill0(r, global_precision); ! 360: make_lsbptr(r, global_precision); ! 361: *post_higherunit(r) = value; ! 362: #ifdef UNIT8 ! 363: *post_higherunit(r) = value >> UNITSIZE; ! 364: #endif ! 365: } /* mp_init */ ! 366: ! 367: /* Returns number of significant units in r */ ! 368: short significance(register unitptr r) ! 369: { ! 370: register short precision; ! 371: precision = global_precision; ! 372: make_msbptr(r, precision); ! 373: do { ! 374: if (*post_lowerunit(r)) ! 375: return precision; ! 376: } while (--precision); ! 377: return precision; ! 378: } /* significance */ ! 379: ! 380: ! 381: #ifndef unitfill0 ! 382: ! 383: /* Zero-fill the unit buffer r. */ ! 384: void unitfill0(unitptr r, word16 unitcount) ! 385: { ! 386: while (unitcount--) ! 387: *r++ = 0; ! 388: } /* unitfill0 */ ! 389: #endif /* unitfill0 */ ! 390: ! 391: /* Unsigned divide, treats both operands as positive. */ ! 392: int mp_udiv(register unitptr remainder, register unitptr quotient, ! 393: register unitptr dividend, register unitptr divisor) ! 394: { ! 395: int bits; ! 396: short dprec; ! 397: register unit bitmask; ! 398: ! 399: if (testeq(divisor, 0)) ! 400: return -1; /* zero divisor means divide error */ ! 401: mp_init0(remainder); ! 402: mp_init0(quotient); ! 403: /* normalize and compute number of bits in dividend first */ ! 404: init_bitsniffer(dividend, bitmask, dprec, bits); ! 405: /* rescale quotient to same precision (dprec) as dividend */ ! 406: rescale(quotient, global_precision, dprec); ! 407: make_msbptr(quotient, dprec); ! 408: ! 409: while (bits--) { ! 410: mp_rotate_left(remainder, ! 411: (boolean) (sniff_bit(dividend, bitmask) != 0)); ! 412: if (mp_compare(remainder, divisor) >= 0) { ! 413: mp_sub(remainder, divisor); ! 414: stuff_bit(quotient, bitmask); ! 415: } ! 416: bump_2bitsniffers(dividend, quotient, bitmask); ! 417: } ! 418: return 0; ! 419: } /* mp_udiv */ ! 420: ! 421: #ifdef UPTON_OR_SMITH ! 422: ! 423: #define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */ ! 424: ! 425: /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */ ! 426: int mp_recip(register unitptr quotient, register unitptr divisor) ! 427: { ! 428: int bits; ! 429: short qprec; ! 430: register unit bitmask; ! 431: unit remainder[MAX_UNIT_PRECISION]; ! 432: if (testeq(divisor, 0)) ! 433: return -1; /* zero divisor means divide error */ ! 434: mp_init0(remainder); ! 435: mp_init0(quotient); ! 436: ! 437: /* normalize and compute number of bits in quotient first */ ! 438: bits = countbits(divisor) + RECIPMARGIN; ! 439: bitmask = bitmsk(bits); /* bitmask within a single unit */ ! 440: qprec = bits2units(bits + 1); ! 441: mp_setbit(remainder, (bits - RECIPMARGIN) - 1); ! 442: /* rescale quotient to precision of divisor + RECIPMARGIN bits */ ! 443: rescale(quotient, global_precision, qprec); ! 444: make_msbptr(quotient, qprec); ! 445: ! 446: while (bits--) { ! 447: mp_shift_left(remainder); ! 448: if (mp_compare(remainder, divisor) >= 0) { ! 449: mp_sub(remainder, divisor); ! 450: stuff_bit(quotient, bitmask); ! 451: } ! 452: bump_bitsniffer(quotient, bitmask); ! 453: } ! 454: mp_init0(remainder); /* burn sensitive data left on stack */ ! 455: return 0; ! 456: } /* mp_recip */ ! 457: #endif /* UPTON_OR_SMITH */ ! 458: ! 459: /* Signed divide, either or both operands may be negative. */ ! 460: int mp_div(register unitptr remainder, register unitptr quotient, ! 461: register unitptr dividend, register unitptr divisor) ! 462: { ! 463: boolean dvdsign, dsign; ! 464: int status; ! 465: dvdsign = (boolean) (mp_tstminus(dividend) != 0); ! 466: dsign = (boolean) (mp_tstminus(divisor) != 0); ! 467: if (dvdsign) ! 468: mp_neg(dividend); ! 469: if (dsign) ! 470: mp_neg(divisor); ! 471: status = mp_udiv(remainder, quotient, dividend, divisor); ! 472: if (dvdsign) ! 473: mp_neg(dividend); /* repair caller's dividend */ ! 474: if (dsign) ! 475: mp_neg(divisor); /* repair caller's divisor */ ! 476: if (status < 0) ! 477: return status; /* divide error? */ ! 478: if (dvdsign) ! 479: mp_neg(remainder); ! 480: if (dvdsign ^ dsign) ! 481: mp_neg(quotient); ! 482: return status; ! 483: } /* mp_div */ ! 484: ! 485: /* ! 486: * This function does a fast divide and mod on a multiprecision dividend ! 487: * using a short integer divisor returning a short integer remainder. ! 488: * This is an unsigned divide. It treats both operands as positive. ! 489: * It is used mainly for faster printing of large numbers in base 10. ! 490: */ ! 491: word16 mp_shortdiv(register unitptr quotient, ! 492: register unitptr dividend, register word16 divisor) ! 493: { ! 494: int bits; ! 495: short dprec; ! 496: register unit bitmask; ! 497: register word16 remainder; ! 498: if (!divisor) /* if divisor == 0 */ ! 499: return -1; /* zero divisor means divide error */ ! 500: remainder = 0; ! 501: mp_init0(quotient); ! 502: /* normalize and compute number of bits in dividend first */ ! 503: init_bitsniffer(dividend, bitmask, dprec, bits); ! 504: /* rescale quotient to same precision (dprec) as dividend */ ! 505: rescale(quotient, global_precision, dprec); ! 506: make_msbptr(quotient, dprec); ! 507: ! 508: while (bits--) { ! 509: remainder <<= 1; ! 510: if (sniff_bit(dividend, bitmask)) ! 511: remainder++; ! 512: if (remainder >= divisor) { ! 513: remainder -= divisor; ! 514: stuff_bit(quotient, bitmask); ! 515: } ! 516: bump_2bitsniffers(dividend, quotient, bitmask); ! 517: } ! 518: return remainder; ! 519: } /* mp_shortdiv */ ! 520: ! 521: /* Unsigned divide, treats both operands as positive. */ ! 522: int mp_mod(register unitptr remainder, ! 523: register unitptr dividend, register unitptr divisor) ! 524: { ! 525: int bits; ! 526: short dprec; ! 527: register unit bitmask; ! 528: if (testeq(divisor, 0)) ! 529: return -1; /* zero divisor means divide error */ ! 530: mp_init0(remainder); ! 531: /* normalize and compute number of bits in dividend first */ ! 532: init_bitsniffer(dividend, bitmask, dprec, bits); ! 533: ! 534: while (bits--) { ! 535: mp_rotate_left(remainder, ! 536: (boolean) (sniff_bit(dividend, bitmask) != 0)); ! 537: msub(remainder, divisor); ! 538: bump_bitsniffer(dividend, bitmask); ! 539: } ! 540: return 0; ! 541: } /* mp_mod */ ! 542: ! 543: /* ! 544: * This function does a fast mod operation on a multiprecision dividend ! 545: * using a short integer modulus returning a short integer remainder. ! 546: * This is an unsigned divide. It treats both operands as positive. ! 547: * It is used mainly for fast sieve searches for large primes. ! 548: */ ! 549: word16 mp_shortmod(register unitptr dividend, register word16 divisor) ! 550: { ! 551: int bits; ! 552: short dprec; ! 553: register unit bitmask; ! 554: register word16 remainder; ! 555: if (!divisor) /* if divisor == 0 */ ! 556: return -1; /* zero divisor means divide error */ ! 557: remainder = 0; ! 558: /* normalize and compute number of bits in dividend first */ ! 559: init_bitsniffer(dividend, bitmask, dprec, bits); ! 560: ! 561: while (bits--) { ! 562: remainder <<= 1; ! 563: if (sniff_bit(dividend, bitmask)) ! 564: remainder++; ! 565: if (remainder >= divisor) ! 566: remainder -= divisor; ! 567: bump_bitsniffer(dividend, bitmask); ! 568: } ! 569: return remainder; ! 570: } /* mp_shortmod */ ! 571: ! 572: ! 573: ! 574: #ifdef COMB_MULT /* use faster "comb" multiply algorithm */ ! 575: /* We are skipping this code because it has a bug... */ ! 576: ! 577: /* ! 578: * Computes multiprecision prod = multiplicand * multiplier ! 579: * ! 580: * Uses interleaved comb multiply algorithm. ! 581: * This improved multiply more than twice as fast as a Russian ! 582: * peasant multiply, because it does a lot fewer shifts. ! 583: * Must have global_precision set to the size of the multiplicand ! 584: * plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum ! 585: * of the lengths of the multiplier and multiplicand. ! 586: * ! 587: * BUG ALERT: Unfortunately, this code has a bug. It fails for ! 588: * some numbers. One such example: ! 589: * x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 ! 590: * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D ! 591: * 52C8 CDC7 AFB3 61C8 243C 741B ! 592: * --which is obviously wrong. The answer should be: ! 593: * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A ! 594: * C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 ! 595: * We'll have to fix this some day. In the meantime, we'll ! 596: * just have the compiler skip over this code. ! 597: * ! 598: * BUG NOTE: Possibly fixed. Needs testing. ! 599: */ ! 600: int mp_mult(register unitptr prod, ! 601: register unitptr multiplicand, register unitptr multiplier) ! 602: { ! 603: int bits; ! 604: register unit bitmask; ! 605: unitptr product, mplier, temp; ! 606: short mprec, mprec2; ! 607: unit mplicand[MAX_UNIT_PRECISION]; ! 608: ! 609: /* better clear full width--double precision */ ! 610: mp_init(prod + tohigher(global_precision), 0); ! 611: ! 612: if (testeq(multiplicand, 0)) ! 613: return 0; /* zero multiplicand means zero product */ ! 614: ! 615: mp_move(mplicand, multiplicand); /* save it from damage */ ! 616: ! 617: normalize(multiplier, mprec); ! 618: if (!mprec) ! 619: return 0; ! 620: ! 621: make_lsbptr(multiplier, mprec); ! 622: bitmask = 1; /* start scan at LSB of multiplier */ ! 623: ! 624: do { /* UNITSIZE times */ ! 625: /* do for bits 0-15 */ ! 626: product = prod; ! 627: mplier = multiplier; ! 628: mprec2 = mprec; ! 629: while (mprec2--) { /* do for each word in multiplier */ ! 630: ! 631: if (sniff_bit(mplier, bitmask)) { ! 632: if (mp_addc(product, multiplicand, 0)) { /* ripple carry */ ! 633: /* After 1st time thru, this is rarely encountered. */ ! 634: temp = msbptr(product, global_precision); ! 635: pre_higherunit(temp); ! 636: /* temp now points to LSU of carry region. */ ! 637: unmake_lsbptr(temp, global_precision); ! 638: mp_inc(temp); ! 639: } /* ripple carry */ ! 640: } ! 641: pre_higherunit(mplier); ! 642: pre_higherunit(product); ! 643: } ! 644: if (!(bitmask <<= 1)) ! 645: break; ! 646: mp_shift_left(multiplicand); ! 647: ! 648: } while (TRUE); ! 649: ! 650: mp_move(multiplicand, mplicand); /* recover */ ! 651: ! 652: return 0; /* normal return */ ! 653: } /* mp_mult */ ! 654: ! 655: #endif /* COMB_MULT */ ! 656: ! 657: ! 658: /* Because the "comb" multiply has a bug, we will use the slower ! 659: Russian peasant multiply instead. Fortunately, the mp_mult ! 660: function is not called from any time-critical code. ! 661: */ ! 662: ! 663: /* ! 664: * Computes multiprecision prod = multiplicand * multiplier ! 665: * Uses "Russian peasant" multiply algorithm. ! 666: */ ! 667: int mp_mult(register unitptr prod, ! 668: register unitptr multiplicand, register unitptr multiplier) ! 669: { ! 670: int bits; ! 671: register unit bitmask; ! 672: short mprec; ! 673: mp_init(prod, 0); ! 674: if (testeq(multiplicand, 0)) ! 675: return 0; /* zero multiplicand means zero product */ ! 676: /* normalize and compute number of bits in multiplier first */ ! 677: init_bitsniffer(multiplier, bitmask, mprec, bits); ! 678: ! 679: while (bits--) { ! 680: mp_shift_left(prod); ! 681: if (sniff_bit(multiplier, bitmask)) ! 682: mp_add(prod, multiplicand); ! 683: bump_bitsniffer(multiplier, bitmask); ! 684: } ! 685: return 0; ! 686: } /* mp_mult */ ! 687: ! 688: /* ! 689: * mp_modmult computes a multiprecision multiply combined with a ! 690: * modulo operation. This is the most time-critical function in ! 691: * this multiprecision arithmetic library for performing modulo ! 692: * exponentiation. We experimented with different versions of modmult, ! 693: * depending on the machine architecture and performance requirements. ! 694: * We will either use a Russian Peasant modmult (peasant_modmult), ! 695: * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's ! 696: * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult). ! 697: * On machines with a hardware atomic multiply instruction, ! 698: * Smith's modmult is fastest. It can utilize assembly subroutines to ! 699: * speed up the hardware multiply logic and trial quotient calculation. ! 700: * If the machine lacks a fast hardware multiply, Merritt's modmult ! 701: * is preferred, which doesn't call any assembly multiply routine. ! 702: * We use the alias names mp_modmult, stage_modulus, and modmult_burn ! 703: * for the corresponding true names, which depend on what flavor of ! 704: * modmult we are using. ! 705: * ! 706: * Before making the first call to mp_modmult, you must set up the ! 707: * modulus-dependant precomputated tables by calling stage_modulus. ! 708: * After making all the calls to mp_modmult, you call modmult_burn to ! 709: * erase the tables created by stage_modulus that were left in memory. ! 710: */ ! 711: ! 712: #ifdef COUNTMULTS ! 713: /* "number of modmults" counters, used for performance studies. */ ! 714: static unsigned int tally_modmults = 0; ! 715: static unsigned int tally_modsquares = 0; ! 716: #endif /* COUNTMULTS */ ! 717: ! 718: ! 719: #ifdef PEASANT ! 720: /* Conventional Russian peasant multiply with modulo algorithm. */ ! 721: ! 722: static unitptr pmodulus = 0; /* used only by mp_modmult */ ! 723: ! 724: /* ! 725: * Must pass modulus to stage_modulus before calling modmult. ! 726: * Assumes that global_precision has already been adjusted to the ! 727: * size of the modulus, plus SLOP_BITS. ! 728: */ ! 729: int stage_peasant_modulus(unitptr n) ! 730: { /* For this simple version of modmult, just copy unit pointer. */ ! 731: pmodulus = n; ! 732: return 0; /* normal return */ ! 733: } /* stage_peasant_modulus */ ! 734: ! 735: /* "Russian peasant" multiply algorithm, combined with a modulo ! 736: * operation. This is a simple naive replacement for Merritt's ! 737: * faster modmult algorithm. References global unitptr "modulus". ! 738: * Computes: prod = (multiplicand*multiplier) mod modulus ! 739: * WARNING: All the arguments must be less than the modulus! ! 740: */ ! 741: int peasant_modmult(register unitptr prod, ! 742: unitptr multiplicand, register unitptr multiplier) ! 743: { ! 744: int bits; ! 745: register unit bitmask; ! 746: short mprec; ! 747: mp_init(prod, 0); ! 748: /* if (testeq(multiplicand,0)) ! 749: return 0; *//* zero multiplicand means zero product */ ! 750: /* normalize and compute number of bits in multiplier first */ ! 751: init_bitsniffer(multiplier, bitmask, mprec, bits); ! 752: ! 753: while (bits--) { ! 754: mp_shift_left(prod); ! 755: msub(prod, pmodulus); /* turns mult into modmult */ ! 756: if (sniff_bit(multiplier, bitmask)) { ! 757: mp_add(prod, multiplicand); ! 758: msub(prod, pmodulus); /* turns mult into modmult */ ! 759: } ! 760: bump_bitsniffer(multiplier, bitmask); ! 761: } ! 762: return 0; ! 763: } /* peasant_modmult */ ! 764: ! 765: /* ! 766: * If we are using a version of mp_modmult that uses internal tables ! 767: * in memory, we have to call modmult_burn() at the end of mp_modexp. ! 768: * This is so that no sensitive data is left in memory after the program ! 769: * exits. The Russian peasant method doesn't use any such tables. ! 770: */ ! 771: ! 772: /* ! 773: * Alias for modmult_burn, called only from mp_modexp(). Destroys ! 774: * internal modmult tables. This version does nothing because no ! 775: * tables are used by the Russian peasant modmult. ! 776: */ ! 777: void peasant_burn(void) ! 778: { ! 779: } /* peasant_burn */ ! 780: ! 781: #endif /* PEASANT */ ! 782: ! 783: ! 784: #ifdef MERRITT ! 785: /*=========================================================================*/ ! 786: /* ! 787: This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ. ! 788: Also refined by Zhahai Stewart to reduce the number of subtracts. ! 789: Modified by Raymond Brand to reduce the number of SLOP_BITS by 1. ! 790: It performs a multiply combined with a modulo operation, without ! 791: going into "double precision". It is faster than the Russian peasant ! 792: method, and still works well on machines that lack a fast hardware ! 793: multiply instruction. ! 794: */ ! 795: ! 796: /* The following support functions, macros, and data structures ! 797: are used only by Merritt's modmult algorithm... */ ! 798: ! 799: /* Shift r1 1 whole unit to the left. Used only by modmult function. */ ! 800: static void mp_lshift_unit(register unitptr r1) ! 801: { ! 802: register short precision; ! 803: register unitptr r2; ! 804: precision = global_precision; ! 805: make_msbptr(r1, precision); ! 806: r2 = r1; ! 807: while (--precision) ! 808: *post_lowerunit(r1) = *pre_lowerunit(r2); ! 809: *r1 = 0; ! 810: } /* mp_lshift_unit */ ! 811: ! 812: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */ ! 813: static unit moduli_buf[UNITSIZE - 1][MAX_UNIT_PRECISION] = ! 814: {0}; ! 815: static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */ ! 816: {0, moduli_buf[0], moduli_buf[1], moduli_buf[2], ! 817: moduli_buf[3], moduli_buf[4], moduli_buf[5], moduli_buf[6] ! 818: #ifndef UNIT8 ! 819: ,moduli_buf[7], moduli_buf[8], moduli_buf[9], moduli_buf[10], ! 820: moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14] ! 821: #ifndef UNIT16 /* and not UNIT8 */ ! 822: ,moduli_buf[15], moduli_buf[16], moduli_buf[17], moduli_buf[18], ! 823: moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22], ! 824: moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26], ! 825: moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30] ! 826: #endif /* UNIT16 and UNIT8 not defined */ ! 827: #endif /* UNIT8 not defined */ ! 828: }; ! 829: ! 830: /* ! 831: * To optimize msubs, need following 2 unit arrays, each filled ! 832: * with the most significant unit and next-to-most significant unit ! 833: * of the preshifted images of the modulus. ! 834: */ ! 835: static unit msu_moduli[UNITSIZE] = ! 836: {0}; /* most signif. unit */ ! 837: static unit nmsu_moduli[UNITSIZE] = ! 838: {0}; /* next-most signif. unit */ ! 839: ! 840: /* ! 841: * mpdbuf contains preshifted images of the multiplicand, mod n. ! 842: * It is used only by mp_modmult. It could be staticly declared ! 843: * inside of mp_modmult, but we put it outside mp_modmult so that ! 844: * it can be wiped clean by modmult_burn(), which is called at the ! 845: * end of mp_modexp. This is so that no sensitive data is left in ! 846: * memory after the program exits. ! 847: */ ! 848: static unit mpdbuf[UNITSIZE - 1][MAX_UNIT_PRECISION] = ! 849: {0}; ! 850: ! 851: /* ! 852: * Computes UNITSIZE images of r, each shifted left 1 more bit. ! 853: * Used only by modmult function. ! 854: */ ! 855: static void stage_mp_images(unitptr images[UNITSIZE], unitptr r) ! 856: { ! 857: short int i; ! 858: ! 859: images[0] = r; /* no need to move the first image, just copy ptr */ ! 860: for (i = 1; i < UNITSIZE; i++) { ! 861: mp_move(images[i], images[i - 1]); ! 862: mp_shift_left(images[i]); ! 863: msub(images[i], moduli[0]); ! 864: } ! 865: } /* stage_mp_images */ ! 866: ! 867: /* ! 868: * Computes UNITSIZE images of modulus, each shifted left 1 more bit. ! 869: * Before calling mp_modmult, you must first stage the moduli images by ! 870: * calling stage_modulus. n is the pointer to the modulus. ! 871: * Assumes that global_precision has already been adjusted to the ! 872: * size of the modulus, plus SLOP_BITS. ! 873: */ ! 874: int stage_merritt_modulus(unitptr n) ! 875: { ! 876: short int i; ! 877: unitptr msu; /* ptr to most significant unit, for faster msubs */ ! 878: ! 879: moduli[0] = n; /* no need to move the first image, just copy ptr */ ! 880: ! 881: /* used by optimized msubs macro... */ ! 882: msu = msbptr(n, global_precision); /* needed by msubs */ ! 883: msu_moduli[0] = *post_lowerunit(msu); ! 884: nmsu_moduli[0] = *msu; ! 885: ! 886: for (i = 1; i < UNITSIZE; i++) { ! 887: mp_move(moduli[i], moduli[i - 1]); ! 888: mp_shift_left(moduli[i]); ! 889: ! 890: /* used by optimized msubs macro... */ ! 891: msu = msbptr(moduli[i], global_precision); /* needed by msubs */ ! 892: msu_moduli[i] = *post_lowerunit(msu); /* for faster msubs */ ! 893: nmsu_moduli[i] = *msu; ! 894: } ! 895: return 0; /* normal return */ ! 896: } /* stage_merritt_modulus */ ! 897: ! 898: /* The following macros, sniffadd and msubs, are used by modmult... */ ! 899: #define sniffadd(i) if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i]) ! 900: ! 901: /* Unoptimized msubs macro (msubs0) follows... */ ! 902: /* #define msubs0(i) msub(prod,moduli[i]) ! 903: */ ! 904: ! 905: /* ! 906: * To optimize msubs, compare the most significant units of the ! 907: * product and the shifted modulus before deciding to call the full ! 908: * mp_compare routine. Better still, compare the upper two units ! 909: * before deciding to call mp_compare. ! 910: * Optimization of msubs relies heavily on standard C left-to-right ! 911: * parsimonious evaluation of logical expressions. ! 912: */ ! 913: ! 914: /* Partially-optimized msubs macro (msubs1) follows... */ ! 915: /* #define msubs1(i) if ( \ ! 916: ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \ ! 917: (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \ ! 918: ) mp_sub(prod,moduli[i]) ! 919: */ ! 920: ! 921: /* Fully-optimized msubs macro (msubs2) follows... */ ! 922: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \ ! 923: (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \ ! 924: (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \ ! 925: mp_sub(prod,moduli[i]) ! 926: ! 927: /* ! 928: * Performs combined multiply/modulo operation. ! 929: * Computes: prod = (multiplicand*multiplier) mod modulus ! 930: * WARNING: All the arguments must be less than the modulus! ! 931: * Assumes the modulus has been predefined by first calling ! 932: * stage_modulus. ! 933: */ ! 934: int merritt_modmult(register unitptr prod, ! 935: unitptr multiplicand, register unitptr multiplier) ! 936: { ! 937: /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro... */ ! 938: register signedunit p_m; ! 939: register unitptr msu_prod; /* ptr to most significant unit of product */ ! 940: register unitptr nmsu_prod; /* next-most signif. unit of product */ ! 941: short mprec; /* precision of multiplier, in units */ ! 942: /* Array mpd contains a list of pointers to preshifted images of ! 943: the multiplicand: */ ! 944: static unitptr mpd[UNITSIZE] = ! 945: { ! 946: 0, mpdbuf[0], mpdbuf[1], mpdbuf[2], ! 947: mpdbuf[3], mpdbuf[4], mpdbuf[5], mpdbuf[6] ! 948: #ifndef UNIT8 ! 949: ,mpdbuf[7], mpdbuf[8], mpdbuf[9], mpdbuf[10], ! 950: mpdbuf[11], mpdbuf[12], mpdbuf[13], mpdbuf[14] ! 951: #ifndef UNIT16 /* and not UNIT8 */ ! 952: ,mpdbuf[15], mpdbuf[16], mpdbuf[17], mpdbuf[18], ! 953: mpdbuf[19], mpdbuf[20], mpdbuf[21], mpdbuf[22], ! 954: mpdbuf[23], mpdbuf[24], mpdbuf[25], mpdbuf[26], ! 955: mpdbuf[27], mpdbuf[28], mpdbuf[29], mpdbuf[30] ! 956: #endif /* UNIT16 and UNIT8 not defined */ ! 957: #endif /* UNIT8 not defined */ ! 958: }; ! 959: ! 960: /* Compute preshifted images of multiplicand, mod n: */ ! 961: stage_mp_images(mpd, multiplicand); ! 962: ! 963: /* To optimize msubs, set up msu_prod and nmsu_prod: */ ! 964: msu_prod = msbptr(prod, global_precision); /* Get ptr to MSU of prod */ ! 965: nmsu_prod = msu_prod; ! 966: post_lowerunit(nmsu_prod); /* Get next-MSU of prod */ ! 967: ! 968: /* ! 969: * To understand this algorithm, it would be helpful to first ! 970: * study the conventional Russian peasant modmult algorithm. ! 971: * This one does about the same thing as Russian peasant, but ! 972: * is organized differently to save some steps. It loops ! 973: * through the multiplier a word (unit) at a time, instead of ! 974: * a bit at a time. It word-shifts the product instead of ! 975: * bit-shifting it, so it should be faster. It also does about ! 976: * half as many subtracts as Russian peasant. ! 977: */ ! 978: ! 979: mp_init(prod, 0); /* Initialize product to 0. */ ! 980: ! 981: /* ! 982: * The way mp_modmult is actually used in cryptographic ! 983: * applications, there will NEVER be a zero multiplier or ! 984: * multiplicand. So there is no need to optimize for that ! 985: * condition. ! 986: */ ! 987: /* if (testeq(multiplicand,0)) ! 988: return 0; *//* zero multiplicand means zero product */ ! 989: /* Normalize and compute number of units in multiplier first: */ ! 990: normalize(multiplier, mprec); ! 991: if (mprec == 0) /* if precision of multiplier is 0 */ ! 992: return 0; /* zero multiplier means zero product */ ! 993: make_msbptr(multiplier, mprec); /* start at MSU of multiplier */ ! 994: ! 995: while (mprec--) { /* Loop for the number of units in the multiplier */ ! 996: /* ! 997: * Shift the product one whole unit to the left. ! 998: * This is part of the multiply phase of modmult. ! 999: */ ! 1000: ! 1001: mp_lshift_unit(prod); ! 1002: ! 1003: /* ! 1004: * The product may have grown by as many as UNITSIZE ! 1005: * bits. That's why we have global_precision set to the ! 1006: * size of the modulus plus UNITSIZE slop bits. ! 1007: * Now reduce the product back down by conditionally ! 1008: * subtracting the preshifted images of the modulus. ! 1009: * This is part of the modulo reduction phase of modmult. ! 1010: * The following loop is unrolled for speed, using macros... ! 1011: ! 1012: for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--) ! 1013: if (mp_compare(prod,moduli[i]) >= 0) ! 1014: mp_sub(prod,moduli[i]); ! 1015: */ ! 1016: ! 1017: #ifndef UNIT8 ! 1018: #ifndef UNIT16 /* and not UNIT8 */ ! 1019: msubs(31); ! 1020: msubs(30); ! 1021: msubs(29); ! 1022: msubs(28); ! 1023: msubs(27); ! 1024: msubs(26); ! 1025: msubs(25); ! 1026: msubs(24); ! 1027: msubs(23); ! 1028: msubs(22); ! 1029: msubs(21); ! 1030: msubs(20); ! 1031: msubs(19); ! 1032: msubs(18); ! 1033: msubs(17); ! 1034: msubs(16); ! 1035: #endif /* not UNIT16 and not UNIT8 */ ! 1036: msubs(15); ! 1037: msubs(14); ! 1038: msubs(13); ! 1039: msubs(12); ! 1040: msubs(11); ! 1041: msubs(10); ! 1042: msubs(9); ! 1043: msubs(8); ! 1044: #endif /* not UNIT8 */ ! 1045: msubs(7); ! 1046: msubs(6); ! 1047: msubs(5); ! 1048: #ifndef UNIT32 ! 1049: msubs(4); ! 1050: #ifndef UNIT16 ! 1051: msubs(3); ! 1052: #endif ! 1053: #endif ! 1054: ! 1055: /* ! 1056: * Sniff each bit in the current unit of the multiplier, ! 1057: * and conditionally add the corresponding preshifted ! 1058: * image of the multiplicand to the product. ! 1059: * This is also part of the multiply phase of modmult. ! 1060: * ! 1061: * The following loop is unrolled for speed, using macros... ! 1062: ! 1063: for (i=UNITSIZE-1; i>=0; i--) ! 1064: if (*multiplier & power_of_2(i)) ! 1065: mp_add(prod,mpd[i]); ! 1066: */ ! 1067: #ifndef UNIT8 ! 1068: #ifndef UNIT16 /* and not UNIT8 */ ! 1069: sniffadd(31); ! 1070: sniffadd(30); ! 1071: sniffadd(29); ! 1072: sniffadd(28); ! 1073: sniffadd(27); ! 1074: sniffadd(26); ! 1075: sniffadd(25); ! 1076: sniffadd(24); ! 1077: sniffadd(23); ! 1078: sniffadd(22); ! 1079: sniffadd(21); ! 1080: sniffadd(20); ! 1081: sniffadd(19); ! 1082: sniffadd(18); ! 1083: sniffadd(17); ! 1084: sniffadd(16); ! 1085: #endif /* not UNIT16 and not UNIT8 */ ! 1086: sniffadd(15); ! 1087: sniffadd(14); ! 1088: sniffadd(13); ! 1089: sniffadd(12); ! 1090: sniffadd(11); ! 1091: sniffadd(10); ! 1092: sniffadd(9); ! 1093: sniffadd(8); ! 1094: #endif /* not UNIT8 */ ! 1095: sniffadd(7); ! 1096: sniffadd(6); ! 1097: sniffadd(5); ! 1098: sniffadd(4); ! 1099: sniffadd(3); ! 1100: sniffadd(2); ! 1101: sniffadd(1); ! 1102: sniffadd(0); ! 1103: ! 1104: /* ! 1105: * The product may have grown by as many as LOG_UNITSIZE+1 ! 1106: * bits. ! 1107: * Now reduce the product back down by conditionally ! 1108: * subtracting LOG_UNITSIZE+1 preshifted images of the ! 1109: * modulus. This is the modulo reduction phase of modmult. ! 1110: * ! 1111: * The following loop is unrolled for speed, using macros... ! 1112: ! 1113: for (i=LOG_UNITSIZE; i>=0; i--) ! 1114: if (mp_compare(prod,moduli[i]) >= 0) ! 1115: mp_sub(prod,moduli[i]); ! 1116: */ ! 1117: ! 1118: #ifndef UNIT8 ! 1119: #ifndef UNIT16 ! 1120: msubs(5); ! 1121: #endif ! 1122: msubs(4); ! 1123: #endif ! 1124: msubs(3); ! 1125: msubs(2); ! 1126: msubs(1); ! 1127: msubs(0); ! 1128: ! 1129: /* Bump pointer to next lower unit of multiplier: */ ! 1130: post_lowerunit(multiplier); ! 1131: ! 1132: } /* Loop for the number of units in the ! 1133: multiplier */ ! 1134: ! 1135: return 0; /* normal return */ ! 1136: ! 1137: } /* merritt_modmult */ ! 1138: ! 1139: #undef msubs ! 1140: #undef sniffadd ! 1141: ! 1142: /* ! 1143: * Merritt's mp_modmult function leaves some internal tables in memory, ! 1144: * so we have to call modmult_burn() at the end of mp_modexp. ! 1145: * This is so that no cryptographically sensitive data is left in memory ! 1146: * after the program exits. ! 1147: */ ! 1148: ! 1149: /* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */ ! 1150: void merritt_burn(void) ! 1151: { ! 1152: unitfill0(&(mpdbuf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION); ! 1153: unitfill0(&(moduli_buf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION); ! 1154: unitfill0(msu_moduli, UNITSIZE); ! 1155: unitfill0(nmsu_moduli, UNITSIZE); ! 1156: } /* merritt_burn() */ ! 1157: ! 1158: /******* end of Merritt's MODMULT stuff. *******/ ! 1159: /*=========================================================================*/ ! 1160: #endif /* MERRITT */ ! 1161: ! 1162: ! 1163: #ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's ! 1164: modmult algorithms */ ! 1165: ! 1166: /* ! 1167: * Jimmy Upton's multiprecision modmult algorithm in C. ! 1168: * Performs a multiply combined with a modulo operation. ! 1169: * ! 1170: * The following support functions and data structures ! 1171: * are used only by Upton's modmult algorithm... ! 1172: */ ! 1173: ! 1174: short munit_prec; /* global_precision expressed in MULTUNITs */ ! 1175: ! 1176: /* ! 1177: * Note that since the SPARC CPU has no hardware integer multiply ! 1178: * instruction, there is not that much advantage in having an ! 1179: * assembly version of mp_smul on that machine. It might be faster ! 1180: * to use Merritt's modmult instead of Upton's modmult on the SPARC. ! 1181: */ ! 1182: ! 1183: /* ! 1184: * Multiply the single-word multiplier times the multiprecision integer ! 1185: * in multiplicand, accumulating result in prod. The resulting ! 1186: * multiprecision prod will be 1 word longer than the multiplicand. ! 1187: * multiplicand is munit_prec words long. We add into prod, so caller ! 1188: * should zero it out first. For best results, this time-critical ! 1189: * function should be implemented in assembly. ! 1190: * NOTE: Unlike other functions in the multiprecision arithmetic ! 1191: * library, both multiplicand and prod are pointing at the LSB, ! 1192: * regardless of byte order of the machine. On an 80x86, this makes ! 1193: * no difference. But if this assembly function is implemented ! 1194: * on a 680x0, it becomes important. ! 1195: */ ! 1196: ! 1197: /* ! 1198: * Note that this has been modified from the previous version to allow ! 1199: * better support for Smith's modmult: ! 1200: * The final carry bit is added to the existing product ! 1201: * array, rather than simply stored. ! 1202: */ ! 1203: ! 1204: #ifndef mp_smul ! 1205: void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier) ! 1206: { ! 1207: short i; ! 1208: unsigned long p, carry; ! 1209: ! 1210: carry = 0; ! 1211: for (i = 0; i < munit_prec; ++i) { ! 1212: p = (unsigned long) multiplier **post_higherunit(multiplicand); ! 1213: p += *prod + carry; ! 1214: *post_higherunit(prod) = (MULTUNIT) p; ! 1215: carry = p >> MULTUNITSIZE; ! 1216: } ! 1217: /* Add carry to the next higher word of product / dividend */ ! 1218: *prod += (MULTUNIT) carry; ! 1219: } /* mp_smul */ ! 1220: ! 1221: #endif ! 1222: ! 1223: /* ! 1224: * mp_dmul is a double-precision multiply multiplicand times multiplier, ! 1225: * result into prod. prod must be pointing at a "double precision" ! 1226: * buffer. E.g. If global_precision is 10 words, prod must be ! 1227: * pointing at a 20-word buffer. ! 1228: */ ! 1229: #ifndef mp_dmul ! 1230: void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier) ! 1231: { ! 1232: register int i; ! 1233: register MULTUNIT *p_multiplicand, *p_multiplier; ! 1234: register MULTUNIT *prodp; ! 1235: ! 1236: ! 1237: unitfill0(prod, global_precision * 2); /* Pre-zero prod */ ! 1238: /* Calculate precision in units of MULTUNIT */ ! 1239: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; ! 1240: p_multiplicand = (MULTUNIT *) multiplicand; ! 1241: p_multiplier = (MULTUNIT *) multiplier; ! 1242: prodp = (MULTUNIT *) prod; ! 1243: make_lsbptr(p_multiplicand, munit_prec); ! 1244: make_lsbptr(p_multiplier, munit_prec); ! 1245: make_lsbptr(prodp, munit_prec * 2); ! 1246: /* Multiply multiplicand by each word in multiplier, accumulating prod: */ ! 1247: for (i = 0; i < munit_prec; ++i) ! 1248: mp_smul(post_higherunit(prodp), ! 1249: p_multiplicand, *post_higherunit(p_multiplier)); ! 1250: } /* mp_dmul */ ! 1251: #endif /* mp_dmul */ ! 1252: ! 1253: static unit ALIGN modulus[MAX_UNIT_PRECISION]; ! 1254: static short nbits; /* number of modulus significant bits */ ! 1255: #endif /* UPTON_OR_SMITH */ ! 1256: ! 1257: #ifdef UPTON ! 1258: ! 1259: /* ! 1260: * These scratchpad arrays are used only by upton_modmult (mp_modmult). ! 1261: * Some of them could be staticly declared inside of mp_modmult, but we ! 1262: * put them outside mp_modmult so that they can be wiped clean by ! 1263: * modmult_burn(), which is called at the end of mp_modexp. This is ! 1264: * so that no sensitive data is left in memory after the program exits. ! 1265: */ ! 1266: ! 1267: static unit ALIGN reciprocal[MAX_UNIT_PRECISION]; ! 1268: static unit ALIGN dhi[MAX_UNIT_PRECISION]; ! 1269: static unit ALIGN d_data[MAX_UNIT_PRECISION * 2]; /* double-wide ! 1270: register */ ! 1271: static unit ALIGN e_data[MAX_UNIT_PRECISION * 2]; /* double-wide ! 1272: register */ ! 1273: static unit ALIGN f_data[MAX_UNIT_PRECISION * 2]; /* double-wide ! 1274: register */ ! 1275: ! 1276: static short nbitsDivUNITSIZE; ! 1277: static short nbitsModUNITSIZE; ! 1278: ! 1279: /* ! 1280: * stage_upton_modulus() is aliased to stage_modulus(). ! 1281: * Prepare for an Upton modmult. Calculate the reciprocal of modulus, ! 1282: * and save both. Note that reciprocal will have one more bit than ! 1283: * modulus. ! 1284: * Assumes that global_precision has already been adjusted to the ! 1285: * size of the modulus, plus SLOP_BITS. ! 1286: */ ! 1287: int stage_upton_modulus(unitptr n) ! 1288: { ! 1289: mp_move(modulus, n); ! 1290: mp_recip(reciprocal, modulus); ! 1291: nbits = countbits(modulus); ! 1292: nbitsDivUNITSIZE = nbits / UNITSIZE; ! 1293: nbitsModUNITSIZE = nbits % UNITSIZE; ! 1294: return 0; /* normal return */ ! 1295: } /* stage_upton_modulus */ ! 1296: ! 1297: ! 1298: /* ! 1299: * Upton's algorithm performs a multiply combined with a modulo operation. ! 1300: * Computes: prod = (multiplicand*multiplier) mod modulus ! 1301: * WARNING: All the arguments must be less than the modulus! ! 1302: * References global unitptr modulus and reciprocal. ! 1303: * The reciprocal of modulus is 1 bit longer than the modulus. ! 1304: * upton_modmult() is aliased to mp_modmult(). ! 1305: */ ! 1306: int upton_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) ! 1307: { ! 1308: unitptr d = d_data; ! 1309: unitptr d1 = d_data; ! 1310: unitptr e = e_data; ! 1311: unitptr f = f_data; ! 1312: short orig_precision; ! 1313: ! 1314: orig_precision = global_precision; ! 1315: mp_dmul(d, multiplicand, multiplier); ! 1316: ! 1317: /* Throw off low nbits of d */ ! 1318: #ifndef HIGHFIRST ! 1319: d1 = d + nbitsDivUNITSIZE; ! 1320: #else ! 1321: d1 = d + orig_precision - nbitsDivUNITSIZE; ! 1322: #endif ! 1323: mp_move(dhi, d1); /* Don't screw up d, we need it later */ ! 1324: mp_shift_right_bits(dhi, nbitsModUNITSIZE); ! 1325: ! 1326: mp_dmul(e, dhi, reciprocal); /* Note - reciprocal has nbits+1 bits */ ! 1327: ! 1328: #ifndef HIGHFIRST ! 1329: e += nbitsDivUNITSIZE; ! 1330: #else ! 1331: e += orig_precision - nbitsDivUNITSIZE; ! 1332: #endif ! 1333: mp_shift_right_bits(e, nbitsModUNITSIZE); ! 1334: ! 1335: mp_dmul(f, e, modulus); ! 1336: ! 1337: /* Now for the only double-precision call to mpilib: */ ! 1338: set_precision(orig_precision * 2); ! 1339: mp_sub(d, f); ! 1340: ! 1341: /* d's precision should be <= orig_precision */ ! 1342: rescale(d, orig_precision * 2, orig_precision); ! 1343: set_precision(orig_precision); ! 1344: ! 1345: /* Should never have to do this final subtract more than twice: */ ! 1346: while (mp_compare(d, modulus) > 0) ! 1347: mp_sub(d, modulus); ! 1348: ! 1349: mp_move(prod, d); ! 1350: return 0; /* normal return */ ! 1351: } /* upton_modmult */ ! 1352: ! 1353: /* Upton's mp_modmult function leaves some internal arrays in memory, ! 1354: so we have to call modmult_burn() at the end of mp_modexp. ! 1355: This is so that no cryptographically sensitive data is left in memory ! 1356: after the program exits. ! 1357: upton_burn() is aliased to modmult_burn(). ! 1358: */ ! 1359: void upton_burn(void) ! 1360: { ! 1361: unitfill0(modulus, MAX_UNIT_PRECISION); ! 1362: unitfill0(reciprocal, MAX_UNIT_PRECISION); ! 1363: unitfill0(dhi, MAX_UNIT_PRECISION); ! 1364: unitfill0(d_data, MAX_UNIT_PRECISION * 2); ! 1365: unitfill0(e_data, MAX_UNIT_PRECISION * 2); ! 1366: unitfill0(f_data, MAX_UNIT_PRECISION * 2); ! 1367: nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0; ! 1368: } /* upton_burn */ ! 1369: ! 1370: /******* end of Upton's MODMULT stuff. *******/ ! 1371: /*=========================================================================*/ ! 1372: #endif /* UPTON */ ! 1373: ! 1374: #ifdef SMITH /* using Thad Smith's modmult algorithm */ ! 1375: ! 1376: /* ! 1377: * Thad Smith's implementation of multiprecision modmult algorithm in C. ! 1378: * Performs a multiply combined with a modulo operation. ! 1379: * The multiplication is done with mp_dmul, the same as for Upton's ! 1380: * modmult. The modulus reduction is done by long division, in ! 1381: * which a trial quotient "digit" is determined, then the product of ! 1382: * that digit and the divisor are subtracted from the dividend. ! 1383: * ! 1384: * In this case, the digit is MULTUNIT in size and the subtraction ! 1385: * is done by adding the product to the one's complement of the ! 1386: * dividend, which allows use of the existing mp_smul routine. ! 1387: * ! 1388: * The following support functions and data structures ! 1389: * are used only by Smith's modmult algorithm... ! 1390: */ ! 1391: ! 1392: /* ! 1393: * These scratchpad arrays are used only by smith_modmult (mp_modmult). ! 1394: * Some of them could be statically declared inside of mp_modmult, but we ! 1395: * put them outside mp_modmult so that they can be wiped clean by ! 1396: * modmult_burn(), which is called at the end of mp_modexp. This is ! 1397: * so that no sensitive data is left in memory after the program exits. ! 1398: */ ! 1399: ! 1400: static unit ALIGN ds_data[MAX_UNIT_PRECISION * 2 + 2]; ! 1401: ! 1402: static unit mod_quotient[4]; ! 1403: static unit mod_divisor[4]; /* 2 most signif. MULTUNITs of modulus */ ! 1404: ! 1405: static MULTUNIT *modmpl; /* ptr to modulus least significant ! 1406: ** MULTUNIT */ ! 1407: #if defined(WIN32) && defined(USE_WIN32_ASSEMBLER) ! 1408: int mshift; /* number of bits for ! 1409: ** recip scaling */ ! 1410: MULTUNIT reciph; /* MSunit of scaled recip */ ! 1411: MULTUNIT recipl; /* LSunit of scaled recip */ ! 1412: #else ! 1413: static int mshift; /* number of bits for ! 1414: ** recip scaling */ ! 1415: static MULTUNIT reciph; /* MSunit of scaled recip */ ! 1416: static MULTUNIT recipl; /* LSunit of scaled recip */ ! 1417: #endif ! 1418: ! 1419: static short modlenMULTUNITS; /* length of modulus in MULTUNITs */ ! 1420: static MULTUNIT mutemp; /* temporary */ ! 1421: ! 1422: /* ! 1423: * The routines mp_smul and mp_dmul are the same as for UPTON and ! 1424: * should be coded in assembly. Note, however, that the previous ! 1425: * Upton's mp_smul version has been modified to compatible with ! 1426: * Smith's modmult. The new version also still works for Upton's ! 1427: * modmult. ! 1428: */ ! 1429: ! 1430: #ifndef mp_set_recip ! 1431: #define mp_set_recip(rh,rl,m) /* null */ ! 1432: #else ! 1433: /* setup routine for external mp_quo_digit */ ! 1434: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m); ! 1435: #endif ! 1436: MULTUNIT mp_quo_digit(MULTUNIT * dividend); ! 1437: ! 1438: #ifdef MULTUNIT_SIZE_SAME ! 1439: #define mp_musubb mp_subb /* use existing routine */ ! 1440: #else /* ! MULTUNIT_SIZE_SAME */ ! 1441: ! 1442: /* ! 1443: * This performs the same function as mp_subb, but with MULTUNITs. ! 1444: * Note: Processors without alignment requirements may be able to use ! 1445: * mp_subb, even though MULTUNITs are smaller than units. In that case, ! 1446: * use mp_subb, since it would be faster if coded in assembly. Note that ! 1447: * this implementation won't work for MULTUNITs which are long -- use ! 1448: * mp_subb in that case. ! 1449: */ ! 1450: #ifndef mp_musubb ! 1451: ! 1452: /* ! 1453: * multiprecision subtract of MULTUNITs with borrow, r2 from r1, result in r1 ! 1454: * borrow is incoming borrow flag-- value should be 0 or 1 ! 1455: */ ! 1456: boolean mp_musubb ! 1457: (register MULTUNIT * r1, register MULTUNIT * r2, register boolean borrow) ! 1458: { ! 1459: register ulint x; /* won't work if sizeof(MULTUNIT)== ! 1460: sizeof(long) */ ! 1461: short precision; /* number of MULTUNITs to subtract */ ! 1462: precision = global_precision * MULTUNITs_perunit; ! 1463: make_lsbptr(r1, precision); ! 1464: make_lsbptr(r2, precision); ! 1465: while (precision--) { ! 1466: x = (ulint) * r1 - (ulint) * post_higherunit(r2) - (ulint) borrow; ! 1467: *post_higherunit(r1) = x; ! 1468: borrow = (((1L << MULTUNITSIZE) & x) != 0L); ! 1469: } ! 1470: return (borrow); ! 1471: } /* mp_musubb */ ! 1472: #endif /* mp_musubb */ ! 1473: #endif /* MULTUNIT_SIZE_SAME */ ! 1474: ! 1475: /* ! 1476: * The function mp_quo_digit is the heart of Smith's modulo reduction, ! 1477: * which uses a form of long division. It computes a trial quotient ! 1478: * "digit" (MULTUNIT-sized digit) by multiplying the three most ! 1479: * significant MULTUNITs of the dividend by the two most significant ! 1480: * MULTUNITs of the reciprocal of the modulus. Note that this function ! 1481: * requires that MULTUNITSIZE * 2 <= sizeof(unsigned long). ! 1482: * ! 1483: * An important part of this technique is that the quotient never be ! 1484: * too small, although it may occasionally be too large. This was ! 1485: * done to eliminate the need to check and correct for a remainder ! 1486: * exceeding the divisor. It is easier to check for a negative ! 1487: * remainder. The following technique rarely needs correction for ! 1488: * MULTUNITs of at least 16 bits. ! 1489: * ! 1490: * The following routine has two implementations: ! 1491: * ! 1492: * ASM_PROTOTYPE defined: written to be an executable prototype for ! 1493: * an efficient assembly language implementation. Note that several ! 1494: * of the following masks and shifts can be done by directly ! 1495: * manipulating single precision registers on some architectures. ! 1496: * ! 1497: * ASM_PROTOTYPE undefined: a slightly more efficient implementation ! 1498: * in C. Although this version returns a result larger than the ! 1499: * optimum (which is corrected in smith_modmult) more often than the ! 1500: * prototype version, the faster execution in C more than makes up ! 1501: * for the difference. ! 1502: * ! 1503: * Parameter: dividend - points to the most significant MULTUNIT ! 1504: * of the dividend. Note that dividend actually contains the ! 1505: * one's complement of the actual dividend value (see comments for ! 1506: * smith_modmult). ! 1507: * ! 1508: * Return: the trial quotient digit resulting from dividing the first ! 1509: * three MULTUNITs at dividend by the upper two MULTUNITs of the ! 1510: * modulus. ! 1511: */ ! 1512: ! 1513: /* #define ASM_PROTOTYPE *//* undefined: use C-optimized version */ ! 1514: ! 1515: #ifndef mp_quo_digit ! 1516: MULTUNIT mp_quo_digit(MULTUNIT * dividend) ! 1517: { ! 1518: unsigned long q, q0, q1, q2; ! 1519: unsigned short lsb_factor; ! 1520: ! 1521: /* ! 1522: * Compute the least significant product group. ! 1523: * The last terms of q1 and q2 perform upward rounding, which is ! 1524: * needed to guarantee that the result not be too small. ! 1525: */ ! 1526: q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long) reciph ! 1527: + reciph; ! 1528: q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) recipl ! 1529: + (1L << MULTUNITSIZE); ! 1530: #ifdef ASM_PROTOTYPE ! 1531: lsb_factor = 1 & (q1 >> MULTUNITSIZE) & (q2 >> MULTUNITSIZE); ! 1532: q = q1 + q2; ! 1533: ! 1534: /* The following statement is equivalent to shifting the sum right ! 1535: one bit while shifting in the carry bit. ! 1536: */ ! 1537: q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); ! 1538: #else /* optimized C version */ ! 1539: q0 = (q1 >> 1) + (q2 >> 1) + 1; ! 1540: #endif ! 1541: ! 1542: /* Compute the middle significant product group. */ ! 1543: ! 1544: q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) reciph; ! 1545: q2 = (dividend[0] ^ MULTUNIT_mask) * (unsigned long) recipl; ! 1546: #ifdef ASM_PROTOTYPE ! 1547: q = q1 + q2; ! 1548: q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); ! 1549: ! 1550: /* Add in the most significant word of the first group. ! 1551: The last term takes care of the carry from adding the lsb's ! 1552: that were shifted out prior to addition. ! 1553: */ ! 1554: q = (q0 >> MULTUNITSIZE) + q + (lsb_factor & (q1 ^ q2)); ! 1555: #else /* optimized C version */ ! 1556: q = (q0 >> MULTUNITSIZE) + (q1 >> 1) + (q2 >> 1) + 1; ! 1557: #endif ! 1558: ! 1559: /* Compute the most significant term and add in the others */ ! 1560: ! 1561: q = (q >> (MULTUNITSIZE - 2)) + ! 1562: (((dividend[0] ^ MULTUNIT_mask) * (unsigned long) reciph) << 1); ! 1563: q >>= mshift; ! 1564: ! 1565: /* Prevent overflow and then wipe out the intermediate results. */ ! 1566: ! 1567: mutemp = (MULTUNIT) min(q, (1L << MULTUNITSIZE) - 1); ! 1568: q = q0 = q1 = q2 = 0; ! 1569: lsb_factor = 0; ! 1570: (void) lsb_factor; ! 1571: return mutemp; ! 1572: } ! 1573: #endif /* mp_quo_digit */ ! 1574: ! 1575: /* ! 1576: * stage_smith_modulus() - Prepare for a Smith modmult. ! 1577: * ! 1578: * Calculate the reciprocal of modulus with a precision of two MULTUNITs. ! 1579: * Assumes that global_precision has already been adjusted to the ! 1580: * size of the modulus, plus SLOP_BITS. ! 1581: * ! 1582: * Note: This routine was designed to work with large values and ! 1583: * doesn't have the necessary testing or handling to work with a ! 1584: * modulus having less than three significant units. For such cases, ! 1585: * the separate multiply and modulus routines can be used. ! 1586: * ! 1587: * stage_smith_modulus() is aliased to stage_modulus(). ! 1588: */ ! 1589: int stage_smith_modulus(unitptr n_modulus) ! 1590: { ! 1591: int original_precision; ! 1592: int sigmod; /* significant units in modulus */ ! 1593: unitptr mp; /* modulus most significant pointer */ ! 1594: MULTUNIT *mpm; /* reciprocal pointer */ ! 1595: int prec; /* precision of reciprocal calc in units */ ! 1596: ! 1597: mp_move(modulus, n_modulus); ! 1598: modmpl = (MULTUNIT *) modulus; ! 1599: modmpl = lsbptr(modmpl, global_precision * MULTUNITs_perunit); ! 1600: nbits = countbits(modulus); ! 1601: modlenMULTUNITS = (nbits + MULTUNITSIZE - 1) / MULTUNITSIZE; ! 1602: ! 1603: original_precision = global_precision; ! 1604: ! 1605: /* The following code copies the three most significant units of ! 1606: * modulus to mod_divisor. ! 1607: */ ! 1608: mp = modulus; ! 1609: sigmod = significance(modulus); ! 1610: rescale(mp, original_precision, sigmod); ! 1611: /* prec is the unit precision required for 3 MULTUNITs */ ! 1612: prec = (3 + (MULTUNITs_perunit - 1)) / MULTUNITs_perunit; ! 1613: set_precision(prec); ! 1614: ! 1615: /* set mp = ptr to most significant units of modulus, then move ! 1616: * the most significant units to mp_divisor ! 1617: */ ! 1618: mp = msbptr(mp, sigmod) - tohigher(prec - 1); ! 1619: unmake_lsbptr(mp, prec); ! 1620: mp_move(mod_divisor, mp); ! 1621: ! 1622: /* Keep 2*MULTUNITSIZE bits in mod_divisor. ! 1623: * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits. ! 1624: */ ! 1625: mshift = countbits(mod_divisor) - 2 * MULTUNITSIZE; ! 1626: mp_shift_right_bits(mod_divisor, mshift); ! 1627: mp_recip(mod_quotient, mod_divisor); ! 1628: mp_shift_right_bits(mod_quotient, 1); ! 1629: ! 1630: /* Reduce to: 0 < mshift <= MULTUNITSIZE */ ! 1631: mshift = ((mshift + (MULTUNITSIZE - 1)) % MULTUNITSIZE) + 1; ! 1632: /* round up, rescaling if necessary */ ! 1633: mp_inc(mod_quotient); ! 1634: if (countbits(mod_quotient) > 2 * MULTUNITSIZE) { ! 1635: mp_shift_right_bits(mod_quotient, 1); ! 1636: mshift--; /* now 0 <= mshift <= MULTUNITSIZE */ ! 1637: } ! 1638: mpm = lsbptr((MULTUNIT *) mod_quotient, prec * MULTUNITs_perunit); ! 1639: recipl = *post_higherunit(mpm); ! 1640: reciph = *mpm; ! 1641: mp_set_recip(reciph, recipl, mshift); ! 1642: set_precision(original_precision); ! 1643: return 0; /* normal return */ ! 1644: } /* stage_smith_modulus */ ! 1645: ! 1646: /* Smith's algorithm performs a multiply combined with a modulo operation. ! 1647: Computes: prod = (multiplicand*multiplier) mod modulus ! 1648: The modulus must first be set by stage_smith_modulus(). ! 1649: smith_modmult() is aliased to mp_modmult(). ! 1650: */ ! 1651: int smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) ! 1652: { ! 1653: unitptr d; /* ptr to product */ ! 1654: MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first) ! 1655: * aligned for subtraction */ ! 1656: /* Note that dmph points one MULTUNIT higher than indicated by ! 1657: global precision. This allows us to zero out a word one higher than ! 1658: the normal precision. ! 1659: */ ! 1660: short orig_precision; ! 1661: short nqd; /* number of quotient digits remaining to ! 1662: * be generated */ ! 1663: short dmi; /* number of significant MULTUNITs in ! 1664: product */ ! 1665: ! 1666: d = ds_data + 1; /* room for leading MSB if HIGHFIRST */ ! 1667: orig_precision = global_precision; ! 1668: mp_dmul(d, multiplicand, multiplier); ! 1669: ! 1670: rescale(d, orig_precision * 2, orig_precision * 2 + 1); ! 1671: set_precision(orig_precision * 2 + 1); ! 1672: *msbptr(d, global_precision) = 0; /* leading 0 unit */ ! 1673: ! 1674: /* We now start working with MULTUNITs. ! 1675: Determine the most significant MULTUNIT of the product so we don't ! 1676: have to process leading zeros in our divide loop. ! 1677: */ ! 1678: dmi = significance(d) * MULTUNITs_perunit; ! 1679: if (dmi >= modlenMULTUNITS) { ! 1680: /* Make dividend negative. This allows the use of mp_smul to ! 1681: * "subtract" the product of the modulus and the trial divisor ! 1682: * by actually adding to a negative dividend. ! 1683: * The one's complement of the dividend is used, since it causes ! 1684: * a zero value to be represented as all ones. This facilitates ! 1685: * testing the result for possible overflow, since a sign bit ! 1686: * indicates that no adjustment is necessary, and we should not ! 1687: * attempt to adjust if the result of the addition is zero. ! 1688: */ ! 1689: mp_inc(d); ! 1690: mp_neg(d); ! 1691: set_precision(orig_precision); ! 1692: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; ! 1693: ! 1694: /* Set msb, lsb, and normal ptrs of dividend */ ! 1695: dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) * ! 1696: MULTUNITs_perunit) + tohigher(dmi); ! 1697: nqd = dmi + 1 - modlenMULTUNITS; ! 1698: dmpl = dmph - tohigher(modlenMULTUNITS); ! 1699: ! 1700: /* ! 1701: * Divide loop. ! 1702: * Each iteration computes the next quotient MULTUNIT digit, then ! 1703: * multiplies the divisor (modulus) by the quotient digit and adds ! 1704: * it to the one's complement of the dividend (equivalent to ! 1705: * subtracting). If the product was greater than the remaining dividend, ! 1706: * we get a non-negative result, in which case we subtract off the ! 1707: * modulus to get the proper negative remainder. ! 1708: */ ! 1709: for (; nqd; nqd--) { ! 1710: MULTUNIT q; /* quotient trial digit */ ! 1711: ! 1712: q = mp_quo_digit(dmph); ! 1713: if (q > 0) { ! 1714: mp_smul(dmpl, modmpl, q); ! 1715: ! 1716: /* Perform correction if q too large. ! 1717: * This rarely occurs. ! 1718: */ ! 1719: if (!(*dmph & MULTUNIT_msb)) { ! 1720: dmp = dmpl; ! 1721: unmake_lsbptr(dmp, orig_precision * ! 1722: MULTUNITs_perunit); ! 1723: if (mp_musubb(dmp, ! 1724: (MULTUNIT *) modulus, 0)) ! 1725: (*dmph)--; ! 1726: } ! 1727: } ! 1728: pre_lowerunit(dmph); ! 1729: pre_lowerunit(dmpl); ! 1730: } ! 1731: /* d contains the one's complement of the remainder. */ ! 1732: rescale(d, orig_precision * 2 + 1, orig_precision); ! 1733: set_precision(orig_precision); ! 1734: mp_neg(d); ! 1735: mp_dec(d); ! 1736: } else { ! 1737: /* Product was less than modulus. Return it. */ ! 1738: rescale(d, orig_precision * 2 + 1, orig_precision); ! 1739: set_precision(orig_precision); ! 1740: } ! 1741: mp_move(prod, d); ! 1742: return (0); /* normal return */ ! 1743: } /* smith_modmult */ ! 1744: ! 1745: /* Smith's mp_modmult function leaves some internal arrays in memory, ! 1746: so we have to call modmult_burn() at the end of mp_modexp. ! 1747: This is so that no cryptographically sensitive data is left in memory ! 1748: after the program exits. ! 1749: smith_burn() is aliased to modmult_burn(). ! 1750: */ ! 1751: void smith_burn(void) ! 1752: { ! 1753: empty_array(modulus); ! 1754: empty_array(ds_data); ! 1755: empty_array(mod_quotient); ! 1756: empty_array(mod_divisor); ! 1757: modmpl = 0; ! 1758: mshift = nbits = 0; ! 1759: reciph = recipl = 0; ! 1760: modlenMULTUNITS = mutemp = 0; ! 1761: mp_set_recip(0, 0, 0); ! 1762: } /* smith_burn */ ! 1763: ! 1764: /* End of Thad Smith's implementation of modmult. */ ! 1765: ! 1766: #endif /* SMITH */ ! 1767: ! 1768: /* Returns number of significant bits in r */ ! 1769: int countbits(unitptr r) ! 1770: { ! 1771: int bits; ! 1772: short prec; ! 1773: register unit bitmask; ! 1774: init_bitsniffer(r, bitmask, prec, bits); ! 1775: return bits; ! 1776: } /* countbits */ ! 1777: ! 1778: char *copyright_notice(void) ! 1779: /* force linker to include copyright notice in the executable object image. */ ! 1780: { ! 1781: return ("(c)1986 Philip Zimmermann"); ! 1782: } /* copyright_notice */ ! 1783: ! 1784: /* ! 1785: * Russian peasant combined exponentiation/modulo algorithm. ! 1786: * Calls modmult instead of mult. ! 1787: * Computes: expout = (expin**exponent) mod modulus ! 1788: * WARNING: All the arguments must be less than the modulus! ! 1789: */ ! 1790: int mp_modexp(register unitptr expout, register unitptr expin, ! 1791: register unitptr exponent, register unitptr modulus) ! 1792: { ! 1793: int bits; ! 1794: short oldprecision; ! 1795: register unit bitmask; ! 1796: unit product[MAX_UNIT_PRECISION]; ! 1797: short eprec; ! 1798: ! 1799: #ifdef COUNTMULTS ! 1800: tally_modmults = 0; /* clear "number of modmults" counter */ ! 1801: tally_modsquares = 0; /* clear "number of modsquares" counter */ ! 1802: #endif /* COUNTMULTS */ ! 1803: mp_init(expout, 1); ! 1804: if (testeq(exponent, 0)) { ! 1805: if (testeq(expin, 0)) ! 1806: return -1; /* 0 to the 0th power means return error */ ! 1807: return 0; /* otherwise, zero exponent means expout ! 1808: is 1 */ ! 1809: } ! 1810: if (testeq(modulus, 0)) ! 1811: return -2; /* zero modulus means error */ ! 1812: #if SLOP_BITS > 0 /* if there's room for sign bits */ ! 1813: if (mp_tstminus(modulus)) ! 1814: return -2; /* negative modulus means error */ ! 1815: #endif /* SLOP_BITS > 0 */ ! 1816: if (mp_compare(expin, modulus) >= 0) ! 1817: return -3; /* if expin >= modulus, return error */ ! 1818: if (mp_compare(exponent, modulus) >= 0) ! 1819: return -4; /* if exponent >= modulus, return error */ ! 1820: ! 1821: oldprecision = global_precision; /* save global_precision */ ! 1822: /* set smallest optimum precision for this modulus */ ! 1823: set_precision(bits2units(countbits(modulus) + SLOP_BITS)); ! 1824: /* rescale all these registers to global_precision we just defined */ ! 1825: rescale(modulus, oldprecision, global_precision); ! 1826: rescale(expin, oldprecision, global_precision); ! 1827: rescale(exponent, oldprecision, global_precision); ! 1828: rescale(expout, oldprecision, global_precision); ! 1829: ! 1830: if (stage_modulus(modulus)) { ! 1831: set_precision(oldprecision); /* restore original precision */ ! 1832: return -5; /* unstageable modulus (STEWART algorithm) */ ! 1833: } ! 1834: /* normalize and compute number of bits in exponent first */ ! 1835: init_bitsniffer(exponent, bitmask, eprec, bits); ! 1836: ! 1837: /* We can "optimize out" the first modsquare and modmult: */ ! 1838: bits--; /* We know for sure at this point that ! 1839: bits>0 */ ! 1840: mp_move(expout, expin); /* expout = (1*1)*expin; */ ! 1841: bump_bitsniffer(exponent, bitmask); ! 1842: ! 1843: while (bits--) { ! 1844: #ifdef MACTC5 ! 1845: mac_poll_for_break(); ! 1846: #endif ! 1847: poll_for_break(); /* polls keyboard, allows ctrl-C to ! 1848: abort program */ ! 1849: #ifdef COUNTMULTS ! 1850: tally_modsquares++; /* bump "number of modsquares" counter */ ! 1851: #endif /* COUNTMULTS */ ! 1852: mp_modsquare(product, expout); ! 1853: if (sniff_bit(exponent, bitmask)) { ! 1854: mp_modmult(expout, product, expin); ! 1855: #ifdef COUNTMULTS ! 1856: tally_modmults++; /* bump "number of modmults" counter */ ! 1857: #endif /* COUNTMULTS */ ! 1858: } else { ! 1859: mp_move(expout, product); ! 1860: } ! 1861: bump_bitsniffer(exponent, bitmask); ! 1862: } /* while bits-- */ ! 1863: mp_burn(product); /* burn the evidence on the stack */ ! 1864: modmult_burn(); /* ask mp_modmult to also burn its ! 1865: own evidence */ ! 1866: ! 1867: #ifdef COUNTMULTS /* diagnostic analysis */ ! 1868: { ! 1869: long atomic_mults; ! 1870: unsigned int unitcount, totalmults; ! 1871: unitcount = bits2units(countbits(modulus)); ! 1872: /* calculation assumes modsquare takes as long as a modmult: */ ! 1873: atomic_mults = (long) tally_modmults *(unitcount * unitcount); ! 1874: atomic_mults += (long) tally_modsquares *(unitcount * unitcount); ! 1875: printf("%ld atomic mults for ", atomic_mults); ! 1876: printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n", ! 1877: tally_modsquares, tally_modmults, ! 1878: tally_modsquares + tally_modmults, ! 1879: countbits(modulus), unitcount); ! 1880: } ! 1881: #endif /* COUNTMULTS */ ! 1882: ! 1883: set_precision(oldprecision); /* restore original precision */ ! 1884: ! 1885: /* Do an explicit reference to the copyright notice so that the linker ! 1886: will be forced to include it in the executable object image... */ ! 1887: copyright_notice(); /* has no real effect at run time */ ! 1888: ! 1889: return 0; /* normal return */ ! 1890: } /* mp_modexp */ ! 1891: ! 1892: /* ! 1893: * This is a faster modexp for moduli with a known factorisation into two ! 1894: * relatively prime factors p and q, and an input relatively prime to the ! 1895: * modulus, the Chinese Remainder Theorem to do the computation mod p and ! 1896: * mod q, and then combine the results. This relies on a number of ! 1897: * precomputed values, but does not actually require the modulus n or the ! 1898: * exponent e. ! 1899: * ! 1900: * expout = expin ^ e mod (p*q). ! 1901: * We form this by evaluating ! 1902: * p2 = (expin ^ e) mod p and ! 1903: * q2 = (expin ^ e) mod q ! 1904: * and then combining the two by the CRT. ! 1905: * ! 1906: * Two optimisations of this are possible. First, we can reduce expin ! 1907: * modulo p and q before starting. ! 1908: * ! 1909: * Second, since we know the factorisation of p and q (trivially derived ! 1910: * from the factorisation of n = p*q), and expin is relatively prime to ! 1911: * both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m), ! 1912: * to throw away multiples of phi(p) or phi(q) in e. ! 1913: * Letting ep = e mod phi(p) and ! 1914: * eq = e mod phi(q) ! 1915: * then combining these two speedups, we only need to evaluate ! 1916: * p2 = ((expin mod p) ^ ep) mod p and ! 1917: * q2 = ((expin mod q) ^ eq) mod q. ! 1918: * ! 1919: * Now we need to apply the CRT. Starting with ! 1920: * expout = p2 (mod p) and ! 1921: * expout = q2 (mod q) ! 1922: * we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p, ! 1923: * then 0 <= expout < p*q for some 0 <= k < q. Since we want expout = q2 ! 1924: * (mod q), then p*k = q2-p2 (mod q). Since p and q are relatively prime, ! 1925: * p has a multiplicative inverse u mod q. In other words, u = 1/p (mod q). ! 1926: * ! 1927: * Multiplying by u on both sides gives k = u*(q2-p2) (mod q). ! 1928: * Since we want 0 <= k < q, we can thus find k as ! 1929: * k = (u * (q2-p2)) mod q. ! 1930: * ! 1931: * Once we have k, evaluating p2 + p * k is easy, and ! 1932: * that gives us the result. ! 1933: * ! 1934: * In the detailed implementation, there is a temporary, temp, used to ! 1935: * hold intermediate results, p2 is held in expout, and q2 is used as a ! 1936: * temporary in the final step when it is no longer needed. With that, ! 1937: * you should be able to understand the code below. ! 1938: */ ! 1939: int mp_modexp_crt(unitptr expout, unitptr expin, ! 1940: unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u) ! 1941: { ! 1942: unit q2[MAX_UNIT_PRECISION]; ! 1943: unit temp[MAX_UNIT_PRECISION]; ! 1944: int status; ! 1945: ! 1946: /* First, compiute p2 (physically held in M) */ ! 1947: ! 1948: /* p2 = [ (expin mod p) ^ ep ] mod p */ ! 1949: mp_mod(temp, expin, p); /* temp = expin mod p */ ! 1950: status = mp_modexp(expout, temp, ep, p); ! 1951: if (status < 0) { /* mp_modexp returned an error. */ ! 1952: mp_init(expout, 1); ! 1953: return status; /* error return */ ! 1954: } ! 1955: /* And the same thing for q2 */ ! 1956: ! 1957: /* q2 = [ (expin mod q) ^ eq ] mod q */ ! 1958: mp_mod(temp, expin, q); /* temp = expin mod q */ ! 1959: status = mp_modexp(q2, temp, eq, q); ! 1960: if (status < 0) { /* mp_modexp returned an error. */ ! 1961: mp_init(expout, 1); ! 1962: return status; /* error return */ ! 1963: } ! 1964: /* Now use the multiplicative inverse u to glue together the ! 1965: two halves. ! 1966: */ ! 1967: #if 0 ! 1968: /* This optimisation is useful if you commonly get small results, ! 1969: but for random results and large q, the odds of (1/q) of it ! 1970: being useful do not warrant the test. ! 1971: */ ! 1972: if (mp_compare(expout, q2) != 0) { ! 1973: #endif ! 1974: /* Find q2-p2 mod q */ ! 1975: if (mp_sub(q2, expout)) /* if the result went negative */ ! 1976: mp_add(q2, q); /* add q to q2 */ ! 1977: ! 1978: /* expout = p2 + ( p * [(q2*u) mod q] ) */ ! 1979: mp_mult(temp, q2, u); /* q2*u */ ! 1980: mp_mod(q2, temp, q); /* (q2*u) mod q */ ! 1981: mp_mult(temp, p, q2); /* p * [(q2*u) mod q] */ ! 1982: mp_add(expout, temp); /* expout = p2 + p * [...] */ ! 1983: #if 0 ! 1984: } ! 1985: #endif ! 1986: ! 1987: mp_burn(q2); /* burn the evidence on the stack... */ ! 1988: mp_burn(temp); ! 1989: return 0; /* normal return */ ! 1990: } /* mp_modexp_crt */ ! 1991: ! 1992: ! 1993: /****************** end of MPI library ****************************/
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.