|
|
1.1.1.6 ! root 1: /* C source code for multiprecision arithmetic library routines. ! 2: Implemented Nov 86 by Philip Zimmermann ! 3: Last revised 27 Nov 91 by PRZ ! 4: ! 5: Boulder Software Engineering ! 6: 3021 Eleventh Street ! 7: Boulder, CO 80304 ! 8: (303) 541-0140 ! 9: ! 10: (c) Copyright 1986-1994 by Philip Zimmermann. All rights reserved. ! 11: The author assumes no liability for damages resulting from the use ! 12: of this software, even if the damage results from defects in this ! 13: software. No warranty is expressed or implied. The use of this ! 14: cryptographic software for developing weapon systems is expressly ! 15: forbidden. ! 16: ! 17: ! 18: These routines implement all of the multiprecision arithmetic ! 19: necessary for number-theoretic cryptographic algorithms such as ! 20: ElGamal, Diffie-Hellman, Rabin, or factoring studies for large ! 21: composite numbers, as well as Rivest-Shamir-Adleman (RSA) public ! 22: key cryptography. ! 23: ! 24: Although originally developed in Microsoft C for the IBM PC, this code ! 25: contains few machine dependencies. It assumes 2's complement ! 26: arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines, ! 27: lowbyte-highbyte order or highbyte-lowbyte order. This version ! 28: has been converted to ANSI C. ! 29: ! 30: ! 31: The internal representation for these extended precision integer ! 32: "registers" is an array of "units". A unit is a machine word, which ! 33: is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit ! 34: unsigned integer, depending on the machine's word size. For example, ! 35: an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic ! 36: on these huge precision integers, we pass pointers to these unit ! 37: arrays to various subroutines. A pointer to an array of units is of ! 38: type unitptr. This is a pointer to a huge integer "register". ! 39: ! 40: When calling a subroutine, we always pass a pointer to the BEGINNING ! 41: of the array of units, regardless of the byte order of the machine. ! 42: On a lowbyte-first machine, such as the Intel 80x86, this unitptr ! 43: points to the LEAST significant unit, and the array of units increases ! 44: significance to the right. On a highbyte-first machine, such as the ! 45: Motorola 680x0, this unitptr points to the MOST significant unit, and ! 46: the array of units decreases significance to the right. ! 47: ! 48: Modified 8 Apr 92 - HAJK ! 49: Implement new VAX/VMS primitive support. ! 50: ! 51: Modified 30 Sep 92 -Castor Fu <[email protected]> ! 52: Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long) ! 53: ! 54: Modified 28 Nov 92 - Thad Smith ! 55: Added Smith modmult, generalized non-portable support. ! 56: */ ! 57: ! 58: /* #define COUNTMULTS */ /* count modmults for performance studies */ ! 59: ! 60: #ifdef DEBUG ! 61: #ifdef MSDOS ! 62: #ifdef __GO32__ /* DJGPP */ ! 63: #include <pc.h> ! 64: #else ! 65: #include <conio.h> ! 66: #endif /* __GO32__ */ ! 67: #define poll_for_break() {while (kbhit()) getch();} ! 68: #endif /* MSDOS */ ! 69: #endif /* DEBUG */ ! 70: ! 71: #ifndef poll_for_break ! 72: #define poll_for_break() /* stub */ ! 73: #endif ! 74: ! 75: #include "mpilib.h" ! 76: ! 77: #ifdef mp_smula ! 78: #ifdef mp_smul ! 79: Error: Both mp_smula and mp_smul cannot be defined. ! 80: #else ! 81: #define mp_smul mp_smula ! 82: #endif ! 83: #endif ! 84: ! 85: /* set macros for MULTUNIT */ ! 86: #ifdef MUNIT8 ! 87: #define MULTUNITSIZE 8 ! 88: typedef unsigned char MULTUNIT; ! 89: #ifdef UNIT8 ! 90: #define MULTUNIT_SIZE_SAME ! 91: #endif ! 92: #else /* not MUNIT8 */ ! 93: #ifdef MUNIT32 ! 94: #define MULTUNITSIZE 32 ! 95: typedef unsigned long MULTUNIT; ! 96: #ifdef UNIT32 ! 97: #define MULTUNIT_SIZE_SAME ! 98: #else ! 99: /* #error is not portable, this has the same effect */ ! 100: #include "UNITSIZE cannot be smaller than MULTUNITSIZE" ! 101: #endif ! 102: #else /* assume MUNIT16 */ ! 103: #define MULTUNITSIZE 16 ! 104: typedef unsigned short MULTUNIT; ! 105: #ifdef UNIT16 ! 106: #define MULTUNIT_SIZE_SAME ! 107: #endif /* UNIT16 */ ! 108: #ifdef UNIT8 ! 109: #include "UNITSIZE cannot be smaller than MULTUNITSIZE" ! 110: #endif /* UNIT8 */ ! 111: #endif /* MUNIT32 */ ! 112: #endif /* MUNIT8 */ ! 113: ! 114: #define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */ ! 115: #define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1)) ! 116: #define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1)) ! 117: #define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE) ! 118: ! 119: ! 120: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier); ! 121: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier); ! 122: ! 123: short global_precision = 0; /* units of precision for all routines */ ! 124: /* global_precision is the unit precision last set by set_precision. ! 125: Initially, set_precision() should be called to define global_precision ! 126: before using any of these other multiprecision library routines. ! 127: i.e.: set_precision(MAX_UNIT_PRECISION); ! 128: */ ! 129: ! 130: /*************** multiprecision library primitives ****************/ ! 131: /* The following portable C primitives should be recoded in assembly. ! 132: The entry point name should be defined, in "mpilib.h" to the external ! 133: entry point name. If undefined, the C version will be used. ! 134: */ ! 135: ! 136: typedef unsigned long int ulint; ! 137: ! 138: #ifndef mp_addc ! 139: boolean mp_addc ! 140: (register unitptr r1,register unitptr r2,register boolean carry) ! 141: /* multiprecision add with carry r2 to r1, result in r1 */ ! 142: /* carry is incoming carry flag-- value should be 0 or 1 */ ! 143: { ! 144: register unit x; ! 145: short precision; /* number of units to add */ ! 146: precision = global_precision; ! 147: make_lsbptr(r1,precision); ! 148: make_lsbptr(r2,precision); ! 149: while (precision--) { ! 150: if (carry) { ! 151: x = *r1 + *r2 + 1; ! 152: carry = (*r2 >= (unit)(~ *r1)); ! 153: } else { ! 154: x = *r1 + *r2; ! 155: carry = (x < *r1) ; ! 156: } ! 157: post_higherunit(r2); ! 158: *post_higherunit(r1) = x; ! 159: } ! 160: return carry; /* return the final carry flag bit */ ! 161: } /* mp_addc */ ! 162: #endif /* mp_addc */ ! 163: ! 164: #ifndef mp_subb ! 165: boolean mp_subb ! 166: (register unitptr r1,register unitptr r2,register boolean borrow) ! 167: /* multiprecision subtract with borrow, r2 from r1, result in r1 */ ! 168: /* borrow is incoming borrow flag-- value should be 0 or 1 */ ! 169: { ! 170: register unit x; ! 171: short precision; /* number of units to subtract */ ! 172: precision = global_precision; ! 173: make_lsbptr(r1,precision); ! 174: make_lsbptr(r2,precision); ! 175: while (precision--) { ! 176: if (borrow) { ! 177: x = *r1 - *r2 - 1; ! 178: borrow = (*r1 <= *r2); ! 179: } else { ! 180: x = *r1 - *r2; ! 181: borrow = (*r1 < *r2); ! 182: } ! 183: post_higherunit(r2); ! 184: *post_higherunit(r1) = x; ! 185: } ! 186: return borrow; /* return the final carry/borrow flag bit */ ! 187: } /* mp_subb */ ! 188: #endif /* mp_subb */ ! 189: ! 190: #ifndef mp_rotate_left ! 191: boolean mp_rotate_left(register unitptr r1,register boolean carry) ! 192: /* multiprecision rotate left 1 bit with carry, result in r1. */ ! 193: /* carry is incoming carry flag-- value should be 0 or 1 */ ! 194: { ! 195: register int precision; /* number of units to rotate */ ! 196: unsigned int mcarry = carry, nextcarry; /* int is supposed to be ! 197: * the efficient size for ops*/ ! 198: precision = global_precision; ! 199: make_lsbptr(r1,precision); ! 200: while (precision--) { ! 201: nextcarry = (((signedunit) *r1) < 0); ! 202: *r1 = (*r1 << 1) | mcarry; ! 203: mcarry = nextcarry; ! 204: pre_higherunit(r1); ! 205: } ! 206: return nextcarry; /* return the final carry flag bit */ ! 207: } /* mp_rotate_left */ ! 208: #endif /* mp_rotate_left */ ! 209: ! 210: /************** end of primitives that should be in assembly *************/ ! 211: ! 212: ! 213: /* The mp_shift_right_bits function is not called in any time-critical ! 214: situations in public-key cryptographic functions, so it doesn't ! 215: need to be coded in assembly language. ! 216: */ ! 217: void mp_shift_right_bits(register unitptr r1,register short bits) ! 218: /* multiprecision shift right bits, result in r1. ! 219: bits is how many bits to shift, must be <= UNITSIZE. ! 220: */ ! 221: { ! 222: register short precision; /* number of units to shift */ ! 223: register unit carry,nextcarry,bitmask; ! 224: register short unbits; ! 225: if (bits==0) return; /* shift zero bits is a no-op */ ! 226: carry = 0; ! 227: bitmask = power_of_2(bits)-1; ! 228: unbits = UNITSIZE-bits; /* shift bits must be <= UNITSIZE */ ! 229: precision = global_precision; ! 230: make_msbptr(r1,precision); ! 231: if (bits == UNITSIZE) { ! 232: while (precision--) { ! 233: nextcarry = *r1; ! 234: *r1 = carry; ! 235: carry = nextcarry; ! 236: pre_lowerunit(r1); ! 237: } ! 238: } else { ! 239: while (precision--) { ! 240: nextcarry = *r1 & bitmask; ! 241: *r1 >>= bits; ! 242: *r1 |= carry << unbits; ! 243: carry = nextcarry; ! 244: pre_lowerunit(r1); ! 245: } ! 246: } ! 247: } /* mp_shift_right_bits */ ! 248: ! 249: ! 250: #ifndef mp_compare ! 251: short mp_compare(register unitptr r1,register unitptr r2) ! 252: /* ! 253: * Compares multiprecision integers *r1, *r2, and returns: ! 254: * -1 iff *r1 < *r2 ! 255: * 0 iff *r1 == *r2 ! 256: * +1 iff *r1 > *r2 ! 257: */ ! 258: { ! 259: register short precision; /* number of units to compare */ ! 260: ! 261: precision = global_precision; ! 262: make_msbptr(r1,precision); ! 263: make_msbptr(r2,precision); ! 264: do { ! 265: if (*r1 < *r2) ! 266: return -1; ! 267: if (*post_lowerunit(r1) > *post_lowerunit(r2)) ! 268: return 1; ! 269: } while (--precision); ! 270: return 0; /* *r1 == *r2 */ ! 271: } /* mp_compare */ ! 272: #endif /* mp_compare */ ! 273: ! 274: ! 275: boolean mp_inc(register unitptr r) ! 276: /* Increment multiprecision integer r. */ ! 277: { ! 278: register short precision; ! 279: precision = global_precision; ! 280: make_lsbptr(r,precision); ! 281: do { ! 282: if ( ++(*r) ) return 0; /* no carry */ ! 283: post_higherunit(r); ! 284: } while (--precision); ! 285: return 1; /* return carry set */ ! 286: } /* mp_inc */ ! 287: ! 288: ! 289: boolean mp_dec(register unitptr r) ! 290: /* Decrement multiprecision integer r. */ ! 291: { ! 292: register short precision; ! 293: precision = global_precision; ! 294: make_lsbptr(r,precision); ! 295: do { ! 296: if ( (signedunit) (--(*r)) != (signedunit) -1 ) ! 297: return 0; /* no borrow */ ! 298: post_higherunit(r); ! 299: } while (--precision); ! 300: return 1; /* return borrow set */ ! 301: } /* mp_dec */ ! 302: ! 303: ! 304: void mp_neg(register unitptr r) ! 305: /* Compute 2's complement, the arithmetic negative, of r */ ! 306: { ! 307: register short precision; /* number of units to negate */ ! 308: precision = global_precision; ! 309: mp_dec(r); /* 2's complement is 1's complement plus 1 */ ! 310: do { /* now do 1's complement */ ! 311: *r = ~(*r); ! 312: r++; ! 313: } while (--precision); ! 314: } /* mp_neg */ ! 315: ! 316: #ifndef mp_move ! 317: void mp_move(register unitptr dst,register unitptr src) ! 318: { ! 319: register short precision; /* number of units to move */ ! 320: precision = global_precision; ! 321: do { *dst++ = *src++; } while (--precision); ! 322: } /* mp_move */ ! 323: #endif /* mp_move */ ! 324: ! 325: void mp_init(register unitptr r, word16 value) ! 326: /* Init multiprecision register r with short value. */ ! 327: { /* Note that mp_init doesn't extend sign bit for >32767 */ ! 328: ! 329: unitfill0( r, global_precision); ! 330: make_lsbptr(r,global_precision); ! 331: *post_higherunit(r) = value; ! 332: #ifdef UNIT8 ! 333: *post_higherunit(r) = value >> UNITSIZE; ! 334: #endif ! 335: } /* mp_init */ ! 336: ! 337: ! 338: short significance(register unitptr r) ! 339: /* Returns number of significant units in r */ ! 340: { ! 341: register short precision; ! 342: precision = global_precision; ! 343: make_msbptr(r,precision); ! 344: do { ! 345: if (*post_lowerunit(r)) ! 346: return precision; ! 347: } while (--precision); ! 348: return precision; ! 349: } /* significance */ ! 350: ! 351: ! 352: #ifndef unitfill0 ! 353: void unitfill0(unitptr r,word16 unitcount) ! 354: /* Zero-fill the unit buffer r. */ ! 355: { ! 356: while (unitcount--) *r++ = 0; ! 357: } /* unitfill0 */ ! 358: #endif /* unitfill0 */ ! 359: ! 360: ! 361: int mp_udiv(register unitptr remainder,register unitptr quotient, ! 362: register unitptr dividend,register unitptr divisor) ! 363: /* Unsigned divide, treats both operands as positive. */ ! 364: { ! 365: int bits; ! 366: short dprec; ! 367: register unit bitmask; ! 368: ! 369: if (testeq(divisor,0)) ! 370: return -1; /* zero divisor means divide error */ ! 371: mp_init0(remainder); ! 372: mp_init0(quotient); ! 373: /* normalize and compute number of bits in dividend first */ ! 374: init_bitsniffer(dividend,bitmask,dprec,bits); ! 375: /* rescale quotient to same precision (dprec) as dividend */ ! 376: rescale(quotient,global_precision,dprec); ! 377: make_msbptr(quotient,dprec); ! 378: ! 379: while (bits--) { ! 380: mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); ! 381: if (mp_compare(remainder,divisor) >= 0) { ! 382: mp_sub(remainder,divisor); ! 383: stuff_bit(quotient,bitmask); ! 384: } ! 385: bump_2bitsniffers(dividend,quotient,bitmask); ! 386: } ! 387: return 0; ! 388: } /* mp_udiv */ ! 389: ! 390: ! 391: #ifdef UPTON_OR_SMITH ! 392: #define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */ ! 393: ! 394: int mp_recip(register unitptr quotient,register unitptr divisor) ! 395: /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */ ! 396: { ! 397: int bits; ! 398: short qprec; ! 399: register unit bitmask; ! 400: unit remainder[MAX_UNIT_PRECISION]; ! 401: if (testeq(divisor,0)) ! 402: return -1; /* zero divisor means divide error */ ! 403: mp_init0(remainder); ! 404: mp_init0(quotient); ! 405: ! 406: /* normalize and compute number of bits in quotient first */ ! 407: bits = countbits(divisor) + RECIPMARGIN; ! 408: bitmask = bitmsk(bits); /* bitmask within a single unit */ ! 409: qprec = bits2units(bits+1); ! 410: mp_setbit(remainder,(bits-RECIPMARGIN)-1); ! 411: /* rescale quotient to precision of divisor + RECIPMARGIN bits */ ! 412: rescale(quotient,global_precision,qprec); ! 413: make_msbptr(quotient,qprec); ! 414: ! 415: while (bits--) { ! 416: mp_shift_left(remainder); ! 417: if (mp_compare(remainder,divisor) >= 0) { ! 418: mp_sub(remainder,divisor); ! 419: stuff_bit(quotient,bitmask); ! 420: } ! 421: bump_bitsniffer(quotient,bitmask); ! 422: } ! 423: mp_init0(remainder); /* burn sensitive data left on stack */ ! 424: return 0; ! 425: } /* mp_recip */ ! 426: #endif /* UPTON_OR_SMITH */ ! 427: ! 428: ! 429: int mp_div(register unitptr remainder,register unitptr quotient, ! 430: register unitptr dividend,register unitptr divisor) ! 431: /* Signed divide, either or both operands may be negative. */ ! 432: { ! 433: boolean dvdsign,dsign; ! 434: int status; ! 435: dvdsign = (boolean)(mp_tstminus(dividend)!=0); ! 436: dsign = (boolean)(mp_tstminus(divisor)!=0); ! 437: if (dvdsign) mp_neg(dividend); ! 438: if (dsign) mp_neg(divisor); ! 439: status = mp_udiv(remainder,quotient,dividend,divisor); ! 440: if (dvdsign) mp_neg(dividend); /* repair caller's dividend */ ! 441: if (dsign) mp_neg(divisor); /* repair caller's divisor */ ! 442: if (status<0) return status; /* divide error? */ ! 443: if (dvdsign) mp_neg(remainder); ! 444: if (dvdsign ^ dsign) mp_neg(quotient); ! 445: return status; ! 446: } /* mp_div */ ! 447: ! 448: ! 449: word16 mp_shortdiv(register unitptr quotient, ! 450: register unitptr dividend,register word16 divisor) ! 451: /* ! 452: * This function does a fast divide and mod on a multiprecision dividend ! 453: * using a short integer divisor returning a short integer remainder. ! 454: * This is an unsigned divide. It treats both operands as positive. ! 455: * It is used mainly for faster printing of large numbers in base 10. ! 456: */ ! 457: { ! 458: int bits; ! 459: short dprec; ! 460: register unit bitmask; ! 461: register word16 remainder; ! 462: if (!divisor) /* if divisor == 0 */ ! 463: return -1; /* zero divisor means divide error */ ! 464: remainder=0; ! 465: mp_init0(quotient); ! 466: /* normalize and compute number of bits in dividend first */ ! 467: init_bitsniffer(dividend,bitmask,dprec,bits); ! 468: /* rescale quotient to same precision (dprec) as dividend */ ! 469: rescale(quotient,global_precision,dprec); ! 470: make_msbptr(quotient,dprec); ! 471: ! 472: while (bits--) { ! 473: remainder <<= 1; ! 474: if (sniff_bit(dividend,bitmask)) ! 475: remainder++; ! 476: if (remainder >= divisor) { ! 477: remainder -= divisor; ! 478: stuff_bit(quotient,bitmask); ! 479: } ! 480: bump_2bitsniffers(dividend,quotient,bitmask); ! 481: } ! 482: return remainder; ! 483: } /* mp_shortdiv */ ! 484: ! 485: ! 486: int mp_mod(register unitptr remainder, ! 487: register unitptr dividend,register unitptr divisor) ! 488: /* Unsigned divide, treats both operands as positive. */ ! 489: { ! 490: int bits; ! 491: short dprec; ! 492: register unit bitmask; ! 493: if (testeq(divisor,0)) ! 494: return -1; /* zero divisor means divide error */ ! 495: mp_init0(remainder); ! 496: /* normalize and compute number of bits in dividend first */ ! 497: init_bitsniffer(dividend,bitmask,dprec,bits); ! 498: ! 499: while (bits--) { ! 500: mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); ! 501: msub(remainder,divisor); ! 502: bump_bitsniffer(dividend,bitmask); ! 503: } ! 504: return 0; ! 505: } /* mp_mod */ ! 506: ! 507: ! 508: word16 mp_shortmod(register unitptr dividend,register word16 divisor) ! 509: /* ! 510: * This function does a fast mod operation on a multiprecision dividend ! 511: * using a short integer modulus returning a short integer remainder. ! 512: * This is an unsigned divide. It treats both operands as positive. ! 513: * It is used mainly for fast sieve searches for large primes. ! 514: */ ! 515: { int bits; ! 516: short dprec; ! 517: register unit bitmask; ! 518: register word16 remainder; ! 519: if (!divisor) /* if divisor == 0 */ ! 520: return -1; /* zero divisor means divide error */ ! 521: remainder=0; ! 522: /* normalize and compute number of bits in dividend first */ ! 523: init_bitsniffer(dividend,bitmask,dprec,bits); ! 524: ! 525: while (bits--) { ! 526: remainder <<= 1; ! 527: if (sniff_bit(dividend,bitmask)) ! 528: remainder++; ! 529: if (remainder >= divisor) remainder -= divisor; ! 530: bump_bitsniffer(dividend,bitmask); ! 531: } ! 532: return remainder; ! 533: } /* mp_shortmod */ ! 534: ! 535: ! 536: ! 537: #ifdef COMB_MULT /* use faster "comb" multiply algorithm */ ! 538: /* We are skipping this code because it has a bug... */ ! 539: ! 540: int mp_mult(register unitptr prod, ! 541: register unitptr multiplicand, register unitptr multiplier) ! 542: /* ! 543: * Computes multiprecision prod = multiplicand * multiplier ! 544: * ! 545: * Uses interleaved comb multiply algorithm. ! 546: * This improved multiply more than twice as fast as a Russian ! 547: * peasant multiply, because it does a lot fewer shifts. ! 548: * Must have global_precision set to the size of the multiplicand ! 549: * plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum ! 550: * of the lengths of the multiplier and multiplicand. ! 551: * ! 552: * BUG ALERT: Unfortunately, this code has a bug. It fails for ! 553: * some numbers. One such example: ! 554: * x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 ! 555: * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D ! 556: * 52C8 CDC7 AFB3 61C8 243C 741B ! 557: * --which is obviously wrong. The answer should be: ! 558: * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A ! 559: * C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 ! 560: * We'll have to fix this some day. In the meantime, we'll ! 561: * just have the compiler skip over this code. ! 562: * ! 563: * BUG NOTE: Possibly fixed. Needs testing. ! 564: */ ! 565: { ! 566: int bits; ! 567: register unit bitmask; ! 568: unitptr product, mplier, temp; ! 569: short mprec,mprec2; ! 570: unit mplicand[MAX_UNIT_PRECISION]; ! 571: ! 572: /* better clear full width--double precision */ ! 573: mp_init(prod+tohigher(global_precision),0); ! 574: ! 575: if (testeq(multiplicand,0)) ! 576: return 0; /* zero multiplicand means zero product */ ! 577: ! 578: mp_move(mplicand,multiplicand); /* save it from damage */ ! 579: ! 580: normalize(multiplier,mprec); ! 581: if (!mprec) ! 582: return 0; ! 583: ! 584: make_lsbptr(multiplier,mprec); ! 585: bitmask = 1; /* start scan at LSB of multiplier */ ! 586: ! 587: do { /* UNITSIZE times */ ! 588: /* do for bits 0-15 */ ! 589: product = prod; ! 590: mplier = multiplier; ! 591: mprec2 = mprec; ! 592: while (mprec2--) { /* do for each word in multiplier */ ! 593: ! 594: if (sniff_bit(mplier,bitmask)) { ! 595: if (mp_addc(product,multiplicand,0)) /* ripple carry */ ! 596: { /* After 1st time thru, this is rarely encountered. */ ! 597: temp = msbptr(product,global_precision); ! 598: pre_higherunit(temp); ! 599: /* temp now points to LSU of carry region. */ ! 600: unmake_lsbptr(temp,global_precision); ! 601: mp_inc(temp); ! 602: } /* ripple carry */ ! 603: } ! 604: pre_higherunit(mplier); ! 605: pre_higherunit(product); ! 606: } ! 607: if (!(bitmask <<= 1)) ! 608: break; ! 609: mp_shift_left(multiplicand); ! 610: ! 611: } while (TRUE); ! 612: ! 613: mp_move(multiplicand,mplicand); /* recover */ ! 614: ! 615: return 0; /* normal return */ ! 616: } /* mp_mult */ ! 617: ! 618: #endif /* COMB_MULT */ ! 619: ! 620: ! 621: /* Because the "comb" multiply has a bug, we will use the slower ! 622: Russian peasant multiply instead. Fortunately, the mp_mult ! 623: function is not called from any time-critical code. ! 624: */ ! 625: ! 626: int mp_mult(register unitptr prod, ! 627: register unitptr multiplicand,register unitptr multiplier) ! 628: /* ! 629: * Computes multiprecision prod = multiplicand * multiplier ! 630: * Uses "Russian peasant" multiply algorithm. ! 631: */ ! 632: { ! 633: int bits; ! 634: register unit bitmask; ! 635: short mprec; ! 636: mp_init(prod,0); ! 637: if (testeq(multiplicand,0)) ! 638: return 0; /* zero multiplicand means zero product */ ! 639: /* normalize and compute number of bits in multiplier first */ ! 640: init_bitsniffer(multiplier,bitmask,mprec,bits); ! 641: ! 642: while (bits--) { ! 643: mp_shift_left(prod); ! 644: if (sniff_bit(multiplier,bitmask)) ! 645: mp_add(prod,multiplicand); ! 646: bump_bitsniffer(multiplier,bitmask); ! 647: } ! 648: return 0; ! 649: } /* mp_mult */ ! 650: ! 651: ! 652: ! 653: /* ! 654: * mp_modmult computes a multiprecision multiply combined with a ! 655: * modulo operation. This is the most time-critical function in ! 656: * this multiprecision arithmetic library for performing modulo ! 657: * exponentiation. We experimented with different versions of modmult, ! 658: * depending on the machine architecture and performance requirements. ! 659: * We will either use a Russian Peasant modmult (peasant_modmult), ! 660: * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's ! 661: * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult). ! 662: * On machines with a hardware atomic multiply instruction, ! 663: * Smith's modmult is fastest. It can utilize assembly subroutines to ! 664: * speed up the hardware multiply logic and trial quotient calculation. ! 665: * If the machine lacks a fast hardware multiply, Merritt's modmult ! 666: * is preferred, which doesn't call any assembly multiply routine. ! 667: * We use the alias names mp_modmult, stage_modulus, and modmult_burn ! 668: * for the corresponding true names, which depend on what flavor of ! 669: * modmult we are using. ! 670: * ! 671: * Before making the first call to mp_modmult, you must set up the ! 672: * modulus-dependant precomputated tables by calling stage_modulus. ! 673: * After making all the calls to mp_modmult, you call modmult_burn to ! 674: * erase the tables created by stage_modulus that were left in memory. ! 675: */ ! 676: ! 677: #ifdef COUNTMULTS ! 678: /* "number of modmults" counters, used for performance studies. */ ! 679: static unsigned int tally_modmults = 0; ! 680: static unsigned int tally_modsquares = 0; ! 681: #endif /* COUNTMULTS */ ! 682: ! 683: ! 684: #ifdef PEASANT ! 685: /* Conventional Russian peasant multiply with modulo algorithm. */ ! 686: ! 687: static unitptr pmodulus = 0; /* used only by mp_modmult */ ! 688: ! 689: int stage_peasant_modulus(unitptr n) ! 690: /* ! 691: * Must pass modulus to stage_modulus before calling modmult. ! 692: * Assumes that global_precision has already been adjusted to the ! 693: * size of the modulus, plus SLOP_BITS. ! 694: */ ! 695: { /* For this simple version of modmult, just copy unit pointer. */ ! 696: pmodulus = n; ! 697: return 0; /* normal return */ ! 698: } /* stage_peasant_modulus */ ! 699: ! 700: ! 701: int peasant_modmult(register unitptr prod, ! 702: unitptr multiplicand,register unitptr multiplier) ! 703: /* "Russian peasant" multiply algorithm, combined with a modulo ! 704: * operation. This is a simple naive replacement for Merritt's ! 705: * faster modmult algorithm. References global unitptr "modulus". ! 706: * Computes: prod = (multiplicand*multiplier) mod modulus ! 707: * WARNING: All the arguments must be less than the modulus! ! 708: */ ! 709: { ! 710: int bits; ! 711: register unit bitmask; ! 712: short mprec; ! 713: mp_init(prod,0); ! 714: /* if (testeq(multiplicand,0)) ! 715: return 0; */ /* zero multiplicand means zero product */ ! 716: /* normalize and compute number of bits in multiplier first */ ! 717: init_bitsniffer(multiplier,bitmask,mprec,bits); ! 718: ! 719: while (bits--) { ! 720: mp_shift_left(prod); ! 721: msub(prod,pmodulus); /* turns mult into modmult */ ! 722: if (sniff_bit(multiplier,bitmask)) { ! 723: mp_add(prod,multiplicand); ! 724: msub(prod,pmodulus); /* turns mult into modmult */ ! 725: } ! 726: bump_bitsniffer(multiplier,bitmask); ! 727: } ! 728: return 0; ! 729: } /* peasant_modmult */ ! 730: ! 731: ! 732: /* ! 733: * If we are using a version of mp_modmult that uses internal tables ! 734: * in memory, we have to call modmult_burn() at the end of mp_modexp. ! 735: * This is so that no sensitive data is left in memory after the program ! 736: * exits. The Russian peasant method doesn't use any such tables. ! 737: */ ! 738: void peasant_burn(void) ! 739: /* ! 740: * Alias for modmult_burn, called only from mp_modexp(). Destroys ! 741: * internal modmult tables. This version does nothing because no ! 742: * tables are used by the Russian peasant modmult. ! 743: */ ! 744: { } /* peasant_burn */ ! 745: ! 746: #endif /* PEASANT */ ! 747: ! 748: ! 749: #ifdef MERRITT ! 750: /*=========================================================================*/ ! 751: /* ! 752: This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ. ! 753: Also refined by Zhahai Stewart to reduce the number of subtracts. ! 754: Modified by Raymond Brand to reduce the number of SLOP_BITS by 1. ! 755: It performs a multiply combined with a modulo operation, without ! 756: going into "double precision". It is faster than the Russian peasant ! 757: method, and still works well on machines that lack a fast hardware ! 758: multiply instruction. ! 759: */ ! 760: ! 761: /* The following support functions, macros, and data structures ! 762: are used only by Merritt's modmult algorithm... */ ! 763: ! 764: static void mp_lshift_unit(register unitptr r1) ! 765: /* Shift r1 1 whole unit to the left. Used only by modmult function. */ ! 766: { ! 767: register short precision; ! 768: register unitptr r2; ! 769: precision = global_precision; ! 770: make_msbptr(r1,precision); ! 771: r2 = r1; ! 772: while (--precision) ! 773: *post_lowerunit(r1) = *pre_lowerunit(r2); ! 774: *r1 = 0; ! 775: } /* mp_lshift_unit */ ! 776: ! 777: ! 778: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */ ! 779: static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0}; ! 780: static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */ ! 781: { 0, moduli_buf[ 0], moduli_buf[ 1], moduli_buf[ 2], ! 782: moduli_buf[ 3], moduli_buf[ 4], moduli_buf[ 5], moduli_buf[ 6] ! 783: #ifndef UNIT8 ! 784: ,moduli_buf[ 7] ,moduli_buf[ 8], moduli_buf[ 9], moduli_buf[10], ! 785: moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14] ! 786: #ifndef UNIT16 /* and not UNIT8 */ ! 787: ,moduli_buf[15] ,moduli_buf[16], moduli_buf[17], moduli_buf[18], ! 788: moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22], ! 789: moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26], ! 790: moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30] ! 791: #endif /* UNIT16 and UNIT8 not defined */ ! 792: #endif /* UNIT8 not defined */ ! 793: }; ! 794: ! 795: /* ! 796: * To optimize msubs, need following 2 unit arrays, each filled ! 797: * with the most significant unit and next-to-most significant unit ! 798: * of the preshifted images of the modulus. ! 799: */ ! 800: static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */ ! 801: static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */ ! 802: ! 803: /* ! 804: * mpdbuf contains preshifted images of the multiplicand, mod n. ! 805: * It is used only by mp_modmult. It could be staticly declared ! 806: * inside of mp_modmult, but we put it outside mp_modmult so that ! 807: * it can be wiped clean by modmult_burn(), which is called at the ! 808: * end of mp_modexp. This is so that no sensitive data is left in ! 809: * memory after the program exits. ! 810: */ ! 811: static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0}; ! 812: ! 813: ! 814: static void stage_mp_images(unitptr images[UNITSIZE],unitptr r) ! 815: /* ! 816: * Computes UNITSIZE images of r, each shifted left 1 more bit. ! 817: * Used only by modmult function. ! 818: */ ! 819: { ! 820: short int i; ! 821: ! 822: images[0] = r; /* no need to move the first image, just copy ptr */ ! 823: for (i=1; i<UNITSIZE; i++) { ! 824: mp_move(images[i],images[i-1]); ! 825: mp_shift_left(images[i]); ! 826: msub(images[i],moduli[0]); ! 827: } ! 828: } /* stage_mp_images */ ! 829: ! 830: ! 831: int stage_merritt_modulus(unitptr n) ! 832: /* ! 833: * Computes UNITSIZE images of modulus, each shifted left 1 more bit. ! 834: * Before calling mp_modmult, you must first stage the moduli images by ! 835: * calling stage_modulus. n is the pointer to the modulus. ! 836: * Assumes that global_precision has already been adjusted to the ! 837: * size of the modulus, plus SLOP_BITS. ! 838: */ ! 839: { ! 840: short int i; ! 841: unitptr msu; /* ptr to most significant unit, for faster msubs */ ! 842: ! 843: moduli[0] = n; /* no need to move the first image, just copy ptr */ ! 844: ! 845: /* used by optimized msubs macro... */ ! 846: msu = msbptr(n,global_precision); /* needed by msubs */ ! 847: msu_moduli[0] = *post_lowerunit(msu); ! 848: nmsu_moduli[0] = *msu; ! 849: ! 850: for (i=1; i<UNITSIZE; i++) { ! 851: mp_move(moduli[i],moduli[i-1]); ! 852: mp_shift_left(moduli[i]); ! 853: ! 854: /* used by optimized msubs macro... */ ! 855: msu = msbptr(moduli[i],global_precision); /* needed by msubs */ ! 856: msu_moduli[i] = *post_lowerunit(msu); /* for faster msubs */ ! 857: nmsu_moduli[i] = *msu; ! 858: } ! 859: return 0; /* normal return */ ! 860: } /* stage_merritt_modulus */ ! 861: ! 862: ! 863: /* The following macros, sniffadd and msubs, are used by modmult... */ ! 864: #define sniffadd(i) if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i]) ! 865: ! 866: /* Unoptimized msubs macro (msubs0) follows... */ ! 867: /* #define msubs0(i) msub(prod,moduli[i]) ! 868: */ ! 869: ! 870: /* ! 871: * To optimize msubs, compare the most significant units of the ! 872: * product and the shifted modulus before deciding to call the full ! 873: * mp_compare routine. Better still, compare the upper two units ! 874: * before deciding to call mp_compare. ! 875: * Optimization of msubs relies heavily on standard C left-to-right ! 876: * parsimonious evaluation of logical expressions. ! 877: */ ! 878: ! 879: /* Partially-optimized msubs macro (msubs1) follows... */ ! 880: /* #define msubs1(i) if ( \ ! 881: ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \ ! 882: (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \ ! 883: ) mp_sub(prod,moduli[i]) ! 884: */ ! 885: ! 886: /* Fully-optimized msubs macro (msubs2) follows... */ ! 887: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \ ! 888: (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \ ! 889: (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \ ! 890: mp_sub(prod,moduli[i]) ! 891: ! 892: ! 893: int merritt_modmult(register unitptr prod, ! 894: unitptr multiplicand,register unitptr multiplier) ! 895: /* ! 896: * Performs combined multiply/modulo operation. ! 897: * Computes: prod = (multiplicand*multiplier) mod modulus ! 898: * WARNING: All the arguments must be less than the modulus! ! 899: * Assumes the modulus has been predefined by first calling ! 900: * stage_modulus. ! 901: */ ! 902: { ! 903: /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/ ! 904: register signedunit p_m; ! 905: register unitptr msu_prod; /* ptr to most significant unit of product */ ! 906: register unitptr nmsu_prod; /* next-most signif. unit of product */ ! 907: short mprec; /* precision of multiplier, in units */ ! 908: /* Array mpd contains a list of pointers to preshifted images of ! 909: the multiplicand: */ ! 910: static unitptr mpd[UNITSIZE] = { ! 911: 0, mpdbuf[ 0], mpdbuf[ 1], mpdbuf[ 2], ! 912: mpdbuf[ 3], mpdbuf[ 4], mpdbuf[ 5], mpdbuf[ 6] ! 913: #ifndef UNIT8 ! 914: ,mpdbuf[ 7], mpdbuf[ 8], mpdbuf[ 9], mpdbuf[10], ! 915: mpdbuf[11], mpdbuf[12], mpdbuf[13], mpdbuf[14] ! 916: #ifndef UNIT16 /* and not UNIT8 */ ! 917: ,mpdbuf[15], mpdbuf[16], mpdbuf[17], mpdbuf[18], ! 918: mpdbuf[19], mpdbuf[20], mpdbuf[21], mpdbuf[22], ! 919: mpdbuf[23], mpdbuf[24], mpdbuf[25], mpdbuf[26], ! 920: mpdbuf[27], mpdbuf[28], mpdbuf[29], mpdbuf[30] ! 921: #endif /* UNIT16 and UNIT8 not defined */ ! 922: #endif /* UNIT8 not defined */ ! 923: }; ! 924: ! 925: /* Compute preshifted images of multiplicand, mod n: */ ! 926: stage_mp_images(mpd,multiplicand); ! 927: ! 928: /* To optimize msubs, set up msu_prod and nmsu_prod: */ ! 929: msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */ ! 930: nmsu_prod = msu_prod; ! 931: post_lowerunit(nmsu_prod); /* Get next-MSU of prod */ ! 932: ! 933: /* ! 934: * To understand this algorithm, it would be helpful to first ! 935: * study the conventional Russian peasant modmult algorithm. ! 936: * This one does about the same thing as Russian peasant, but ! 937: * is organized differently to save some steps. It loops ! 938: * through the multiplier a word (unit) at a time, instead of ! 939: * a bit at a time. It word-shifts the product instead of ! 940: * bit-shifting it, so it should be faster. It also does about ! 941: * half as many subtracts as Russian peasant. ! 942: */ ! 943: ! 944: mp_init(prod,0); /* Initialize product to 0. */ ! 945: ! 946: /* ! 947: * The way mp_modmult is actually used in cryptographic ! 948: * applications, there will NEVER be a zero multiplier or ! 949: * multiplicand. So there is no need to optimize for that ! 950: * condition. ! 951: */ ! 952: /* if (testeq(multiplicand,0)) ! 953: return 0; */ /* zero multiplicand means zero product */ ! 954: /* Normalize and compute number of units in multiplier first: */ ! 955: normalize(multiplier,mprec); ! 956: if (mprec==0) /* if precision of multiplier is 0 */ ! 957: return 0; /* zero multiplier means zero product */ ! 958: make_msbptr(multiplier,mprec); /* start at MSU of multiplier */ ! 959: ! 960: while (mprec--) { /* Loop for the number of units in the multiplier */ ! 961: /* ! 962: * Shift the product one whole unit to the left. ! 963: * This is part of the multiply phase of modmult. ! 964: */ ! 965: ! 966: mp_lshift_unit(prod); ! 967: ! 968: /* ! 969: * The product may have grown by as many as UNITSIZE ! 970: * bits. That's why we have global_precision set to the ! 971: * size of the modulus plus UNITSIZE slop bits. ! 972: * Now reduce the product back down by conditionally ! 973: * subtracting the preshifted images of the modulus. ! 974: * This is part of the modulo reduction phase of modmult. ! 975: * The following loop is unrolled for speed, using macros... ! 976: ! 977: for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--) ! 978: if (mp_compare(prod,moduli[i]) >= 0) ! 979: mp_sub(prod,moduli[i]); ! 980: */ ! 981: ! 982: #ifndef UNIT8 ! 983: #ifndef UNIT16 /* and not UNIT8 */ ! 984: msubs(31); ! 985: msubs(30); ! 986: msubs(29); ! 987: msubs(28); ! 988: msubs(27); ! 989: msubs(26); ! 990: msubs(25); ! 991: msubs(24); ! 992: msubs(23); ! 993: msubs(22); ! 994: msubs(21); ! 995: msubs(20); ! 996: msubs(19); ! 997: msubs(18); ! 998: msubs(17); ! 999: msubs(16); ! 1000: #endif /* not UNIT16 and not UNIT8 */ ! 1001: msubs(15); ! 1002: msubs(14); ! 1003: msubs(13); ! 1004: msubs(12); ! 1005: msubs(11); ! 1006: msubs(10); ! 1007: msubs(9); ! 1008: msubs(8); ! 1009: #endif /* not UNIT8 */ ! 1010: msubs(7); ! 1011: msubs(6); ! 1012: msubs(5); ! 1013: #ifndef UNIT32 ! 1014: msubs(4); ! 1015: #ifndef UNIT16 ! 1016: msubs(3); ! 1017: #endif ! 1018: #endif ! 1019: ! 1020: /* ! 1021: * Sniff each bit in the current unit of the multiplier, ! 1022: * and conditionally add the corresponding preshifted ! 1023: * image of the multiplicand to the product. ! 1024: * This is also part of the multiply phase of modmult. ! 1025: * ! 1026: * The following loop is unrolled for speed, using macros... ! 1027: ! 1028: for (i=UNITSIZE-1; i>=0; i--) ! 1029: if (*multiplier & power_of_2(i)) ! 1030: mp_add(prod,mpd[i]); ! 1031: */ ! 1032: #ifndef UNIT8 ! 1033: #ifndef UNIT16 /* and not UNIT8 */ ! 1034: sniffadd(31); ! 1035: sniffadd(30); ! 1036: sniffadd(29); ! 1037: sniffadd(28); ! 1038: sniffadd(27); ! 1039: sniffadd(26); ! 1040: sniffadd(25); ! 1041: sniffadd(24); ! 1042: sniffadd(23); ! 1043: sniffadd(22); ! 1044: sniffadd(21); ! 1045: sniffadd(20); ! 1046: sniffadd(19); ! 1047: sniffadd(18); ! 1048: sniffadd(17); ! 1049: sniffadd(16); ! 1050: #endif /* not UNIT16 and not UNIT8 */ ! 1051: sniffadd(15); ! 1052: sniffadd(14); ! 1053: sniffadd(13); ! 1054: sniffadd(12); ! 1055: sniffadd(11); ! 1056: sniffadd(10); ! 1057: sniffadd(9); ! 1058: sniffadd(8); ! 1059: #endif /* not UNIT8 */ ! 1060: sniffadd(7); ! 1061: sniffadd(6); ! 1062: sniffadd(5); ! 1063: sniffadd(4); ! 1064: sniffadd(3); ! 1065: sniffadd(2); ! 1066: sniffadd(1); ! 1067: sniffadd(0); ! 1068: ! 1069: /* ! 1070: * The product may have grown by as many as LOG_UNITSIZE+1 ! 1071: * bits. ! 1072: * Now reduce the product back down by conditionally ! 1073: * subtracting LOG_UNITSIZE+1 preshifted images of the ! 1074: * modulus. This is the modulo reduction phase of modmult. ! 1075: * ! 1076: * The following loop is unrolled for speed, using macros... ! 1077: ! 1078: for (i=LOG_UNITSIZE; i>=0; i--) ! 1079: if (mp_compare(prod,moduli[i]) >= 0) ! 1080: mp_sub(prod,moduli[i]); ! 1081: */ ! 1082: ! 1083: #ifndef UNIT8 ! 1084: #ifndef UNIT16 ! 1085: msubs(5); ! 1086: #endif ! 1087: msubs(4); ! 1088: #endif ! 1089: msubs(3); ! 1090: msubs(2); ! 1091: msubs(1); ! 1092: msubs(0); ! 1093: ! 1094: /* Bump pointer to next lower unit of multiplier: */ ! 1095: post_lowerunit(multiplier); ! 1096: ! 1097: } /* Loop for the number of units in the multiplier */ ! 1098: ! 1099: return 0; /* normal return */ ! 1100: ! 1101: } /* merritt_modmult */ ! 1102: ! 1103: ! 1104: #undef msubs ! 1105: #undef sniffadd ! 1106: ! 1107: ! 1108: /* ! 1109: * Merritt's mp_modmult function leaves some internal tables in memory, ! 1110: * so we have to call modmult_burn() at the end of mp_modexp. ! 1111: * This is so that no cryptographically sensitive data is left in memory ! 1112: * after the program exits. ! 1113: */ ! 1114: void merritt_burn(void) ! 1115: /* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */ ! 1116: { ! 1117: unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION); ! 1118: unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION); ! 1119: unitfill0(msu_moduli,UNITSIZE); ! 1120: unitfill0(nmsu_moduli,UNITSIZE); ! 1121: } /* merritt_burn() */ ! 1122: ! 1123: /******* end of Merritt's MODMULT stuff. *******/ ! 1124: /*=========================================================================*/ ! 1125: #endif /* MERRITT */ ! 1126: ! 1127: ! 1128: #ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's modmult algorithms */ ! 1129: ! 1130: /* ! 1131: * Jimmy Upton's multiprecision modmult algorithm in C. ! 1132: * Performs a multiply combined with a modulo operation. ! 1133: * ! 1134: * The following support functions and data structures ! 1135: * are used only by Upton's modmult algorithm... ! 1136: */ ! 1137: ! 1138: short munit_prec; /* global_precision expressed in MULTUNITs */ ! 1139: ! 1140: /* ! 1141: * Note that since the SPARC CPU has no hardware integer multiply ! 1142: * instruction, there is not that much advantage in having an ! 1143: * assembly version of mp_smul on that machine. It might be faster ! 1144: * to use Merritt's modmult instead of Upton's modmult on the SPARC. ! 1145: */ ! 1146: ! 1147: /* ! 1148: * Multiply the single-word multiplier times the multiprecision integer ! 1149: * in multiplicand, accumulating result in prod. The resulting ! 1150: * multiprecision prod will be 1 word longer than the multiplicand. ! 1151: * multiplicand is munit_prec words long. We add into prod, so caller ! 1152: * should zero it out first. For best results, this time-critical ! 1153: * function should be implemented in assembly. ! 1154: * NOTE: Unlike other functions in the multiprecision arithmetic ! 1155: * library, both multiplicand and prod are pointing at the LSB, ! 1156: * regardless of byte order of the machine. On an 80x86, this makes ! 1157: * no difference. But if this assembly function is implemented ! 1158: * on a 680x0, it becomes important. ! 1159: */ ! 1160: ! 1161: /* ! 1162: * Note that this has been modified from the previous version to allow ! 1163: * better support for Smith's modmult: ! 1164: * The final carry bit is added to the existing product ! 1165: * array, rather than simply stored. ! 1166: */ ! 1167: ! 1168: #ifndef mp_smul ! 1169: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier) ! 1170: { ! 1171: short i; ! 1172: unsigned long p, carry; ! 1173: ! 1174: carry = 0; ! 1175: for (i=0; i<munit_prec; ++i) { ! 1176: p = (unsigned long)multiplier * *post_higherunit(multiplicand); ! 1177: p += *prod + carry; ! 1178: *post_higherunit(prod) = (MULTUNIT)p; ! 1179: carry = p >> MULTUNITSIZE; ! 1180: } ! 1181: /* Add carry to the next higher word of product / dividend */ ! 1182: *prod += (MULTUNIT)carry; ! 1183: } /* mp_smul */ ! 1184: #endif ! 1185: ! 1186: /* ! 1187: * mp_dmul is a double-precision multiply multiplicand times multiplier, ! 1188: * result into prod. prod must be pointing at a "double precision" ! 1189: * buffer. E.g. If global_precision is 10 words, prod must be ! 1190: * pointing at a 20-word buffer. ! 1191: */ ! 1192: #ifndef mp_dmul ! 1193: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier) ! 1194: { ! 1195: register int i; ! 1196: register MULTUNIT *p_multiplicand, *p_multiplier; ! 1197: register MULTUNIT *prodp; ! 1198: ! 1199: ! 1200: unitfill0(prod,global_precision*2); /* Pre-zero prod */ ! 1201: /* Calculate precision in units of MULTUNIT */ ! 1202: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; ! 1203: p_multiplicand = (MULTUNIT *)multiplicand; ! 1204: p_multiplier = (MULTUNIT *)multiplier; ! 1205: prodp = (MULTUNIT *)prod; ! 1206: make_lsbptr(p_multiplicand,munit_prec); ! 1207: make_lsbptr(p_multiplier,munit_prec); ! 1208: make_lsbptr(prodp,munit_prec*2); ! 1209: /* Multiply multiplicand by each word in multiplier, accumulating prod: */ ! 1210: for (i=0; i<munit_prec; ++i) ! 1211: mp_smul (post_higherunit(prodp), ! 1212: p_multiplicand, *post_higherunit(p_multiplier)); ! 1213: } /* mp_dmul */ ! 1214: #endif /* mp_dmul */ ! 1215: ! 1216: static unit ALIGN modulus[MAX_UNIT_PRECISION]; ! 1217: static short nbits; /* number of modulus significant bits */ ! 1218: #endif /* UPTON_OR_SMITH */ ! 1219: ! 1220: #ifdef UPTON ! 1221: ! 1222: /* ! 1223: * These scratchpad arrays are used only by upton_modmult (mp_modmult). ! 1224: * Some of them could be staticly declared inside of mp_modmult, but we ! 1225: * put them outside mp_modmult so that they can be wiped clean by ! 1226: * modmult_burn(), which is called at the end of mp_modexp. This is ! 1227: * so that no sensitive data is left in memory after the program exits. ! 1228: */ ! 1229: ! 1230: static unit ALIGN reciprocal[MAX_UNIT_PRECISION]; ! 1231: static unit ALIGN dhi[MAX_UNIT_PRECISION]; ! 1232: static unit ALIGN d_data[MAX_UNIT_PRECISION*2]; /* double-wide register */ ! 1233: static unit ALIGN e_data[MAX_UNIT_PRECISION*2]; /* double-wide register */ ! 1234: static unit ALIGN f_data[MAX_UNIT_PRECISION*2]; /* double-wide register */ ! 1235: ! 1236: static short nbitsDivUNITSIZE; ! 1237: static short nbitsModUNITSIZE; ! 1238: ! 1239: /* ! 1240: * stage_upton_modulus() is aliased to stage_modulus(). ! 1241: * Prepare for an Upton modmult. Calculate the reciprocal of modulus, ! 1242: * and save both. Note that reciprocal will have one more bit than ! 1243: * modulus. ! 1244: * Assumes that global_precision has already been adjusted to the ! 1245: * size of the modulus, plus SLOP_BITS. ! 1246: */ ! 1247: int stage_upton_modulus(unitptr n) ! 1248: { ! 1249: mp_move(modulus, n); ! 1250: mp_recip(reciprocal, modulus); ! 1251: nbits = countbits(modulus); ! 1252: nbitsDivUNITSIZE = nbits / UNITSIZE; ! 1253: nbitsModUNITSIZE = nbits % UNITSIZE; ! 1254: return 0; /* normal return */ ! 1255: } /* stage_upton_modulus */ ! 1256: ! 1257: ! 1258: /* ! 1259: * Upton's algorithm performs a multiply combined with a modulo operation. ! 1260: * Computes: prod = (multiplicand*multiplier) mod modulus ! 1261: * WARNING: All the arguments must be less than the modulus! ! 1262: * References global unitptr modulus and reciprocal. ! 1263: * The reciprocal of modulus is 1 bit longer than the modulus. ! 1264: * upton_modmult() is aliased to mp_modmult(). ! 1265: */ ! 1266: int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier) ! 1267: { ! 1268: unitptr d = d_data; ! 1269: unitptr d1 = d_data; ! 1270: unitptr e = e_data; ! 1271: unitptr f = f_data; ! 1272: short orig_precision; ! 1273: ! 1274: orig_precision = global_precision; ! 1275: mp_dmul (d,multiplicand,multiplier); ! 1276: ! 1277: /* Throw off low nbits of d */ ! 1278: #ifndef HIGHFIRST ! 1279: d1 = d + nbitsDivUNITSIZE; ! 1280: #else ! 1281: d1 = d + orig_precision - nbitsDivUNITSIZE; ! 1282: #endif ! 1283: mp_move(dhi, d1); /* Don't screw up d, we need it later */ ! 1284: mp_shift_right_bits(dhi, nbitsModUNITSIZE); ! 1285: ! 1286: mp_dmul (e,dhi,reciprocal); /* Note - reciprocal has nbits+1 bits */ ! 1287: ! 1288: #ifndef HIGHFIRST ! 1289: e += nbitsDivUNITSIZE; ! 1290: #else ! 1291: e += orig_precision - nbitsDivUNITSIZE; ! 1292: #endif ! 1293: mp_shift_right_bits(e, nbitsModUNITSIZE); ! 1294: ! 1295: mp_dmul (f,e,modulus); ! 1296: ! 1297: /* Now for the only double-precision call to mpilib: */ ! 1298: set_precision (orig_precision * 2); ! 1299: mp_sub (d,f); ! 1300: ! 1301: /* d's precision should be <= orig_precision */ ! 1302: rescale (d, orig_precision*2, orig_precision); ! 1303: set_precision (orig_precision); ! 1304: ! 1305: /* Should never have to do this final subtract more than twice: */ ! 1306: while (mp_compare(d,modulus) > 0) ! 1307: mp_sub (d,modulus); ! 1308: ! 1309: mp_move(prod,d); ! 1310: return 0; /* normal return */ ! 1311: } /* upton_modmult */ ! 1312: ! 1313: ! 1314: /* Upton's mp_modmult function leaves some internal arrays in memory, ! 1315: so we have to call modmult_burn() at the end of mp_modexp. ! 1316: This is so that no cryptographically sensitive data is left in memory ! 1317: after the program exits. ! 1318: upton_burn() is aliased to modmult_burn(). ! 1319: */ ! 1320: void upton_burn(void) ! 1321: { ! 1322: unitfill0(modulus,MAX_UNIT_PRECISION); ! 1323: unitfill0(reciprocal,MAX_UNIT_PRECISION); ! 1324: unitfill0(dhi,MAX_UNIT_PRECISION); ! 1325: unitfill0(d_data,MAX_UNIT_PRECISION*2); ! 1326: unitfill0(e_data,MAX_UNIT_PRECISION*2); ! 1327: unitfill0(f_data,MAX_UNIT_PRECISION*2); ! 1328: nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0; ! 1329: } /* upton_burn */ ! 1330: ! 1331: /******* end of Upton's MODMULT stuff. *******/ ! 1332: /*=========================================================================*/ ! 1333: #endif /* UPTON */ ! 1334: ! 1335: #ifdef SMITH /* using Thad Smith's modmult algorithm */ ! 1336: ! 1337: /* ! 1338: * Thad Smith's implementation of multiprecision modmult algorithm in C. ! 1339: * Performs a multiply combined with a modulo operation. ! 1340: * The multiplication is done with mp_dmul, the same as for Upton's ! 1341: * modmult. The modulus reduction is done by long division, in ! 1342: * which a trial quotient "digit" is determined, then the product of ! 1343: * that digit and the divisor are subtracted from the dividend. ! 1344: * ! 1345: * In this case, the digit is MULTUNIT in size and the subtraction ! 1346: * is done by adding the product to the one's complement of the ! 1347: * dividend, which allows use of the existing mp_smul routine. ! 1348: * ! 1349: * The following support functions and data structures ! 1350: * are used only by Smith's modmult algorithm... ! 1351: */ ! 1352: ! 1353: /* ! 1354: * These scratchpad arrays are used only by smith_modmult (mp_modmult). ! 1355: * Some of them could be statically declared inside of mp_modmult, but we ! 1356: * put them outside mp_modmult so that they can be wiped clean by ! 1357: * modmult_burn(), which is called at the end of mp_modexp. This is ! 1358: * so that no sensitive data is left in memory after the program exits. ! 1359: */ ! 1360: ! 1361: static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2]; ! 1362: ! 1363: static unit mod_quotient [4]; ! 1364: static unit mod_divisor [4]; /* 2 most signif. MULTUNITs of modulus */ ! 1365: ! 1366: static MULTUNIT *modmpl; /* ptr to modulus least significant ! 1367: ** MULTUNIT */ ! 1368: static int mshift; /* number of bits for ! 1369: ** recip scaling */ ! 1370: static MULTUNIT reciph; /* MSunit of scaled recip */ ! 1371: static MULTUNIT recipl; /* LSunit of scaled recip */ ! 1372: ! 1373: static short modlenMULTUNITS; /* length of modulus in MULTUNITs */ ! 1374: static MULTUNIT mutemp; /* temporary */ ! 1375: ! 1376: /* ! 1377: * The routines mp_smul and mp_dmul are the same as for UPTON and ! 1378: * should be coded in assembly. Note, however, that the previous ! 1379: * Upton's mp_smul version has been modified to compatible with ! 1380: * Smith's modmult. The new version also still works for Upton's ! 1381: * modmult. ! 1382: */ ! 1383: ! 1384: #ifndef mp_set_recip ! 1385: #define mp_set_recip(rh,rl,m) /* null */ ! 1386: #else ! 1387: /* setup routine for external mp_quo_digit */ ! 1388: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m); ! 1389: #endif ! 1390: MULTUNIT mp_quo_digit (MULTUNIT *dividend); ! 1391: ! 1392: #ifdef MULTUNIT_SIZE_SAME ! 1393: #define mp_musubb mp_subb /* use existing routine */ ! 1394: #else /* ! MULTUNIT_SIZE_SAME */ ! 1395: ! 1396: /* ! 1397: * This performs the same function as mp_subb, but with MULTUNITs. ! 1398: * Note: Processors without alignment requirements may be able to use ! 1399: * mp_subb, even though MULTUNITs are smaller than units. In that case, ! 1400: * use mp_subb, since it would be faster if coded in assembly. Note that ! 1401: * this implementation won't work for MULTUNITs which are long -- use ! 1402: * mp_subb in that case. ! 1403: */ ! 1404: #ifndef mp_musubb ! 1405: boolean mp_musubb ! 1406: (register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow) ! 1407: /* ! 1408: * multiprecision subtract of MULTUNITs with borrow, r2 from r1, result in r1 ! 1409: * borrow is incoming borrow flag-- value should be 0 or 1 ! 1410: */ ! 1411: { ! 1412: register ulint x; /* won't work if sizeof(MULTUNIT)== ! 1413: sizeof(long) */ ! 1414: short precision; /* number of MULTUNITs to subtract */ ! 1415: precision = global_precision * MULTUNITs_perunit; ! 1416: make_lsbptr(r1,precision); ! 1417: make_lsbptr(r2,precision); ! 1418: while (precision--) { ! 1419: x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow; ! 1420: *post_higherunit(r1) = x; ! 1421: borrow = (((1L << MULTUNITSIZE) & x) != 0L); ! 1422: } ! 1423: return (borrow); ! 1424: } /* mp_musubb */ ! 1425: #endif /* mp_musubb */ ! 1426: #endif /* MULTUNIT_SIZE_SAME */ ! 1427: ! 1428: /* ! 1429: * The function mp_quo_digit is the heart of Smith's modulo reduction, ! 1430: * which uses a form of long division. It computes a trial quotient ! 1431: * "digit" (MULTUNIT-sized digit) by multiplying the three most ! 1432: * significant MULTUNITs of the dividend by the two most significant ! 1433: * MULTUNITs of the reciprocal of the modulus. Note that this function ! 1434: * requires that MULTUNITSIZE * 2 <= sizeof(unsigned long). ! 1435: * ! 1436: * An important part of this technique is that the quotient never be ! 1437: * too small, although it may occasionally be too large. This was ! 1438: * done to eliminate the need to check and correct for a remainder ! 1439: * exceeding the divisor. It is easier to check for a negative ! 1440: * remainder. The following technique rarely needs correction for ! 1441: * MULTUNITs of at least 16 bits. ! 1442: * ! 1443: * The following routine has two implementations: ! 1444: * ! 1445: * ASM_PROTOTYPE defined: written to be an executable prototype for ! 1446: * an efficient assembly language implementation. Note that several ! 1447: * of the following masks and shifts can be done by directly ! 1448: * manipulating single precision registers on some architectures. ! 1449: * ! 1450: * ASM_PROTOTYPE undefined: a slightly more efficient implementation ! 1451: * in C. Although this version returns a result larger than the ! 1452: * optimum (which is corrected in smith_modmult) more often than the ! 1453: * prototype version, the faster execution in C more than makes up ! 1454: * for the difference. ! 1455: * ! 1456: * Parameter: dividend - points to the most significant MULTUNIT ! 1457: * of the dividend. Note that dividend actually contains the ! 1458: * one's complement of the actual dividend value (see comments for ! 1459: * smith_modmult). ! 1460: * ! 1461: * Return: the trial quotient digit resulting from dividing the first ! 1462: * three MULTUNITs at dividend by the upper two MULTUNITs of the ! 1463: * modulus. ! 1464: */ ! 1465: ! 1466: /* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */ ! 1467: ! 1468: #ifndef mp_quo_digit ! 1469: MULTUNIT mp_quo_digit (MULTUNIT *dividend) { ! 1470: unsigned long q, q0, q1, q2; ! 1471: unsigned short lsb_factor; ! 1472: ! 1473: /* ! 1474: * Compute the least significant product group. ! 1475: *The last terms of q1 and q2 perform upward rounding, which is ! 1476: *needed to guarantee that the result not be too small. ! 1477: */ ! 1478: q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph ! 1479: + reciph; ! 1480: q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl ! 1481: + (1L << MULTUNITSIZE) ; ! 1482: #ifdef ASM_PROTOTYPE ! 1483: lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE); ! 1484: q = q1 + q2; ! 1485: ! 1486: /* The following statement is equivalent to shifting the sum right ! 1487: one bit while shifting in the carry bit. ! 1488: */ ! 1489: q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); ! 1490: #else /* optimized C version */ ! 1491: q0 = (q1>>1) + (q2>>1) + 1; ! 1492: #endif ! 1493: ! 1494: /* Compute the middle significant product group. */ ! 1495: ! 1496: q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph; ! 1497: q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl; ! 1498: #ifdef ASM_PROTOTYPE ! 1499: q = q1 + q2; ! 1500: q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); ! 1501: ! 1502: /* Add in the most significant word of the first group. ! 1503: The last term takes care of the carry from adding the lsb's ! 1504: that were shifted out prior to addition. ! 1505: */ ! 1506: q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2)); ! 1507: #else /* optimized C version */ ! 1508: q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1; ! 1509: #endif ! 1510: ! 1511: /* Compute the most significant term and add in the others */ ! 1512: ! 1513: q = (q >> (MULTUNITSIZE-2)) + ! 1514: (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1); ! 1515: q >>= mshift; ! 1516: ! 1517: /* Prevent overflow and then wipe out the intermediate results. */ ! 1518: ! 1519: mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1); ! 1520: q= q0 = q1 = q2 = 0; lsb_factor = 0; (void)lsb_factor; ! 1521: return mutemp; ! 1522: } ! 1523: #endif /* mp_quo_digit */ ! 1524: ! 1525: /* ! 1526: * stage_smith_modulus() - Prepare for a Smith modmult. ! 1527: * ! 1528: * Calculate the reciprocal of modulus with a precision of two MULTUNITs. ! 1529: * Assumes that global_precision has already been adjusted to the ! 1530: * size of the modulus, plus SLOP_BITS. ! 1531: * ! 1532: * Note: This routine was designed to work with large values and ! 1533: * doesn't have the necessary testing or handling to work with a ! 1534: * modulus having less than three significant units. For such cases, ! 1535: * the separate multiply and modulus routines can be used. ! 1536: * ! 1537: * stage_smith_modulus() is aliased to stage_modulus(). ! 1538: */ ! 1539: ! 1540: int stage_smith_modulus(unitptr n_modulus) ! 1541: { ! 1542: int original_precision; ! 1543: int sigmod; /* significant units in modulus */ ! 1544: unitptr mp; /* modulus most significant pointer */ ! 1545: MULTUNIT *mpm; /* reciprocal pointer */ ! 1546: int prec; /* precision of reciprocal calc in units */ ! 1547: ! 1548: mp_move(modulus, n_modulus); ! 1549: modmpl = (MULTUNIT*) modulus; ! 1550: modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit); ! 1551: nbits = countbits(modulus); ! 1552: modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE; ! 1553: ! 1554: original_precision = global_precision; ! 1555: ! 1556: /* The following code copies the three most significant units of ! 1557: * modulus to mod_divisor. ! 1558: */ ! 1559: mp = modulus; ! 1560: sigmod = significance (modulus); ! 1561: rescale (mp, original_precision, sigmod); ! 1562: /* prec is the unit precision required for 3 MULTUNITs */ ! 1563: prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit; ! 1564: set_precision (prec); ! 1565: ! 1566: /* set mp = ptr to most significant units of modulus, then move ! 1567: * the most significant units to mp_divisor ! 1568: */ ! 1569: mp = msbptr(mp,sigmod) -tohigher(prec-1); ! 1570: unmake_lsbptr (mp, prec); ! 1571: mp_move (mod_divisor, mp); ! 1572: ! 1573: /* Keep 2*MULTUNITSIZE bits in mod_divisor. ! 1574: * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits. ! 1575: */ ! 1576: mshift = countbits (mod_divisor) - 2*MULTUNITSIZE; ! 1577: mp_shift_right_bits (mod_divisor, mshift); ! 1578: mp_recip(mod_quotient,mod_divisor); ! 1579: mp_shift_right_bits (mod_quotient,1); ! 1580: ! 1581: /* Reduce to: 0 < mshift <= MULTUNITSIZE */ ! 1582: mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1; ! 1583: /* round up, rescaling if necessary */ ! 1584: mp_inc (mod_quotient); ! 1585: if (countbits (mod_quotient) > 2*MULTUNITSIZE) { ! 1586: mp_shift_right_bits (mod_quotient,1); ! 1587: mshift--; /* now 0 <= mshift <= MULTUNITSIZE */ ! 1588: } ! 1589: mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit); ! 1590: recipl = *post_higherunit (mpm); ! 1591: reciph = *mpm; ! 1592: mp_set_recip (reciph, recipl, mshift); ! 1593: set_precision (original_precision); ! 1594: return 0; /* normal return */ ! 1595: } /* stage_smith_modulus */ ! 1596: ! 1597: /* Smith's algorithm performs a multiply combined with a modulo operation. ! 1598: Computes: prod = (multiplicand*multiplier) mod modulus ! 1599: The modulus must first be set by stage_smith_modulus(). ! 1600: smith_modmult() is aliased to mp_modmult(). ! 1601: */ ! 1602: ! 1603: int ! 1604: smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) ! 1605: { ! 1606: unitptr d; /* ptr to product */ ! 1607: MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first) ! 1608: * aligned for subtraction */ ! 1609: /* Note that dmph points one MULTUNIT higher than indicated by ! 1610: global precision. This allows us to zero out a word one higher than ! 1611: the normal precision. ! 1612: */ ! 1613: short orig_precision; ! 1614: short nqd; /* number of quotient digits remaining to ! 1615: * be generated */ ! 1616: short dmi; /* number of significant MULTUNITs in product */ ! 1617: ! 1618: d = ds_data + 1; /* room for leading MSB if HIGHFIRST */ ! 1619: orig_precision = global_precision; ! 1620: mp_dmul(d, multiplicand, multiplier); ! 1621: ! 1622: rescale(d, orig_precision * 2, orig_precision * 2 + 1); ! 1623: set_precision(orig_precision * 2 + 1); ! 1624: *msbptr(d, global_precision) = 0; /* leading 0 unit */ ! 1625: ! 1626: /* We now start working with MULTUNITs. ! 1627: Determine the most significant MULTUNIT of the product so we don't ! 1628: have to process leading zeros in our divide loop. ! 1629: */ ! 1630: dmi = significance(d) * MULTUNITs_perunit; ! 1631: if (dmi >= modlenMULTUNITS) { ! 1632: /* Make dividend negative. This allows the use of mp_smul to ! 1633: * "subtract" the product of the modulus and the trial divisor ! 1634: * by actually adding to a negative dividend. ! 1635: * The one's complement of the dividend is used, since it causes ! 1636: * a zero value to be represented as all ones. This facilitates ! 1637: * testing the result for possible overflow, since a sign bit ! 1638: * indicates that no adjustment is necessary, and we should not ! 1639: * attempt to adjust if the result of the addition is zero. ! 1640: */ ! 1641: mp_inc(d); ! 1642: mp_neg(d); ! 1643: set_precision(orig_precision); ! 1644: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; ! 1645: ! 1646: /* Set msb, lsb, and normal ptrs of dividend */ ! 1647: dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) * ! 1648: MULTUNITs_perunit) + tohigher(dmi); ! 1649: nqd = dmi + 1 - modlenMULTUNITS; ! 1650: dmpl = dmph - tohigher(modlenMULTUNITS); ! 1651: ! 1652: /* ! 1653: * Divide loop. ! 1654: * Each iteration computes the next quotient MULTUNIT digit, then ! 1655: * multiplies the divisor (modulus) by the quotient digit and adds ! 1656: * it to the one's complement of the dividend (equivalent to ! 1657: * subtracting). If the product was greater than the remaining dividend, ! 1658: * we get a non-negative result, in which case we subtract off the ! 1659: * modulus to get the proper negative remainder. ! 1660: */ ! 1661: for (; nqd; nqd--) { ! 1662: MULTUNIT q; /* quotient trial digit */ ! 1663: ! 1664: q = mp_quo_digit(dmph); ! 1665: if (q > 0) { ! 1666: mp_smul(dmpl, modmpl, q); ! 1667: ! 1668: /* Perform correction if q too large. ! 1669: * This rarely occurs. ! 1670: */ ! 1671: if (!(*dmph & MULTUNIT_msb)) { ! 1672: dmp = dmpl; ! 1673: unmake_lsbptr(dmp, orig_precision * ! 1674: MULTUNITs_perunit); ! 1675: if (mp_musubb(dmp, ! 1676: (MULTUNIT *) modulus, 0)) ! 1677: (*dmph) --; ! 1678: } ! 1679: } ! 1680: pre_lowerunit(dmph); ! 1681: pre_lowerunit(dmpl); ! 1682: } ! 1683: /* d contains the one's complement of the remainder. */ ! 1684: rescale(d, orig_precision * 2 + 1, orig_precision); ! 1685: set_precision(orig_precision); ! 1686: mp_neg(d); ! 1687: mp_dec(d); ! 1688: } else { ! 1689: /* Product was less than modulus. Return it. */ ! 1690: rescale(d, orig_precision * 2 + 1, orig_precision); ! 1691: set_precision(orig_precision); ! 1692: } ! 1693: mp_move(prod, d); ! 1694: return (0); /* normal return */ ! 1695: } /* smith_modmult */ ! 1696: ! 1697: ! 1698: /* Smith's mp_modmult function leaves some internal arrays in memory, ! 1699: so we have to call modmult_burn() at the end of mp_modexp. ! 1700: This is so that no cryptographically sensitive data is left in memory ! 1701: after the program exits. ! 1702: smith_burn() is aliased to modmult_burn(). ! 1703: */ ! 1704: void smith_burn(void) ! 1705: { ! 1706: empty_array (modulus); ! 1707: empty_array (ds_data); ! 1708: empty_array (mod_quotient); ! 1709: empty_array (mod_divisor); ! 1710: modmpl = 0; ! 1711: mshift =nbits = 0; ! 1712: reciph = recipl = 0; ! 1713: modlenMULTUNITS = mutemp = 0; ! 1714: mp_set_recip (0,0,0); ! 1715: } /* smith_burn */ ! 1716: ! 1717: /* End of Thad Smith's implementation of modmult. */ ! 1718: ! 1719: #endif /* SMITH */ ! 1720: ! 1721: ! 1722: int countbits(unitptr r) ! 1723: /* Returns number of significant bits in r */ ! 1724: { ! 1725: int bits; ! 1726: short prec; ! 1727: register unit bitmask; ! 1728: init_bitsniffer(r,bitmask,prec,bits); ! 1729: return bits; ! 1730: } /* countbits */ ! 1731: ! 1732: ! 1733: char *copyright_notice(void) ! 1734: /* force linker to include copyright notice in the executable object image. */ ! 1735: { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */ ! 1736: ! 1737: ! 1738: int mp_modexp(register unitptr expout,register unitptr expin, ! 1739: register unitptr exponent,register unitptr modulus) ! 1740: /* ! 1741: * Russian peasant combined exponentiation/modulo algorithm. ! 1742: * Calls modmult instead of mult. ! 1743: * Computes: expout = (expin**exponent) mod modulus ! 1744: * WARNING: All the arguments must be less than the modulus! ! 1745: */ ! 1746: { ! 1747: int bits; ! 1748: short oldprecision; ! 1749: register unit bitmask; ! 1750: unit product[MAX_UNIT_PRECISION]; ! 1751: short eprec; ! 1752: ! 1753: #ifdef COUNTMULTS ! 1754: tally_modmults = 0; /* clear "number of modmults" counter */ ! 1755: tally_modsquares = 0; /* clear "number of modsquares" counter */ ! 1756: #endif /* COUNTMULTS */ ! 1757: mp_init(expout,1); ! 1758: if (testeq(exponent,0)) { ! 1759: if (testeq(expin,0)) ! 1760: return -1; /* 0 to the 0th power means return error */ ! 1761: return 0; /* otherwise, zero exponent means expout is 1 */ ! 1762: } ! 1763: if (testeq(modulus,0)) ! 1764: return -2; /* zero modulus means error */ ! 1765: #if SLOP_BITS > 0 /* if there's room for sign bits */ ! 1766: if (mp_tstminus(modulus)) ! 1767: return -2; /* negative modulus means error */ ! 1768: #endif /* SLOP_BITS > 0 */ ! 1769: if (mp_compare(expin,modulus) >= 0) ! 1770: return -3; /* if expin >= modulus, return error */ ! 1771: if (mp_compare(exponent,modulus) >= 0) ! 1772: return -4; /* if exponent >= modulus, return error */ ! 1773: ! 1774: oldprecision = global_precision; /* save global_precision */ ! 1775: /* set smallest optimum precision for this modulus */ ! 1776: set_precision(bits2units(countbits(modulus)+SLOP_BITS)); ! 1777: /* rescale all these registers to global_precision we just defined */ ! 1778: rescale(modulus,oldprecision,global_precision); ! 1779: rescale(expin,oldprecision,global_precision); ! 1780: rescale(exponent,oldprecision,global_precision); ! 1781: rescale(expout,oldprecision,global_precision); ! 1782: ! 1783: if (stage_modulus(modulus)) { ! 1784: set_precision(oldprecision); /* restore original precision */ ! 1785: return -5; /* unstageable modulus (STEWART algorithm) */ ! 1786: } ! 1787: ! 1788: /* normalize and compute number of bits in exponent first */ ! 1789: init_bitsniffer(exponent,bitmask,eprec,bits); ! 1790: ! 1791: /* We can "optimize out" the first modsquare and modmult: */ ! 1792: bits--; /* We know for sure at this point that bits>0 */ ! 1793: mp_move(expout,expin); /* expout = (1*1)*expin; */ ! 1794: bump_bitsniffer(exponent,bitmask); ! 1795: ! 1796: while (bits--) { ! 1797: poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */ ! 1798: #ifdef COUNTMULTS ! 1799: tally_modsquares++; /* bump "number of modsquares" counter */ ! 1800: #endif /* COUNTMULTS */ ! 1801: mp_modsquare(product,expout); ! 1802: if (sniff_bit(exponent,bitmask)) { ! 1803: mp_modmult(expout,product,expin); ! 1804: #ifdef COUNTMULTS ! 1805: tally_modmults++; /* bump "number of modmults" counter */ ! 1806: #endif /* COUNTMULTS */ ! 1807: } else { ! 1808: mp_move(expout,product); ! 1809: } ! 1810: bump_bitsniffer(exponent,bitmask); ! 1811: } /* while bits-- */ ! 1812: mp_burn(product); /* burn the evidence on the stack */ ! 1813: modmult_burn(); /* ask mp_modmult to also burn its own evidence */ ! 1814: ! 1815: #ifdef COUNTMULTS /* diagnostic analysis */ ! 1816: { ! 1817: long atomic_mults; ! 1818: unsigned int unitcount,totalmults; ! 1819: unitcount = bits2units(countbits(modulus)); ! 1820: /* calculation assumes modsquare takes as long as a modmult: */ ! 1821: atomic_mults = (long) tally_modmults * (unitcount * unitcount); ! 1822: atomic_mults += (long) tally_modsquares * (unitcount * unitcount); ! 1823: printf("%ld atomic mults for ",atomic_mults); ! 1824: printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n", ! 1825: tally_modsquares,tally_modmults, ! 1826: tally_modsquares+tally_modmults, ! 1827: countbits(modulus), unitcount); ! 1828: } ! 1829: #endif /* COUNTMULTS */ ! 1830: ! 1831: set_precision(oldprecision); /* restore original precision */ ! 1832: ! 1833: /* Do an explicit reference to the copyright notice so that the linker ! 1834: will be forced to include it in the executable object image... */ ! 1835: copyright_notice(); /* has no real effect at run time */ ! 1836: ! 1837: return 0; /* normal return */ ! 1838: } /* mp_modexp */ ! 1839: ! 1840: int mp_modexp_crt(unitptr expout, unitptr expin, ! 1841: unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u) ! 1842: /* ! 1843: * This is a faster modexp for moduli with a known factorisation into two ! 1844: * relatively prime factors p and q, and an input relatively prime to the ! 1845: * modulus, the Chinese Remainder Theorem to do the computation mod p and ! 1846: * mod q, and then combine the results. This relies on a number of ! 1847: * precomputed values, but does not actually require the modulus n or the ! 1848: * exponent e. ! 1849: * ! 1850: * expout = expin ^ e mod (p*q). ! 1851: * We form this by evaluating ! 1852: * p2 = (expin ^ e) mod p and ! 1853: * q2 = (expin ^ e) mod q ! 1854: * and then combining the two by the CRT. ! 1855: * ! 1856: * Two optimisations of this are possible. First, we can reduce expin ! 1857: * modulo p and q before starting. ! 1858: * ! 1859: * Second, since we know the factorisation of p and q (trivially derived ! 1860: * from the factorisation of n = p*q), and expin is relatively prime to ! 1861: * both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m), ! 1862: * to throw away multiples of phi(p) or phi(q) in e. ! 1863: * Letting ep = e mod phi(p) and ! 1864: * eq = e mod phi(q) ! 1865: * then combining these two speedups, we only need to evaluate ! 1866: * p2 = ((expin mod p) ^ ep) mod p and ! 1867: * q2 = ((expin mod q) ^ eq) mod q. ! 1868: * ! 1869: * Now we need to apply the CRT. Starting with ! 1870: * expout = p2 (mod p) and ! 1871: * expout = q2 (mod q) ! 1872: * we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p, ! 1873: * then 0 <= expout < p*q for some 0 <= k < q. Since we want expout = q2 ! 1874: * (mod q), then p*k = q2-p2 (mod q). Since p and q are relatively prime, ! 1875: * p has a multiplicative inverse u mod q. In other words, u = 1/p (mod q). ! 1876: * ! 1877: * Multiplying by u on both sides gives k = u*(q2-p2) (mod q). ! 1878: * Since we want 0 <= k < q, we can thus find k as ! 1879: * k = (u * (q2-p2)) mod q. ! 1880: * ! 1881: * Once we have k, evaluating p2 + p * k is easy, and ! 1882: * that gives us the result. ! 1883: * ! 1884: * In the detailed implementation, there is a temporary, temp, used to ! 1885: * hold intermediate results, p2 is held in expout, and q2 is used as a ! 1886: * temporary in the final step when it is no longer needed. With that, ! 1887: * you should be able to understand the code below. ! 1888: */ ! 1889: { ! 1890: unit q2[MAX_UNIT_PRECISION]; ! 1891: unit temp[MAX_UNIT_PRECISION]; ! 1892: int status; ! 1893: ! 1894: /* First, compiute p2 (physically held in M) */ ! 1895: ! 1896: /* p2 = [ (expin mod p) ^ ep ] mod p */ ! 1897: mp_mod(temp,expin,p); /* temp = expin mod p */ ! 1898: status = mp_modexp(expout,temp,ep,p); ! 1899: if (status < 0) { /* mp_modexp returned an error. */ ! 1900: mp_init(expout,1); ! 1901: return status; /* error return */ ! 1902: } ! 1903: ! 1904: /* And the same thing for q2 */ ! 1905: ! 1906: /* q2 = [ (expin mod q) ^ eq ] mod q */ ! 1907: mp_mod(temp,expin,q); /* temp = expin mod q */ ! 1908: status = mp_modexp(q2,temp,eq,q); ! 1909: if (status < 0) { /* mp_modexp returned an error. */ ! 1910: mp_init(expout,1); ! 1911: return status; /* error return */ ! 1912: } ! 1913: ! 1914: /* Now use the multiplicative inverse u to glue together the ! 1915: two halves. ! 1916: */ ! 1917: #if 0 ! 1918: /* This optimisation is useful if you commonly get small results, ! 1919: but for random results and large q, the odds of (1/q) of it ! 1920: being useful do not warrant the test. ! 1921: */ ! 1922: if (mp_compare(expout,q2) != 0) { ! 1923: #endif ! 1924: /* Find q2-p2 mod q */ ! 1925: if (mp_sub(q2,expout)) /* if the result went negative */ ! 1926: mp_add(q2,q); /* add q to q2 */ ! 1927: ! 1928: /* expout = p2 + ( p * [(q2*u) mod q] ) */ ! 1929: mp_mult(temp,q2,u); /* q2*u */ ! 1930: mp_mod(q2,temp,q); /* (q2*u) mod q */ ! 1931: mp_mult(temp,p,q2); /* p * [(q2*u) mod q] */ ! 1932: mp_add(expout,temp); /* expout = p2 + p * [...] */ ! 1933: #if 0 ! 1934: } ! 1935: #endif ! 1936: ! 1937: mp_burn(q2); /* burn the evidence on the stack...*/ ! 1938: mp_burn(temp); ! 1939: return 0; /* normal return */ ! 1940: } /* mp_modexp_crt */ ! 1941: ! 1942: ! 1943: /****************** end of MPI library ****************************/
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.