--- pgp/src/mpilib.c 2018/04/24 16:37:52 1.1 +++ pgp/src/mpilib.c 2018/04/24 16:40:49 1.1.1.5 @@ -7,22 +7,23 @@ Boulder, CO 80304 (303) 541-0140 - (c) Copyright 1986-92 by Philip Zimmermann. All rights reserved. - The author assumes no liability for damages resulting from the use - of this software, even if the damage results from defects in this - software. No warranty is expressed or implied. The use of this - cryptographic software for developing weapon systems is expressly + (c) Copyright 1986-1994 by Philip Zimmermann. All rights reserved. + The author assumes no liability for damages resulting from the use + of this software, even if the damage results from defects in this + software. No warranty is expressed or implied. The use of this + cryptographic software for developing weapon systems is expressly forbidden. - These routines implement all of the multiprecision arithmetic - necessary for number-theoretic cryptographic algorithms such as - ElGamal, Diffie-Hellman, Rabin, or factoring studies for large - composite numbers, as well as Rivest-Shamir-Adleman (RSA) public + + These routines implement all of the multiprecision arithmetic + necessary for number-theoretic cryptographic algorithms such as + ElGamal, Diffie-Hellman, Rabin, or factoring studies for large + composite numbers, as well as Rivest-Shamir-Adleman (RSA) public key cryptography. - Although originally developed in Microsoft C for the IBM PC, this code - contains few machine dependancies. It assumes 2's complement - arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines, + Although originally developed in Microsoft C for the IBM PC, this code + contains few machine dependencies. It assumes 2's complement + arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines, lowbyte-highbyte order or highbyte-lowbyte order. This version has been converted to ANSI C. @@ -47,13 +48,22 @@ Modified 8 Apr 92 - HAJK Implement new VAX/VMS primitive support. + Modified 30 Sep 92 -Castor Fu + Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long) + + Modified 28 Nov 92 - Thad Smith + Added Smith modmult, generalized non-portable support. */ /* #define COUNTMULTS */ /* count modmults for performance studies */ #ifdef DEBUG #ifdef MSDOS +#ifdef __GO32__ /* DJGPP */ +#include +#else #include +#endif /* __GO32__ */ #define poll_for_break() {while (kbhit()) getch();} #endif /* MSDOS */ #endif /* DEBUG */ @@ -64,6 +74,52 @@ #include "mpilib.h" +#ifdef mp_smula +#ifdef mp_smul + Error: Both mp_smula and mp_smul cannot be defined. +#else +#define mp_smul mp_smula +#endif +#endif + +/* set macros for MULTUNIT */ +#ifdef MUNIT8 +#define MULTUNITSIZE 8 +typedef unsigned char MULTUNIT; +#ifdef UNIT8 +#define MULTUNIT_SIZE_SAME +#endif +#else /* not MUNIT8 */ +#ifdef MUNIT32 +#define MULTUNITSIZE 32 +typedef unsigned long MULTUNIT; +#ifdef UNIT32 +#define MULTUNIT_SIZE_SAME +#else +/* #error is not portable, this has the same effect */ +#include "UNITSIZE cannot be smaller than MULTUNITSIZE" +#endif +#else /* assume MUNIT16 */ +#define MULTUNITSIZE 16 +typedef unsigned short MULTUNIT; +#ifdef UNIT16 +#define MULTUNIT_SIZE_SAME +#endif /* UNIT16 */ +#ifdef UNIT8 +#include "UNITSIZE cannot be smaller than MULTUNITSIZE" +#endif /* UNIT8 */ +#endif /* MUNIT32 */ +#endif /* MUNIT8 */ + +#define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */ +#define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1)) +#define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1)) +#define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE) + + +void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier); +void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier); + short global_precision = 0; /* units of precision for all routines */ /* global_precision is the unit precision last set by set_precision. Initially, set_precision() should be called to define global_precision @@ -71,297 +127,279 @@ short global_precision = 0; /* units of i.e.: set_precision(MAX_UNIT_PRECISION); */ -#ifdef PORTABLE /*************** multiprecision library primitives ****************/ /* The following portable C primitives should be recoded in assembly. - The equivalent assembly primitives are externally defined under - different names, unless PORTABLE is defined. See the header file - "mpilib.h" for further details. - - The carry bit mechanism we use for these portable primitives - will only work if the size of unit is smaller than the size of - a long integer. In most cases, this means UNITSIZE must be - less than 32 bits -- if you use the C portable primitives. + The entry point name should be defined, in "mpilib.h" to the external + entry point name. If undefined, the C version will be used. */ typedef unsigned long int ulint; -#define carrybit ((ulint) 1 << UNITSIZE) -/* ...assumes sizeof(unit) < sizeof(unsigned long) */ +#ifndef mp_addc boolean mp_addc (register unitptr r1,register unitptr r2,register boolean carry) /* multiprecision add with carry r2 to r1, result in r1 */ /* carry is incoming carry flag-- value should be 0 or 1 */ -{ register ulint x; /* won't work if sizeof(unit)==sizeof(long) */ +{ + register unit x; short precision; /* number of units to add */ precision = global_precision; make_lsbptr(r1,precision); make_lsbptr(r2,precision); - while (precision--) - { x = (ulint) *r1 + (ulint) *post_higherunit(r2) + (ulint) carry; + while (precision--) { + if (carry) { + x = *r1 + *r2 + 1; + carry = (*r2 >= (unit)(~ *r1)); + } else { + x = *r1 + *r2; + carry = (x < *r1) ; + } + post_higherunit(r2); *post_higherunit(r1) = x; - carry = ((x & carrybit) != 0L); } - return(carry); /* return the final carry flag bit */ -} /* mp_addc */ - + return carry; /* return the final carry flag bit */ +} /* mp_addc */ +#endif /* mp_addc */ +#ifndef mp_subb boolean mp_subb (register unitptr r1,register unitptr r2,register boolean borrow) /* multiprecision subtract with borrow, r2 from r1, result in r1 */ /* borrow is incoming borrow flag-- value should be 0 or 1 */ -{ register ulint x; /* won't work if sizeof(unit)==sizeof(long) */ +{ + register unit x; short precision; /* number of units to subtract */ precision = global_precision; make_lsbptr(r1,precision); make_lsbptr(r2,precision); - while (precision--) - { x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow; + while (precision--) { + if (borrow) { + x = *r1 - *r2 - 1; + borrow = (*r1 <= *r2); + } else { + x = *r1 - *r2; + borrow = (*r1 < *r2); + } + post_higherunit(r2); *post_higherunit(r1) = x; - borrow = ((x & carrybit) != 0L); } - return(borrow); /* return the final carry/borrow flag bit */ -} /* mp_subb */ - -#undef carrybit - + return borrow; /* return the final carry/borrow flag bit */ +} /* mp_subb */ +#endif /* mp_subb */ +#ifndef mp_rotate_left boolean mp_rotate_left(register unitptr r1,register boolean carry) /* multiprecision rotate left 1 bit with carry, result in r1. */ /* carry is incoming carry flag-- value should be 0 or 1 */ -{ register short precision; /* number of units to rotate */ - register boolean nextcarry; +{ + register int precision; /* number of units to rotate */ + unsigned int mcarry = carry, nextcarry; /* int is supposed to be + * the efficient size for ops*/ precision = global_precision; make_lsbptr(r1,precision); - while (precision--) - { + while (precision--) { nextcarry = (((signedunit) *r1) < 0); - /* nextcarry = ((*r1 & uppermostbit) != 0); */ - *r1 <<= 1 ; - if (carry) *r1 |= 1; - carry = nextcarry; + *r1 = (*r1 << 1) | mcarry; + mcarry = nextcarry; pre_higherunit(r1); } - return(nextcarry); /* return the final carry flag bit */ -} /* mp_rotate_left */ + return nextcarry; /* return the final carry flag bit */ +} /* mp_rotate_left */ +#endif /* mp_rotate_left */ /************** end of primitives that should be in assembly *************/ -#endif /* PORTABLE */ -/* The mp_shift_right_bits function is not called in any time-critical - situations in public-key cryptographic functions, so it doesn't +/* The mp_shift_right_bits function is not called in any time-critical + situations in public-key cryptographic functions, so it doesn't need to be coded in assembly language. */ void mp_shift_right_bits(register unitptr r1,register short bits) - /* multiprecision shift right bits, result in r1. - bits is how many bits to shift, must be < UNITSIZE. + /* multiprecision shift right bits, result in r1. + bits is how many bits to shift, must be <= UNITSIZE. */ -{ register short precision; /* number of units to shift */ +{ + register short precision; /* number of units to shift */ register unit carry,nextcarry,bitmask; register short unbits; if (bits==0) return; /* shift zero bits is a no-op */ carry = 0; bitmask = power_of_2(bits)-1; - unbits = UNITSIZE-bits; /* shift bits must be < UNITSIZE */ + unbits = UNITSIZE-bits; /* shift bits must be <= UNITSIZE */ precision = global_precision; make_msbptr(r1,precision); - while (precision--) - { nextcarry = *r1 & bitmask; - *r1 >>= bits; - *r1 |= carry << unbits; - carry = nextcarry; - pre_lowerunit(r1); + if (bits == UNITSIZE) { + while (precision--) { + nextcarry = *r1; + *r1 = carry; + carry = nextcarry; + pre_lowerunit(r1); + } + } else { + while (precision--) { + nextcarry = *r1 & bitmask; + *r1 >>= bits; + *r1 |= carry << unbits; + carry = nextcarry; + pre_lowerunit(r1); + } } -} /* mp_shift_right_bits */ +} /* mp_shift_right_bits */ -#ifndef VMS - +#ifndef mp_compare short mp_compare(register unitptr r1,register unitptr r2) -/* Compares multiprecision integers *r1, *r2, and returns: - -1 iff *r1 < *r2 - 0 iff *r1 == *r2 - +1 iff *r1 > *r2 -*/ -{ register short precision; /* number of units to compare */ +/* + * Compares multiprecision integers *r1, *r2, and returns: + * -1 iff *r1 < *r2 + * 0 iff *r1 == *r2 + * +1 iff *r1 > *r2 + */ +{ + register short precision; /* number of units to compare */ precision = global_precision; make_msbptr(r1,precision); make_msbptr(r2,precision); - do - { if (*r1 < *r2) - return(-1); - if (*post_lowerunit(r1) > *post_lowerunit(r2)) - return(1); - } while (--precision); - return(0); /* *r1 == *r2 */ -} /* mp_compare */ + do { + if (*r1 < *r2) + return -1; + if (*post_lowerunit(r1) > *post_lowerunit(r2)) + return 1; + } while (--precision); + return 0; /* *r1 == *r2 */ +} /* mp_compare */ +#endif /* mp_compare */ -#endif /* VMS */ boolean mp_inc(register unitptr r) - /* Increment multiprecision integer r. */ -{ register short precision; +/* Increment multiprecision integer r. */ +{ + register short precision; precision = global_precision; make_lsbptr(r,precision); - do - { if ( ++(*r) ) return(0); /* no carry */ - post_higherunit(r); + do { + if ( ++(*r) ) return 0; /* no carry */ + post_higherunit(r); } while (--precision); - return(1); /* return carry set */ -} /* mp_inc */ + return 1; /* return carry set */ +} /* mp_inc */ boolean mp_dec(register unitptr r) - /* Decrement multiprecision integer r. */ -{ register short precision; +/* Decrement multiprecision integer r. */ +{ + register short precision; precision = global_precision; make_lsbptr(r,precision); - do - { if ( (signedunit) (--(*r)) != (signedunit) -1 ) - return(0); /* no borrow */ - post_higherunit(r); + do { + if ( (signedunit) (--(*r)) != (signedunit) -1 ) + return 0; /* no borrow */ + post_higherunit(r); } while (--precision); - return(1); /* return borrow set */ -} /* mp_dec */ + return 1; /* return borrow set */ +} /* mp_dec */ void mp_neg(register unitptr r) /* Compute 2's complement, the arithmetic negative, of r */ -{ register short precision; /* number of units to negate */ +{ + register short precision; /* number of units to negate */ precision = global_precision; mp_dec(r); /* 2's complement is 1's complement plus 1 */ - do /* now do 1's complement */ - { *r = ~(*r); - r++; + do { /* now do 1's complement */ + *r = ~(*r); + r++; } while (--precision); -} /* mp_neg */ - -#ifndef VAXC /* Replaced by a macro under VAX C */ +} /* mp_neg */ +#ifndef mp_move void mp_move(register unitptr dst,register unitptr src) -{ register short precision; /* number of units to move */ +{ + register short precision; /* number of units to move */ precision = global_precision; do { *dst++ = *src++; } while (--precision); -} /* mp_move */ +} /* mp_move */ +#endif /* mp_move */ -#endif /* VAXC */ void mp_init(register unitptr r, word16 value) /* Init multiprecision register r with short value. */ { /* Note that mp_init doesn't extend sign bit for >32767 */ - register short precision; /* number of units to init */ - precision = global_precision; - make_lsbptr(r,precision); - *post_higherunit(r) = value; precision--; + + unitfill0( r, global_precision); + make_lsbptr(r,global_precision); + *post_higherunit(r) = value; #ifdef UNIT8 - *post_higherunit(r) = value >> UNITSIZE; precision--; + *post_higherunit(r) = value >> UNITSIZE; #endif -#ifdef VAXC - - unitfill0( r, precision); /* Use character insts. on a VAX */ - -#else /* VAXC */ - - while (precision--) - *post_higherunit(r) = 0; - -#endif /* VAXC */ - -} /* mp_init */ +} /* mp_init */ short significance(register unitptr r) /* Returns number of significant units in r */ -{ register short precision; +{ + register short precision; precision = global_precision; make_msbptr(r,precision); - do - { if (*post_lowerunit(r)) - return(precision); + do { + if (*post_lowerunit(r)) + return precision; } while (--precision); - return(precision); -} /* significance */ - + return precision; +} /* significance */ -#ifndef VAXC /* Replaced by a macro under VAX C */ +#ifndef unitfill0 void unitfill0(unitptr r,word16 unitcount) /* Zero-fill the unit buffer r. */ -{ while (unitcount--) *r++ = 0; -} /* unitfill0 */ - -#endif /* VAXC */ - -/* The macro normalize(r,precision) "normalizes" a multiprecision integer - by adjusting r and precision to new values. For Motorola-style processors - (MSB-first), r is a pointer to the MSB of the register, and must - be adjusted to point to the first nonzero unit. For Intel/VAX-style - (LSB-first) processors, r is a pointer to the LSB of the register, - and must be left unchanged. The precision counter is always adjusted, - regardless of processor type. In the case of precision = 0, - r becomes undefined. -*/ - - -/* The macro rescale(r,current_precision,new_precision) rescales - a multiprecision integer by adjusting r and its precision to new values. - It can be used to reverse the effects of the normalize - routine given above. See the comments in normalize concerning - Intel vs. Motorola LSB/MSB conventions. - WARNING: You can only safely call rescale on registers that - you have previously normalized with the above normalize routine, - or are known to be big enough for the new precision. You may - specify a new precision that is smaller than the current precision. - You must be careful not to specify a new_precision value that is - too big, or which adjusts the r pointer out of range. -*/ - - -/* The "bit sniffer" macros all begin sniffing at the MSB. - They are used internally by all the various multiply, divide, - modulo, exponentiation, and square root functions. -*/ +{ + while (unitcount--) *r++ = 0; +} /* unitfill0 */ +#endif /* unitfill0 */ int mp_udiv(register unitptr remainder,register unitptr quotient, register unitptr dividend,register unitptr divisor) /* Unsigned divide, treats both operands as positive. */ -{ int bits; +{ + int bits; short dprec; register unit bitmask; + if (testeq(divisor,0)) - return(-1); /* zero divisor means divide error */ + return -1; /* zero divisor means divide error */ mp_init0(remainder); mp_init0(quotient); /* normalize and compute number of bits in dividend first */ init_bitsniffer(dividend,bitmask,dprec,bits); /* rescale quotient to same precision (dprec) as dividend */ rescale(quotient,global_precision,dprec); - make_msbptr(quotient,dprec); + make_msbptr(quotient,dprec); - while (bits--) - { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); - if (mp_compare(remainder,divisor) >= 0) - { mp_sub(remainder,divisor); + while (bits--) { + mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); + if (mp_compare(remainder,divisor) >= 0) { + mp_sub(remainder,divisor); stuff_bit(quotient,bitmask); } - bump_2bitsniffers(dividend,quotient,bitmask); + bump_2bitsniffers(dividend,quotient,bitmask); } - return(0); + return 0; } /* mp_udiv */ +#ifdef UPTON_OR_SMITH #define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */ int mp_recip(register unitptr quotient,register unitptr divisor) /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */ -{ int bits; +{ + int bits; short qprec; register unit bitmask; unit remainder[MAX_UNIT_PRECISION]; if (testeq(divisor,0)) - return(-1); /* zero divisor means divide error */ + return -1; /* zero divisor means divide error */ mp_init0(remainder); mp_init0(quotient); @@ -372,120 +410,126 @@ int mp_recip(register unitptr quotient,r mp_setbit(remainder,(bits-RECIPMARGIN)-1); /* rescale quotient to precision of divisor + RECIPMARGIN bits */ rescale(quotient,global_precision,qprec); - make_msbptr(quotient,qprec); + make_msbptr(quotient,qprec); - while (bits--) - { mp_shift_left(remainder); - if (mp_compare(remainder,divisor) >= 0) - { mp_sub(remainder,divisor); + while (bits--) { + mp_shift_left(remainder); + if (mp_compare(remainder,divisor) >= 0) { + mp_sub(remainder,divisor); stuff_bit(quotient,bitmask); } bump_bitsniffer(quotient,bitmask); } mp_init0(remainder); /* burn sensitive data left on stack */ - return(0); + return 0; } /* mp_recip */ +#endif /* UPTON_OR_SMITH */ int mp_div(register unitptr remainder,register unitptr quotient, register unitptr dividend,register unitptr divisor) /* Signed divide, either or both operands may be negative. */ -{ boolean dvdsign,dsign; +{ + boolean dvdsign,dsign; int status; - dvdsign = (mp_tstminus(dividend)!=0); - dsign = (mp_tstminus(divisor)!=0); + dvdsign = (boolean)(mp_tstminus(dividend)!=0); + dsign = (boolean)(mp_tstminus(divisor)!=0); if (dvdsign) mp_neg(dividend); if (dsign) mp_neg(divisor); status = mp_udiv(remainder,quotient,dividend,divisor); if (dvdsign) mp_neg(dividend); /* repair caller's dividend */ if (dsign) mp_neg(divisor); /* repair caller's divisor */ - if (status<0) return(status); /* divide error? */ + if (status<0) return status; /* divide error? */ if (dvdsign) mp_neg(remainder); if (dvdsign ^ dsign) mp_neg(quotient); - return(status); + return status; } /* mp_div */ word16 mp_shortdiv(register unitptr quotient, register unitptr dividend,register word16 divisor) -/* This function does a fast divide and mod on a multiprecision dividend - using a short integer divisor returning a short integer remainder. - This is an unsigned divide. It treats both operands as positive. - It is used mainly for faster printing of large numbers in base 10. -*/ -{ int bits; +/* + * This function does a fast divide and mod on a multiprecision dividend + * using a short integer divisor returning a short integer remainder. + * This is an unsigned divide. It treats both operands as positive. + * It is used mainly for faster printing of large numbers in base 10. + */ +{ + int bits; short dprec; register unit bitmask; register word16 remainder; if (!divisor) /* if divisor == 0 */ - return(-1); /* zero divisor means divide error */ + return -1; /* zero divisor means divide error */ remainder=0; mp_init0(quotient); /* normalize and compute number of bits in dividend first */ init_bitsniffer(dividend,bitmask,dprec,bits); /* rescale quotient to same precision (dprec) as dividend */ rescale(quotient,global_precision,dprec); - make_msbptr(quotient,dprec); + make_msbptr(quotient,dprec); - while (bits--) - { remainder <<= 1; + while (bits--) { + remainder <<= 1; if (sniff_bit(dividend,bitmask)) remainder++; - if (remainder >= divisor) - { remainder -= divisor; + if (remainder >= divisor) { + remainder -= divisor; stuff_bit(quotient,bitmask); } - bump_2bitsniffers(dividend,quotient,bitmask); - } - return(remainder); + bump_2bitsniffers(dividend,quotient,bitmask); + } + return remainder; } /* mp_shortdiv */ int mp_mod(register unitptr remainder, register unitptr dividend,register unitptr divisor) - /* Unsigned divide, treats both operands as positive. */ -{ int bits; +/* Unsigned divide, treats both operands as positive. */ +{ + int bits; short dprec; register unit bitmask; if (testeq(divisor,0)) - return(-1); /* zero divisor means divide error */ + return -1; /* zero divisor means divide error */ mp_init0(remainder); /* normalize and compute number of bits in dividend first */ init_bitsniffer(dividend,bitmask,dprec,bits); - while (bits--) - { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); + while (bits--) { + mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); msub(remainder,divisor); bump_bitsniffer(dividend,bitmask); - } - return(0); + } + return 0; } /* mp_mod */ word16 mp_shortmod(register unitptr dividend,register word16 divisor) -/* This function does a fast mod operation on a multprecision dividend - using a short integer modulus returning a short integer remainder. - This is an unsigned divide. It treats both operands as positive. - It is used mainly for fast sieve searches for large primes. -*/ +/* + * This function does a fast mod operation on a multiprecision dividend + * using a short integer modulus returning a short integer remainder. + * This is an unsigned divide. It treats both operands as positive. + * It is used mainly for fast sieve searches for large primes. + */ { int bits; short dprec; register unit bitmask; register word16 remainder; if (!divisor) /* if divisor == 0 */ - return(-1); /* zero divisor means divide error */ + return -1; /* zero divisor means divide error */ remainder=0; /* normalize and compute number of bits in dividend first */ init_bitsniffer(dividend,bitmask,dprec,bits); - while (bits--) - { remainder <<= 1; + while (bits--) { + remainder <<= 1; if (sniff_bit(dividend,bitmask)) remainder++; if (remainder >= divisor) remainder -= divisor; bump_bitsniffer(dividend,bitmask); - } - return(remainder); + } + return remainder; } /* mp_shortmod */ @@ -495,59 +539,67 @@ word16 mp_shortmod(register unitptr divi int mp_mult(register unitptr prod, register unitptr multiplicand, register unitptr multiplier) - /* Computes multiprecision prod = multiplicand * multiplier */ -{ /* Uses interleaved comb multiply algorithm. - This improved multiply more than twice as fast as a Russian - peasant multiply, because it does a lot fewer shifts. - Must have global_precision set to the size of the multiplicand - plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum - of the lengths of the multiplier and multiplicand. - - BUG ALERT: Unfortunately, this code has a bug. It fails for - some numbers. One such example: - x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 - x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D - 52C8 CDC7 AFB3 61C8 243C 741B - --which is obviously wrong. The answer should be: - x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A - C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 - We'll have to fix this some day. In the meantime, we'll - just have the compiler skip over this code. - */ +/* + * Computes multiprecision prod = multiplicand * multiplier + * + * Uses interleaved comb multiply algorithm. + * This improved multiply more than twice as fast as a Russian + * peasant multiply, because it does a lot fewer shifts. + * Must have global_precision set to the size of the multiplicand + * plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum + * of the lengths of the multiplier and multiplicand. + * + * BUG ALERT: Unfortunately, this code has a bug. It fails for + * some numbers. One such example: + * x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 + * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D + * 52C8 CDC7 AFB3 61C8 243C 741B + * --which is obviously wrong. The answer should be: + * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A + * C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 + * We'll have to fix this some day. In the meantime, we'll + * just have the compiler skip over this code. + * + * BUG NOTE: Possibly fixed. Needs testing. + */ +{ int bits; register unit bitmask; unitptr product, mplier, temp; short mprec,mprec2; unit mplicand[MAX_UNIT_PRECISION]; - mp_init(prod,0); /* better clear full width--double precision */ + + /* better clear full width--double precision */ + mp_init(prod+tohigher(global_precision),0); + if (testeq(multiplicand,0)) - return(0); /* zero multiplicand means zero product */ + return 0; /* zero multiplicand means zero product */ mp_move(mplicand,multiplicand); /* save it from damage */ normalize(multiplier,mprec); if (!mprec) - return(0); + return 0; make_lsbptr(multiplier,mprec); bitmask = 1; /* start scan at LSB of multiplier */ - do /* UNITSIZE times */ - { /* do for bits 0-15 */ + do { /* UNITSIZE times */ + /* do for bits 0-15 */ product = prod; mplier = multiplier; mprec2 = mprec; - while (mprec2--) /* do for each word in multiplier */ - { - if (sniff_bit(mplier,bitmask)) - { if (mp_addc(product,multiplicand,0)) /* ripple carry */ + while (mprec2--) { /* do for each word in multiplier */ + + if (sniff_bit(mplier,bitmask)) { + if (mp_addc(product,multiplicand,0)) /* ripple carry */ { /* After 1st time thru, this is rarely encountered. */ temp = msbptr(product,global_precision); pre_higherunit(temp); /* temp now points to LSU of carry region. */ unmake_lsbptr(temp,global_precision); mp_inc(temp); - } /* ripple carry */ + } /* ripple carry */ } pre_higherunit(mplier); pre_higherunit(product); @@ -560,62 +612,67 @@ int mp_mult(register unitptr prod, mp_move(multiplicand,mplicand); /* recover */ - return(0); /* normal return */ -} /* mp_mult */ + return 0; /* normal return */ +} /* mp_mult */ #endif /* COMB_MULT */ /* Because the "comb" multiply has a bug, we will use the slower - Russian peasant multiply instead. Fortunately, the mp_mult + Russian peasant multiply instead. Fortunately, the mp_mult function is not called from any time-critical code. */ int mp_mult(register unitptr prod, register unitptr multiplicand,register unitptr multiplier) - /* Computes multiprecision prod = multiplicand * multiplier */ -{ /* Uses "Russian peasant" multiply algorithm. */ +/* + * Computes multiprecision prod = multiplicand * multiplier + * Uses "Russian peasant" multiply algorithm. + */ +{ int bits; register unit bitmask; short mprec; mp_init(prod,0); if (testeq(multiplicand,0)) - return(0); /* zero multiplicand means zero product */ + return 0; /* zero multiplicand means zero product */ /* normalize and compute number of bits in multiplier first */ init_bitsniffer(multiplier,bitmask,mprec,bits); - while (bits--) - { mp_shift_left(prod); + while (bits--) { + mp_shift_left(prod); if (sniff_bit(multiplier,bitmask)) mp_add(prod,multiplicand); bump_bitsniffer(multiplier,bitmask); } - return(0); -} /* mp_mult */ + return 0; +} /* mp_mult */ -/* mp_modmult computes a multiprecision multiply combined with a - modulo operation. This is the most time-critical function in - this multiprecision arithmetic library for performing modulo - exponentiation. We experimented with different versions of modmult, - depending on the machine architecture and performance requirements. - We will either use a Russian Peasant modmult (peasant_modmult), - Charlie Merritt's modmult (merritt_modmult) or Jimmy Upton's - modmult (upton_modmult). On machines with a hardware atomic - multiply instruction, Upton's modmult is fastest, which depends - on an assembly subroutine to speed up the hardware multiply logic. - If the machine lacks a fast hardware multiply, Merritt's modmult - is preferred, which doesn't call any assembly multiply routine. - We use the alias names mp_modmult, stage_modulus, and modmult_burn - for the corresponding true names, which depend on what flavor of - modmult we are using. - - Before making the first call to mp_modmult, you must set up the - modulus-dependant precomputated tables by calling stage_modulus. - After making all the calls to mp_modmult, you call modmult_burn to - erase the tables created by stage_modulus that were left in memory. -*/ +/* + * mp_modmult computes a multiprecision multiply combined with a + * modulo operation. This is the most time-critical function in + * this multiprecision arithmetic library for performing modulo + * exponentiation. We experimented with different versions of modmult, + * depending on the machine architecture and performance requirements. + * We will either use a Russian Peasant modmult (peasant_modmult), + * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's + * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult). + * On machines with a hardware atomic multiply instruction, + * Smith's modmult is fastest. It can utilize assembly subroutines to + * speed up the hardware multiply logic and trial quotient calculation. + * If the machine lacks a fast hardware multiply, Merritt's modmult + * is preferred, which doesn't call any assembly multiply routine. + * We use the alias names mp_modmult, stage_modulus, and modmult_burn + * for the corresponding true names, which depend on what flavor of + * modmult we are using. + * + * Before making the first call to mp_modmult, you must set up the + * modulus-dependant precomputated tables by calling stage_modulus. + * After making all the calls to mp_modmult, you call modmult_burn to + * erase the tables created by stage_modulus that were left in memory. + */ #ifdef COUNTMULTS /* "number of modmults" counters, used for performance studies. */ @@ -627,59 +684,64 @@ static unsigned int tally_modsquares = 0 #ifdef PEASANT /* Conventional Russian peasant multiply with modulo algorithm. */ -static unitptr modulus = 0; /* used only by mp_modmult */ +static unitptr pmodulus = 0; /* used only by mp_modmult */ int stage_peasant_modulus(unitptr n) -/* Must pass modulus to stage_modulus before calling modmult. - Assumes that global_precision has already been adjusted to the - size of the modulus, plus SLOP_BITS. -*/ +/* + * Must pass modulus to stage_modulus before calling modmult. + * Assumes that global_precision has already been adjusted to the + * size of the modulus, plus SLOP_BITS. + */ { /* For this simple version of modmult, just copy unit pointer. */ - modulus = n; - return(0); /* normal return */ -} /* stage_peasant_modulus */ + pmodulus = n; + return 0; /* normal return */ +} /* stage_peasant_modulus */ int peasant_modmult(register unitptr prod, unitptr multiplicand,register unitptr multiplier) -{ /* "Russian peasant" multiply algorithm, combined with a modulo - operation. This is a simple naive replacement for Merritt's - faster modmult algorithm. References global unitptr "modulus". - Computes: prod = (multiplicand*multiplier) mod modulus - WARNING: All the arguments must be less than the modulus! - */ +/* "Russian peasant" multiply algorithm, combined with a modulo + * operation. This is a simple naive replacement for Merritt's + * faster modmult algorithm. References global unitptr "modulus". + * Computes: prod = (multiplicand*multiplier) mod modulus + * WARNING: All the arguments must be less than the modulus! + */ +{ int bits; register unit bitmask; short mprec; mp_init(prod,0); /* if (testeq(multiplicand,0)) - return(0); */ /* zero multiplicand means zero product */ + return 0; */ /* zero multiplicand means zero product */ /* normalize and compute number of bits in multiplier first */ init_bitsniffer(multiplier,bitmask,mprec,bits); - while (bits--) - { mp_shift_left(prod); - msub(prod,modulus); /* turns mult into modmult */ - if (sniff_bit(multiplier,bitmask)) - { mp_add(prod,multiplicand); - msub(prod,modulus); /* turns mult into modmult */ + while (bits--) { + mp_shift_left(prod); + msub(prod,pmodulus); /* turns mult into modmult */ + if (sniff_bit(multiplier,bitmask)) { + mp_add(prod,multiplicand); + msub(prod,pmodulus); /* turns mult into modmult */ } bump_bitsniffer(multiplier,bitmask); } - return(0); -} /* peasant_modmult */ + return 0; +} /* peasant_modmult */ -/* If we are using a version of mp_modmult that uses internal tables - in memory, we have to call modmult_burn() at the end of mp_modexp. - This is so that no sensitive data is left in memory after the program - exits. The Russian peasant method doesn't use any such tables. -*/ -static void peasant_burn(void) -/* Alias for modmult_burn, called only from mp_modexp(). Destroys - internal modmult tables. This version does nothing because no - tables are used by the Russian peasant modmult. */ -{ } /* peasant_burn */ +/* + * If we are using a version of mp_modmult that uses internal tables + * in memory, we have to call modmult_burn() at the end of mp_modexp. + * This is so that no sensitive data is left in memory after the program + * exits. The Russian peasant method doesn't use any such tables. + */ +void peasant_burn(void) +/* + * Alias for modmult_burn, called only from mp_modexp(). Destroys + * internal modmult tables. This version does nothing because no + * tables are used by the Russian peasant modmult. + */ +{ } /* peasant_burn */ #endif /* PEASANT */ @@ -689,10 +751,11 @@ static void peasant_burn(void) /* This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ. Also refined by Zhahai Stewart to reduce the number of subtracts. - It performs a multiply combined with a modulo operation, without + Modified by Raymond Brand to reduce the number of SLOP_BITS by 1. + It performs a multiply combined with a modulo operation, without going into "double precision". It is faster than the Russian peasant - method, and still works well on machines that lack a fast hardware - multiply instruction. + method, and still works well on machines that lack a fast hardware + multiply instruction. */ /* The following support functions, macros, and data structures @@ -700,7 +763,8 @@ static void peasant_burn(void) static void mp_lshift_unit(register unitptr r1) /* Shift r1 1 whole unit to the left. Used only by modmult function. */ -{ register short precision; +{ + register short precision; register unitptr r2; precision = global_precision; make_msbptr(r1,precision); @@ -708,65 +772,74 @@ static void mp_lshift_unit(register unit while (--precision) *post_lowerunit(r1) = *pre_lowerunit(r2); *r1 = 0; -} /* mp_lshift_unit */ +} /* mp_lshift_unit */ /* moduli_buf contains shifted images of the modulus, set by stage_modulus */ -static unit moduli_buf[UNITSIZE][MAX_UNIT_PRECISION] = {0}; -static unitptr moduli[UNITSIZE+1] = /* contains pointers into moduli_buf */ -{ 0 - ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0], - &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0], &moduli_buf[ 7][0] +static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0}; +static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */ +{ 0, moduli_buf[ 0], moduli_buf[ 1], moduli_buf[ 2], + moduli_buf[ 3], moduli_buf[ 4], moduli_buf[ 5], moduli_buf[ 6] #ifndef UNIT8 - ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0], - &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0], &moduli_buf[15][0] + ,moduli_buf[ 7] ,moduli_buf[ 8], moduli_buf[ 9], moduli_buf[10], + moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14] #ifndef UNIT16 /* and not UNIT8 */ - ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0], - &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0], - &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0], - &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0], &moduli_buf[31][0] + ,moduli_buf[15] ,moduli_buf[16], moduli_buf[17], moduli_buf[18], + moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22], + moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26], + moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30] #endif /* UNIT16 and UNIT8 not defined */ #endif /* UNIT8 not defined */ }; -/* To optimize msubs, need following 2 unit arrays, each filled - with the most significant unit and next-to-most significant unit - of the preshifted images of the modulus. */ -static unit msu_moduli[UNITSIZE+1] = {0}; /* most signif. unit */ -static unit nmsu_moduli[UNITSIZE+1] = {0}; /* next-most signif. unit */ - -/* mpdbuf contains preshifted images of the multiplicand, mod n. - It is used only by mp_modmult. It could be staticly declared - inside of mp_modmult, but we put it outside mp_modmult so that - it can be wiped clean by modmult_burn(), which is called at the - end of mp_modexp. This is so that no sensitive data is left in - memory after the program exits. -*/ +/* + * To optimize msubs, need following 2 unit arrays, each filled + * with the most significant unit and next-to-most significant unit + * of the preshifted images of the modulus. + */ +static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */ +static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */ + +/* + * mpdbuf contains preshifted images of the multiplicand, mod n. + * It is used only by mp_modmult. It could be staticly declared + * inside of mp_modmult, but we put it outside mp_modmult so that + * it can be wiped clean by modmult_burn(), which is called at the + * end of mp_modexp. This is so that no sensitive data is left in + * memory after the program exits. + */ static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0}; static void stage_mp_images(unitptr images[UNITSIZE],unitptr r) -/* Computes UNITSIZE images of r, each shifted left 1 more bit. - Used only by modmult function. -*/ -{ short int i; +/* + * Computes UNITSIZE images of r, each shifted left 1 more bit. + * Used only by modmult function. + */ +{ + short int i; + images[0] = r; /* no need to move the first image, just copy ptr */ - for (i=1; i=LOG_UNITSIZE; i--) + if (mp_compare(prod,moduli[i]) >= 0) + mp_sub(prod,moduli[i]); + */ - The following loop is unrolled for speed, using macros... +#ifndef UNIT8 +#ifndef UNIT16 /* and not UNIT8 */ + msubs(31); + msubs(30); + msubs(29); + msubs(28); + msubs(27); + msubs(26); + msubs(25); + msubs(24); + msubs(23); + msubs(22); + msubs(21); + msubs(20); + msubs(19); + msubs(18); + msubs(17); + msubs(16); +#endif /* not UNIT16 and not UNIT8 */ + msubs(15); + msubs(14); + msubs(13); + msubs(12); + msubs(11); + msubs(10); + msubs(9); + msubs(8); +#endif /* not UNIT8 */ + msubs(7); + msubs(6); + msubs(5); +#ifndef UNIT32 + msubs(4); +#ifndef UNIT16 + msubs(3); +#endif +#endif + + /* + * Sniff each bit in the current unit of the multiplier, + * and conditionally add the corresponding preshifted + * image of the multiplicand to the product. + * This is also part of the multiply phase of modmult. + * + * The following loop is unrolled for speed, using macros... for (i=UNITSIZE-1; i>=0; i--) - if (*multiplier & power_of_2(i)) + if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i]); - */ + */ #ifndef UNIT8 #ifndef UNIT16 /* and not UNIT8 */ - sniffadd(31); - sniffadd(30); - sniffadd(29); + sniffadd(31); + sniffadd(30); + sniffadd(29); sniffadd(28); - sniffadd(27); - sniffadd(26); - sniffadd(25); + sniffadd(27); + sniffadd(26); + sniffadd(25); sniffadd(24); - sniffadd(23); - sniffadd(22); - sniffadd(21); + sniffadd(23); + sniffadd(22); + sniffadd(21); sniffadd(20); - sniffadd(19); - sniffadd(18); - sniffadd(17); + sniffadd(19); + sniffadd(18); + sniffadd(17); sniffadd(16); #endif /* not UNIT16 and not UNIT8 */ - sniffadd(15); - sniffadd(14); - sniffadd(13); + sniffadd(15); + sniffadd(14); + sniffadd(13); sniffadd(12); - sniffadd(11); - sniffadd(10); - sniffadd(9); + sniffadd(11); + sniffadd(10); + sniffadd(9); sniffadd(8); #endif /* not UNIT8 */ - sniffadd(7); - sniffadd(6); - sniffadd(5); + sniffadd(7); + sniffadd(6); + sniffadd(5); sniffadd(4); - sniffadd(3); - sniffadd(2); - sniffadd(1); + sniffadd(3); + sniffadd(2); + sniffadd(1); sniffadd(0); - /* The product may have grown by as many as UNITSIZE+1 - bits. That's why we have global_precision set to the - size of the modulus plus UNITSIZE+1 slop bits. - Now reduce the product back down by conditionally - subtracting the UNITSIZE+1 preshifted images of the - modulus. This is the modulo reduction phase of modmult. - - The following loop is unrolled for speed, using macros... + /* + * The product may have grown by as many as LOG_UNITSIZE+1 + * bits. + * Now reduce the product back down by conditionally + * subtracting LOG_UNITSIZE+1 preshifted images of the + * modulus. This is the modulo reduction phase of modmult. + * + * The following loop is unrolled for speed, using macros... - for (i=UNITSIZE; i>=0; i--) + for (i=LOG_UNITSIZE; i>=0; i--) if (mp_compare(prod,moduli[i]) >= 0) mp_sub(prod,moduli[i]); - */ + */ + #ifndef UNIT8 -#ifndef UNIT16 /* and not UNIT8 */ - msubs(32); - msubs(31); - msubs(30); - msubs(29); - msubs(28); - msubs(27); - msubs(26); - msubs(25); - msubs(24); - msubs(23); - msubs(22); - msubs(21); - msubs(20); - msubs(19); - msubs(18); - msubs(17); -#endif /* not UNIT16 and not UNIT8 */ - msubs(16); - msubs(15); - msubs(14); - msubs(13); - msubs(12); - msubs(11); - msubs(10); - msubs(9); -#endif /* not UNIT8 */ - msubs(8); - msubs(7); - msubs(6); +#ifndef UNIT16 msubs(5); +#endif msubs(4); +#endif msubs(3); msubs(2); msubs(1); @@ -990,28 +1094,30 @@ int merritt_modmult(register unitptr pro /* Bump pointer to next lower unit of multiplier: */ post_lowerunit(multiplier); - } /* Loop for the number of units in the multiplier */ + } /* Loop for the number of units in the multiplier */ - return(0); /* normal return */ + return 0; /* normal return */ -} /* merritt_modmult */ +} /* merritt_modmult */ #undef msubs #undef sniffadd -/* Merritt's mp_modmult function leaves some internal tables in memory, - so we have to call modmult_burn() at the end of mp_modexp. - This is so that no cryptographically sensitive data is left in memory - after the program exits. -*/ -static void merritt_burn(void) -/* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */ -{ unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION); - unitfill0(&(moduli_buf[0][0]),(UNITSIZE)*MAX_UNIT_PRECISION); - unitfill0(msu_moduli,UNITSIZE+1); - unitfill0(nmsu_moduli,UNITSIZE+1); +/* + * Merritt's mp_modmult function leaves some internal tables in memory, + * so we have to call modmult_burn() at the end of mp_modexp. + * This is so that no cryptographically sensitive data is left in memory + * after the program exits. + */ +void merritt_burn(void) +/* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */ +{ + unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION); + unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION); + unitfill0(msu_moduli,UNITSIZE); + unitfill0(nmsu_moduli,UNITSIZE); } /* merritt_burn() */ /******* end of Merritt's MODMULT stuff. *******/ @@ -1019,153 +1125,125 @@ static void merritt_burn(void) #endif /* MERRITT */ -#ifdef UPTON /* using Jimmy Upton's modmult algorithm */ +#ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's modmult algorithms */ -/* Jimmy Upton's multiprecision modmult algorithm in C. - Performs a multiply combined with a modulo operation. +/* + * Jimmy Upton's multiprecision modmult algorithm in C. + * Performs a multiply combined with a modulo operation. + * + * The following support functions and data structures + * are used only by Upton's modmult algorithm... + */ - The following support functions and data structures - are used only by Upton's modmult algorithm... -*/ +short munit_prec; /* global_precision expressed in MULTUNITs */ -/* The MULTUNIT word is the biggest word size for the native atomic - multiply instruction. It may or may not be smaller than UNITSIZE. - Many CPU's have 16-by-16-bit multiplies, yielding a 32-bit product. - In that case, MULTUINIT should be a 16-bit word, even if the rest of - the multiprecision arithmetic functions assume a 32-bit UNIT word. -*/ -typedef unsigned short MULTUNIT; -#define MULTUNITSIZE (8*sizeof(MULTUNIT)) /* size in bits */ - -static short unit_prec; /* global_precision expressed in MULTUNITs */ +/* + * Note that since the SPARC CPU has no hardware integer multiply + * instruction, there is not that much advantage in having an + * assembly version of mp_smul on that machine. It might be faster + * to use Merritt's modmult instead of Upton's modmult on the SPARC. + */ +/* + * Multiply the single-word multiplier times the multiprecision integer + * in multiplicand, accumulating result in prod. The resulting + * multiprecision prod will be 1 word longer than the multiplicand. + * multiplicand is munit_prec words long. We add into prod, so caller + * should zero it out first. For best results, this time-critical + * function should be implemented in assembly. + * NOTE: Unlike other functions in the multiprecision arithmetic + * library, both multiplicand and prod are pointing at the LSB, + * regardless of byte order of the machine. On an 80x86, this makes + * no difference. But if this assembly function is implemented + * on a 680x0, it becomes important. + */ -/* Define MPORTABLE if there is no assembly version of the mp_smul - function available. An assembly version is much faster. - Note that since the SPARC CPU has no hardware integer multiply - instruction, there is not that much advantage in having an - assembly version of mp_smul on that machine. It might be faster - to use Merritt's modmult instead of Upton's modmult on the SPARC. +/* + * Note that this has been modified from the previous version to allow + * better support for Smith's modmult: + * The final carry bit is added to the existing product + * array, rather than simply stored. */ -#ifdef MSDOS /* we do have an assembly version available on the 8086 */ -#undef MPORTABLE -#endif -#ifdef MPORTABLE /* use slow portable C version instead of assembly */ - -/* - Multiply the single-word multiplier times the multiprecision integer - in multiplicand, accumulating result in prod. The resulting - multiprecision prod will be 1 word longer than the multiplicand. - multiplicand is unit_prec words long. We add into prod, so caller - should zero it out first. For best results, this time-critical - function should be implemented in assembly. - NOTE: Unlike other functions in the multiprecision arithmetic - library, both multiplicand and prod are pointing at the LSB, - regardless of byte order of the machine. On an 80x86, this makes - no difference. But if this assembly function is implemented - on a 680x0, it becomes important. -*/ +#ifndef mp_smul void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier) { short i; unsigned long p, carry; carry = 0; - for (i=0; i> MULTUNITSIZE; } - /* We know that the high-order word of prod will always be 0 */ - *prod = (MULTUNIT)carry; /* copy carry to empty high word of prod */ -} /* mp_smul */ - -#else /* not MPORTABLE-- use assembly routine */ - -#define mp_smul P_SMUL /* use external assembly routine */ - -#endif /* MPORTABLE */ - -#ifdef VMS - -#define mp_dmul p_dmul /* use external assembly routine */ - -void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier); - -#else /* VMS */ + /* Add carry to the next higher word of product / dividend */ + *prod += (MULTUNIT)carry; +} /* mp_smul */ +#endif -/* mp_dmul is a double-precision multiply multiplicand times multiplier, - result into prod. prod must be pointing at a "double precision" - buffer. E.g. If global_precision is 10 words, prod must be - pointing at a 20-word buffer. -*/ -static void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier) +/* + * mp_dmul is a double-precision multiply multiplicand times multiplier, + * result into prod. prod must be pointing at a "double precision" + * buffer. E.g. If global_precision is 10 words, prod must be + * pointing at a 20-word buffer. + */ +#ifndef mp_dmul +void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier) { register int i; register MULTUNIT *p_multiplicand, *p_multiplier; register MULTUNIT *prodp; + unitfill0(prod,global_precision*2); /* Pre-zero prod */ /* Calculate precision in units of MULTUNIT */ - unit_prec = global_precision * UNITSIZE / MULTUNITSIZE; + munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; p_multiplicand = (MULTUNIT *)multiplicand; p_multiplier = (MULTUNIT *)multiplier; prodp = (MULTUNIT *)prod; - make_lsbptr(p_multiplicand,unit_prec); - make_lsbptr(p_multiplier,unit_prec); - make_lsbptr(prodp,unit_prec*2); + make_lsbptr(p_multiplicand,munit_prec); + make_lsbptr(p_multiplier,munit_prec); + make_lsbptr(prodp,munit_prec*2); /* Multiply multiplicand by each word in multiplier, accumulating prod: */ - for (i=0; i>MULTUNITSIZE) & (q2>>MULTUNITSIZE); + q = q1 + q2; + + /* The following statement is equivalent to shifting the sum right + one bit while shifting in the carry bit. + */ + q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); +#else /* optimized C version */ + q0 = (q1>>1) + (q2>>1) + 1; +#endif + +/* Compute the middle significant product group. */ + + q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph; + q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl; +#ifdef ASM_PROTOTYPE + q = q1 + q2; + q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); + +/* Add in the most significant word of the first group. + The last term takes care of the carry from adding the lsb's + that were shifted out prior to addition. +*/ + q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2)); +#else /* optimized C version */ + q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1; +#endif + +/* Compute the most significant term and add in the others */ + + q = (q >> (MULTUNITSIZE-2)) + + (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1); + q >>= mshift; + +/* Prevent overflow and then wipe out the intermediate results. */ + + mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1); + q= q0 = q1 = q2 = 0; lsb_factor = 0; (void)lsb_factor; + return mutemp; +} +#endif /* mp_quo_digit */ + +/* + * stage_smith_modulus() - Prepare for a Smith modmult. + * + * Calculate the reciprocal of modulus with a precision of two MULTUNITs. + * Assumes that global_precision has already been adjusted to the + * size of the modulus, plus SLOP_BITS. + * + * Note: This routine was designed to work with large values and + * doesn't have the necessary testing or handling to work with a + * modulus having less than three significant units. For such cases, + * the separate multiply and modulus routines can be used. + * + * stage_smith_modulus() is aliased to stage_modulus(). + */ + +int stage_smith_modulus(unitptr n_modulus) +{ + int original_precision; + int sigmod; /* significant units in modulus */ + unitptr mp; /* modulus most significant pointer */ + MULTUNIT *mpm; /* reciprocal pointer */ + int prec; /* precision of reciprocal calc in units */ + + mp_move(modulus, n_modulus); + modmpl = (MULTUNIT*) modulus; + modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit); + nbits = countbits(modulus); + modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE; + + original_precision = global_precision; + + /* The following code copies the three most significant units of + * modulus to mod_divisor. + */ + mp = modulus; + sigmod = significance (modulus); + rescale (mp, original_precision, sigmod); +/* prec is the unit precision required for 3 MULTUNITs */ + prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit; + set_precision (prec); + + /* set mp = ptr to most significant units of modulus, then move + * the most significant units to mp_divisor + */ + mp = msbptr(mp,sigmod) -tohigher(prec-1); + unmake_lsbptr (mp, prec); + mp_move (mod_divisor, mp); + + /* Keep 2*MULTUNITSIZE bits in mod_divisor. + * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits. + */ + mshift = countbits (mod_divisor) - 2*MULTUNITSIZE; + mp_shift_right_bits (mod_divisor, mshift); + mp_recip(mod_quotient,mod_divisor); + mp_shift_right_bits (mod_quotient,1); + + /* Reduce to: 0 < mshift <= MULTUNITSIZE */ + mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1; + /* round up, rescaling if necessary */ + mp_inc (mod_quotient); + if (countbits (mod_quotient) > 2*MULTUNITSIZE) { + mp_shift_right_bits (mod_quotient,1); + mshift--; /* now 0 <= mshift <= MULTUNITSIZE */ + } + mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit); + recipl = *post_higherunit (mpm); + reciph = *mpm; + mp_set_recip (reciph, recipl, mshift); + set_precision (original_precision); + return 0; /* normal return */ +} /* stage_smith_modulus */ + +/* Smith's algorithm performs a multiply combined with a modulo operation. + Computes: prod = (multiplicand*multiplier) mod modulus + The modulus must first be set by stage_smith_modulus(). + smith_modmult() is aliased to mp_modmult(). +*/ + +int +smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) +{ + unitptr d; /* ptr to product */ + MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first) + * aligned for subtraction */ +/* Note that dmph points one MULTUNIT higher than indicated by + global precision. This allows us to zero out a word one higher than + the normal precision. +*/ + short orig_precision; + short nqd; /* number of quotient digits remaining to + * be generated */ + short dmi; /* number of significant MULTUNITs in product */ + + d = ds_data + 1; /* room for leading MSB if HIGHFIRST */ + orig_precision = global_precision; + mp_dmul(d, multiplicand, multiplier); + + rescale(d, orig_precision * 2, orig_precision * 2 + 1); + set_precision(orig_precision * 2 + 1); + *msbptr(d, global_precision) = 0; /* leading 0 unit */ + +/* We now start working with MULTUNITs. + Determine the most significant MULTUNIT of the product so we don't + have to process leading zeros in our divide loop. +*/ + dmi = significance(d) * MULTUNITs_perunit; + if (dmi >= modlenMULTUNITS) { + /* Make dividend negative. This allows the use of mp_smul to + * "subtract" the product of the modulus and the trial divisor + * by actually adding to a negative dividend. + * The one's complement of the dividend is used, since it causes + * a zero value to be represented as all ones. This facilitates + * testing the result for possible overflow, since a sign bit + * indicates that no adjustment is necessary, and we should not + * attempt to adjust if the result of the addition is zero. + */ + mp_inc(d); + mp_neg(d); + set_precision(orig_precision); + munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; + + /* Set msb, lsb, and normal ptrs of dividend */ + dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) * + MULTUNITs_perunit) + tohigher(dmi); + nqd = dmi + 1 - modlenMULTUNITS; + dmpl = dmph - tohigher(modlenMULTUNITS); + +/* + * Divide loop. + * Each iteration computes the next quotient MULTUNIT digit, then + * multiplies the divisor (modulus) by the quotient digit and adds + * it to the one's complement of the dividend (equivalent to + * subtracting). If the product was greater than the remaining dividend, + * we get a non-negative result, in which case we subtract off the + * modulus to get the proper negative remainder. + */ + for (; nqd; nqd--) { + MULTUNIT q; /* quotient trial digit */ + + q = mp_quo_digit(dmph); + if (q > 0) { + mp_smul(dmpl, modmpl, q); + + /* Perform correction if q too large. + * This rarely occurs. + */ + if (!(*dmph & MULTUNIT_msb)) { + dmp = dmpl; + unmake_lsbptr(dmp, orig_precision * + MULTUNITs_perunit); + if (mp_musubb(dmp, + (MULTUNIT *) modulus, 0)) + (*dmph) --; + } + } + pre_lowerunit(dmph); + pre_lowerunit(dmpl); + } + /* d contains the one's complement of the remainder. */ + rescale(d, orig_precision * 2 + 1, orig_precision); + set_precision(orig_precision); + mp_neg(d); + mp_dec(d); + } else { + /* Product was less than modulus. Return it. */ + rescale(d, orig_precision * 2 + 1, orig_precision); + set_precision(orig_precision); + } + mp_move(prod, d); + return (0); /* normal return */ +} /* smith_modmult */ + + +/* Smith's mp_modmult function leaves some internal arrays in memory, + so we have to call modmult_burn() at the end of mp_modexp. + This is so that no cryptographically sensitive data is left in memory + after the program exits. + smith_burn() is aliased to modmult_burn(). +*/ +void smith_burn(void) +{ + empty_array (modulus); + empty_array (ds_data); + empty_array (mod_quotient); + empty_array (mod_divisor); + modmpl = 0; + mshift =nbits = 0; + reciph = recipl = 0; + modlenMULTUNITS = mutemp = 0; + mp_set_recip (0,0,0); +} /* smith_burn */ -#endif /* UPTON */ +/* End of Thad Smith's implementation of modmult. */ + +#endif /* SMITH */ int countbits(unitptr r) /* Returns number of significant bits in r */ -{ int bits; +{ + int bits; short prec; register unit bitmask; init_bitsniffer(r,bitmask,prec,bits); - return(bits); + return bits; } /* countbits */ @@ -1270,11 +1737,13 @@ char *copyright_notice(void) int mp_modexp(register unitptr expout,register unitptr expin, register unitptr exponent,register unitptr modulus) -{ /* Russian peasant combined exponentiation/modulo algorithm. - Calls modmult instead of mult. - Computes: expout = (expin**exponent) mod modulus - WARNING: All the arguments must be less than the modulus! - */ +/* + * Russian peasant combined exponentiation/modulo algorithm. + * Calls modmult instead of mult. + * Computes: expout = (expin**exponent) mod modulus + * WARNING: All the arguments must be less than the modulus! + */ +{ int bits; short oldprecision; register unit bitmask; @@ -1286,21 +1755,21 @@ int mp_modexp(register unitptr expout,re tally_modsquares = 0; /* clear "number of modsquares" counter */ #endif /* COUNTMULTS */ mp_init(expout,1); - if (testeq(exponent,0)) - { if (testeq(expin,0)) - return(-1); /* 0 to the 0th power means return error */ - return(0); /* otherwise, zero exponent means expout is 1 */ + if (testeq(exponent,0)) { + if (testeq(expin,0)) + return -1; /* 0 to the 0th power means return error */ + return 0; /* otherwise, zero exponent means expout is 1 */ } if (testeq(modulus,0)) - return(-2); /* zero modulus means error */ + return -2; /* zero modulus means error */ #if SLOP_BITS > 0 /* if there's room for sign bits */ if (mp_tstminus(modulus)) - return(-2); /* negative modulus means error */ + return -2; /* negative modulus means error */ #endif /* SLOP_BITS > 0 */ if (mp_compare(expin,modulus) >= 0) - return(-3); /* if expin >= modulus, return error */ + return -3; /* if expin >= modulus, return error */ if (mp_compare(exponent,modulus) >= 0) - return(-4); /* if exponent >= modulus, return error */ + return -4; /* if exponent >= modulus, return error */ oldprecision = global_precision; /* save global_precision */ /* set smallest optimum precision for this modulus */ @@ -1311,9 +1780,9 @@ int mp_modexp(register unitptr expout,re rescale(exponent,oldprecision,global_precision); rescale(expout,oldprecision,global_precision); - if (stage_modulus(modulus)) - { set_precision(oldprecision); /* restore original precision */ - return(-5); /* unstageable modulus (STEWART algorithm) */ + if (stage_modulus(modulus)) { + set_precision(oldprecision); /* restore original precision */ + return -5; /* unstageable modulus (STEWART algorithm) */ } /* normalize and compute number of bits in exponent first */ @@ -1323,29 +1792,29 @@ int mp_modexp(register unitptr expout,re bits--; /* We know for sure at this point that bits>0 */ mp_move(expout,expin); /* expout = (1*1)*expin; */ bump_bitsniffer(exponent,bitmask); - - while (bits--) - { + + while (bits--) { poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */ #ifdef COUNTMULTS tally_modsquares++; /* bump "number of modsquares" counter */ #endif /* COUNTMULTS */ mp_modsquare(product,expout); - mp_move(expout,product); - if (sniff_bit(exponent,bitmask)) - { mp_modmult(product,expout,expin); - mp_move(expout,product); + if (sniff_bit(exponent,bitmask)) { + mp_modmult(expout,product,expin); #ifdef COUNTMULTS tally_modmults++; /* bump "number of modmults" counter */ #endif /* COUNTMULTS */ + } else { + mp_move(expout,product); } bump_bitsniffer(exponent,bitmask); - } /* while bits-- */ + } /* while bits-- */ mp_burn(product); /* burn the evidence on the stack */ modmult_burn(); /* ask mp_modmult to also burn its own evidence */ #ifdef COUNTMULTS /* diagnostic analysis */ - { long atomic_mults; + { + long atomic_mults; unsigned int unitcount,totalmults; unitcount = bits2units(countbits(modulus)); /* calculation assumes modsquare takes as long as a modmult: */ @@ -1365,109 +1834,110 @@ int mp_modexp(register unitptr expout,re will be forced to include it in the executable object image... */ copyright_notice(); /* has no real effect at run time */ - return(0); /* normal return */ -} /* mp_modexp */ - + return 0; /* normal return */ +} /* mp_modexp */ - - -/********************************************************************* - RSA-specific routines follow. These are the only functions that - are specific to the RSA public key cryptosystem. The other - multiprecision integer math functions may be used for non-RSA - applications. - - The RSA public key cryptosystem is patented by the Massachusetts - Institute of Technology (U.S. patent #4,405,829). This patent does - not apply outside the USA. Public Key Partners (PKP) holds the - exclusive commercial license to sell and sub-license the RSA public - key cryptosystem. The author of this software implementation of the - RSA algorithm is providing this software for educational use only. - Licensing this algorithm from PKP is the responsibility of you, the - user, not Philip Zimmermann, the author of this software. The author - assumes no liability for any breach of patent law resulting from the - unlicensed use of this software by the user. -*********************************************************************/ - - -int rsa_decrypt(unitptr M, unitptr C, - unitptr d, unitptr p, unitptr q, unitptr u) - /* Uses Quisquater's Chinese Remainder Theorem shortcut - for RSA decryption. - M is the output plaintext message. - C is the input ciphertext message. - d is the secret decryption exponent. - p and q are the secret prime factors of n. - u is the multiplicative inverse of p, mod q. - Note that u is precomputed on the assumption that p= 0) /* ensure that p