--- pgp/src/mpilib.c 2018/04/24 16:40:49 1.1.1.5 +++ pgp/src/mpilib.c 2018/04/24 16:43:27 1.1.1.7 @@ -1,1943 +1,1965 @@ -/* C source code for multiprecision arithmetic library routines. - Implemented Nov 86 by Philip Zimmermann - Last revised 27 Nov 91 by PRZ - - Boulder Software Engineering - 3021 Eleventh Street - Boulder, CO 80304 - (303) 541-0140 - - (c) Copyright 1986-1994 by Philip Zimmermann. All rights reserved. - The author assumes no liability for damages resulting from the use - of this software, even if the damage results from defects in this - software. No warranty is expressed or implied. The use of this - cryptographic software for developing weapon systems is expressly - forbidden. - - - These routines implement all of the multiprecision arithmetic - necessary for number-theoretic cryptographic algorithms such as - ElGamal, Diffie-Hellman, Rabin, or factoring studies for large - composite numbers, as well as Rivest-Shamir-Adleman (RSA) public - key cryptography. - - Although originally developed in Microsoft C for the IBM PC, this code - contains few machine dependencies. It assumes 2's complement - arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines, - lowbyte-highbyte order or highbyte-lowbyte order. This version - has been converted to ANSI C. - - - The internal representation for these extended precision integer - "registers" is an array of "units". A unit is a machine word, which - is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit - unsigned integer, depending on the machine's word size. For example, - an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic - on these huge precision integers, we pass pointers to these unit - arrays to various subroutines. A pointer to an array of units is of - type unitptr. This is a pointer to a huge integer "register". - - When calling a subroutine, we always pass a pointer to the BEGINNING - of the array of units, regardless of the byte order of the machine. - On a lowbyte-first machine, such as the Intel 80x86, this unitptr - points to the LEAST significant unit, and the array of units increases - significance to the right. On a highbyte-first machine, such as the - Motorola 680x0, this unitptr points to the MOST significant unit, and - the array of units decreases significance to the right. - - Modified 8 Apr 92 - HAJK - Implement new VAX/VMS primitive support. - - Modified 30 Sep 92 -Castor Fu - Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long) - - Modified 28 Nov 92 - Thad Smith - Added Smith modmult, generalized non-portable support. -*/ - -/* #define COUNTMULTS */ /* count modmults for performance studies */ - -#ifdef DEBUG -#ifdef MSDOS -#ifdef __GO32__ /* DJGPP */ -#include -#else -#include -#endif /* __GO32__ */ -#define poll_for_break() {while (kbhit()) getch();} -#endif /* MSDOS */ -#endif /* DEBUG */ - -#ifndef poll_for_break -#define poll_for_break() /* stub */ -#endif - -#include "mpilib.h" - -#ifdef mp_smula -#ifdef mp_smul - Error: Both mp_smula and mp_smul cannot be defined. -#else -#define mp_smul mp_smula -#endif -#endif - -/* set macros for MULTUNIT */ -#ifdef MUNIT8 -#define MULTUNITSIZE 8 -typedef unsigned char MULTUNIT; -#ifdef UNIT8 -#define MULTUNIT_SIZE_SAME -#endif -#else /* not MUNIT8 */ -#ifdef MUNIT32 -#define MULTUNITSIZE 32 -typedef unsigned long MULTUNIT; -#ifdef UNIT32 -#define MULTUNIT_SIZE_SAME -#else -/* #error is not portable, this has the same effect */ -#include "UNITSIZE cannot be smaller than MULTUNITSIZE" -#endif -#else /* assume MUNIT16 */ -#define MULTUNITSIZE 16 -typedef unsigned short MULTUNIT; -#ifdef UNIT16 -#define MULTUNIT_SIZE_SAME -#endif /* UNIT16 */ -#ifdef UNIT8 -#include "UNITSIZE cannot be smaller than MULTUNITSIZE" -#endif /* UNIT8 */ -#endif /* MUNIT32 */ -#endif /* MUNIT8 */ - -#define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */ -#define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1)) -#define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1)) -#define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE) - - -void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier); -void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier); - -short global_precision = 0; /* units of precision for all routines */ -/* global_precision is the unit precision last set by set_precision. - Initially, set_precision() should be called to define global_precision - before using any of these other multiprecision library routines. - i.e.: set_precision(MAX_UNIT_PRECISION); -*/ - -/*************** multiprecision library primitives ****************/ -/* The following portable C primitives should be recoded in assembly. - The entry point name should be defined, in "mpilib.h" to the external - entry point name. If undefined, the C version will be used. -*/ - -typedef unsigned long int ulint; - -#ifndef mp_addc -boolean mp_addc - (register unitptr r1,register unitptr r2,register boolean carry) - /* multiprecision add with carry r2 to r1, result in r1 */ - /* carry is incoming carry flag-- value should be 0 or 1 */ -{ - register unit x; - short precision; /* number of units to add */ - precision = global_precision; - make_lsbptr(r1,precision); - make_lsbptr(r2,precision); - while (precision--) { - if (carry) { - x = *r1 + *r2 + 1; - carry = (*r2 >= (unit)(~ *r1)); - } else { - x = *r1 + *r2; - carry = (x < *r1) ; - } - post_higherunit(r2); - *post_higherunit(r1) = x; - } - return carry; /* return the final carry flag bit */ -} /* mp_addc */ -#endif /* mp_addc */ - -#ifndef mp_subb -boolean mp_subb - (register unitptr r1,register unitptr r2,register boolean borrow) - /* multiprecision subtract with borrow, r2 from r1, result in r1 */ - /* borrow is incoming borrow flag-- value should be 0 or 1 */ -{ - register unit x; - short precision; /* number of units to subtract */ - precision = global_precision; - make_lsbptr(r1,precision); - make_lsbptr(r2,precision); - while (precision--) { - if (borrow) { - x = *r1 - *r2 - 1; - borrow = (*r1 <= *r2); - } else { - x = *r1 - *r2; - borrow = (*r1 < *r2); - } - post_higherunit(r2); - *post_higherunit(r1) = x; - } - return borrow; /* return the final carry/borrow flag bit */ -} /* mp_subb */ -#endif /* mp_subb */ - -#ifndef mp_rotate_left -boolean mp_rotate_left(register unitptr r1,register boolean carry) - /* multiprecision rotate left 1 bit with carry, result in r1. */ - /* carry is incoming carry flag-- value should be 0 or 1 */ -{ - register int precision; /* number of units to rotate */ - unsigned int mcarry = carry, nextcarry; /* int is supposed to be - * the efficient size for ops*/ - precision = global_precision; - make_lsbptr(r1,precision); - while (precision--) { - nextcarry = (((signedunit) *r1) < 0); - *r1 = (*r1 << 1) | mcarry; - mcarry = nextcarry; - pre_higherunit(r1); - } - return nextcarry; /* return the final carry flag bit */ -} /* mp_rotate_left */ -#endif /* mp_rotate_left */ - -/************** end of primitives that should be in assembly *************/ - - -/* The mp_shift_right_bits function is not called in any time-critical - situations in public-key cryptographic functions, so it doesn't - need to be coded in assembly language. -*/ -void mp_shift_right_bits(register unitptr r1,register short bits) - /* multiprecision shift right bits, result in r1. - bits is how many bits to shift, must be <= UNITSIZE. - */ -{ - register short precision; /* number of units to shift */ - register unit carry,nextcarry,bitmask; - register short unbits; - if (bits==0) return; /* shift zero bits is a no-op */ - carry = 0; - bitmask = power_of_2(bits)-1; - unbits = UNITSIZE-bits; /* shift bits must be <= UNITSIZE */ - precision = global_precision; - make_msbptr(r1,precision); - if (bits == UNITSIZE) { - while (precision--) { - nextcarry = *r1; - *r1 = carry; - carry = nextcarry; - pre_lowerunit(r1); - } - } else { - while (precision--) { - nextcarry = *r1 & bitmask; - *r1 >>= bits; - *r1 |= carry << unbits; - carry = nextcarry; - pre_lowerunit(r1); - } - } -} /* mp_shift_right_bits */ - - -#ifndef mp_compare -short mp_compare(register unitptr r1,register unitptr r2) -/* - * Compares multiprecision integers *r1, *r2, and returns: - * -1 iff *r1 < *r2 - * 0 iff *r1 == *r2 - * +1 iff *r1 > *r2 - */ -{ - register short precision; /* number of units to compare */ - - precision = global_precision; - make_msbptr(r1,precision); - make_msbptr(r2,precision); - do { - if (*r1 < *r2) - return -1; - if (*post_lowerunit(r1) > *post_lowerunit(r2)) - return 1; - } while (--precision); - return 0; /* *r1 == *r2 */ -} /* mp_compare */ -#endif /* mp_compare */ - - -boolean mp_inc(register unitptr r) -/* Increment multiprecision integer r. */ -{ - register short precision; - precision = global_precision; - make_lsbptr(r,precision); - do { - if ( ++(*r) ) return 0; /* no carry */ - post_higherunit(r); - } while (--precision); - return 1; /* return carry set */ -} /* mp_inc */ - - -boolean mp_dec(register unitptr r) -/* Decrement multiprecision integer r. */ -{ - register short precision; - precision = global_precision; - make_lsbptr(r,precision); - do { - if ( (signedunit) (--(*r)) != (signedunit) -1 ) - return 0; /* no borrow */ - post_higherunit(r); - } while (--precision); - return 1; /* return borrow set */ -} /* mp_dec */ - - -void mp_neg(register unitptr r) - /* Compute 2's complement, the arithmetic negative, of r */ -{ - register short precision; /* number of units to negate */ - precision = global_precision; - mp_dec(r); /* 2's complement is 1's complement plus 1 */ - do { /* now do 1's complement */ - *r = ~(*r); - r++; - } while (--precision); -} /* mp_neg */ - -#ifndef mp_move -void mp_move(register unitptr dst,register unitptr src) -{ - register short precision; /* number of units to move */ - precision = global_precision; - do { *dst++ = *src++; } while (--precision); -} /* mp_move */ -#endif /* mp_move */ - -void mp_init(register unitptr r, word16 value) - /* Init multiprecision register r with short value. */ -{ /* Note that mp_init doesn't extend sign bit for >32767 */ - - unitfill0( r, global_precision); - make_lsbptr(r,global_precision); - *post_higherunit(r) = value; -#ifdef UNIT8 - *post_higherunit(r) = value >> UNITSIZE; -#endif -} /* mp_init */ - - -short significance(register unitptr r) - /* Returns number of significant units in r */ -{ - register short precision; - precision = global_precision; - make_msbptr(r,precision); - do { - if (*post_lowerunit(r)) - return precision; - } while (--precision); - return precision; -} /* significance */ - - -#ifndef unitfill0 -void unitfill0(unitptr r,word16 unitcount) - /* Zero-fill the unit buffer r. */ -{ - while (unitcount--) *r++ = 0; -} /* unitfill0 */ -#endif /* unitfill0 */ - - -int mp_udiv(register unitptr remainder,register unitptr quotient, - register unitptr dividend,register unitptr divisor) - /* Unsigned divide, treats both operands as positive. */ -{ - int bits; - short dprec; - register unit bitmask; - - if (testeq(divisor,0)) - return -1; /* zero divisor means divide error */ - mp_init0(remainder); - mp_init0(quotient); - /* normalize and compute number of bits in dividend first */ - init_bitsniffer(dividend,bitmask,dprec,bits); - /* rescale quotient to same precision (dprec) as dividend */ - rescale(quotient,global_precision,dprec); - make_msbptr(quotient,dprec); - - while (bits--) { - mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); - if (mp_compare(remainder,divisor) >= 0) { - mp_sub(remainder,divisor); - stuff_bit(quotient,bitmask); - } - bump_2bitsniffers(dividend,quotient,bitmask); - } - return 0; -} /* mp_udiv */ - - -#ifdef UPTON_OR_SMITH -#define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */ - -int mp_recip(register unitptr quotient,register unitptr divisor) - /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */ -{ - int bits; - short qprec; - register unit bitmask; - unit remainder[MAX_UNIT_PRECISION]; - if (testeq(divisor,0)) - return -1; /* zero divisor means divide error */ - mp_init0(remainder); - mp_init0(quotient); - - /* normalize and compute number of bits in quotient first */ - bits = countbits(divisor) + RECIPMARGIN; - bitmask = bitmsk(bits); /* bitmask within a single unit */ - qprec = bits2units(bits+1); - mp_setbit(remainder,(bits-RECIPMARGIN)-1); - /* rescale quotient to precision of divisor + RECIPMARGIN bits */ - rescale(quotient,global_precision,qprec); - make_msbptr(quotient,qprec); - - while (bits--) { - mp_shift_left(remainder); - if (mp_compare(remainder,divisor) >= 0) { - mp_sub(remainder,divisor); - stuff_bit(quotient,bitmask); - } - bump_bitsniffer(quotient,bitmask); - } - mp_init0(remainder); /* burn sensitive data left on stack */ - return 0; -} /* mp_recip */ -#endif /* UPTON_OR_SMITH */ - - -int mp_div(register unitptr remainder,register unitptr quotient, - register unitptr dividend,register unitptr divisor) - /* Signed divide, either or both operands may be negative. */ -{ - boolean dvdsign,dsign; - int status; - dvdsign = (boolean)(mp_tstminus(dividend)!=0); - dsign = (boolean)(mp_tstminus(divisor)!=0); - if (dvdsign) mp_neg(dividend); - if (dsign) mp_neg(divisor); - status = mp_udiv(remainder,quotient,dividend,divisor); - if (dvdsign) mp_neg(dividend); /* repair caller's dividend */ - if (dsign) mp_neg(divisor); /* repair caller's divisor */ - if (status<0) return status; /* divide error? */ - if (dvdsign) mp_neg(remainder); - if (dvdsign ^ dsign) mp_neg(quotient); - return status; -} /* mp_div */ - - -word16 mp_shortdiv(register unitptr quotient, - register unitptr dividend,register word16 divisor) -/* - * This function does a fast divide and mod on a multiprecision dividend - * using a short integer divisor returning a short integer remainder. - * This is an unsigned divide. It treats both operands as positive. - * It is used mainly for faster printing of large numbers in base 10. - */ -{ - int bits; - short dprec; - register unit bitmask; - register word16 remainder; - if (!divisor) /* if divisor == 0 */ - return -1; /* zero divisor means divide error */ - remainder=0; - mp_init0(quotient); - /* normalize and compute number of bits in dividend first */ - init_bitsniffer(dividend,bitmask,dprec,bits); - /* rescale quotient to same precision (dprec) as dividend */ - rescale(quotient,global_precision,dprec); - make_msbptr(quotient,dprec); - - while (bits--) { - remainder <<= 1; - if (sniff_bit(dividend,bitmask)) - remainder++; - if (remainder >= divisor) { - remainder -= divisor; - stuff_bit(quotient,bitmask); - } - bump_2bitsniffers(dividend,quotient,bitmask); - } - return remainder; -} /* mp_shortdiv */ - - -int mp_mod(register unitptr remainder, - register unitptr dividend,register unitptr divisor) -/* Unsigned divide, treats both operands as positive. */ -{ - int bits; - short dprec; - register unit bitmask; - if (testeq(divisor,0)) - return -1; /* zero divisor means divide error */ - mp_init0(remainder); - /* normalize and compute number of bits in dividend first */ - init_bitsniffer(dividend,bitmask,dprec,bits); - - while (bits--) { - mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0)); - msub(remainder,divisor); - bump_bitsniffer(dividend,bitmask); - } - return 0; -} /* mp_mod */ - - -word16 mp_shortmod(register unitptr dividend,register word16 divisor) -/* - * This function does a fast mod operation on a multiprecision dividend - * using a short integer modulus returning a short integer remainder. - * This is an unsigned divide. It treats both operands as positive. - * It is used mainly for fast sieve searches for large primes. - */ -{ int bits; - short dprec; - register unit bitmask; - register word16 remainder; - if (!divisor) /* if divisor == 0 */ - return -1; /* zero divisor means divide error */ - remainder=0; - /* normalize and compute number of bits in dividend first */ - init_bitsniffer(dividend,bitmask,dprec,bits); - - while (bits--) { - remainder <<= 1; - if (sniff_bit(dividend,bitmask)) - remainder++; - if (remainder >= divisor) remainder -= divisor; - bump_bitsniffer(dividend,bitmask); - } - return remainder; -} /* mp_shortmod */ - - - -#ifdef COMB_MULT /* use faster "comb" multiply algorithm */ - /* We are skipping this code because it has a bug... */ - -int mp_mult(register unitptr prod, - register unitptr multiplicand, register unitptr multiplier) -/* - * Computes multiprecision prod = multiplicand * multiplier - * - * Uses interleaved comb multiply algorithm. - * This improved multiply more than twice as fast as a Russian - * peasant multiply, because it does a lot fewer shifts. - * Must have global_precision set to the size of the multiplicand - * plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum - * of the lengths of the multiplier and multiplicand. - * - * BUG ALERT: Unfortunately, this code has a bug. It fails for - * some numbers. One such example: - * x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 - * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D - * 52C8 CDC7 AFB3 61C8 243C 741B - * --which is obviously wrong. The answer should be: - * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A - * C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 - * We'll have to fix this some day. In the meantime, we'll - * just have the compiler skip over this code. - * - * BUG NOTE: Possibly fixed. Needs testing. - */ -{ - int bits; - register unit bitmask; - unitptr product, mplier, temp; - short mprec,mprec2; - unit mplicand[MAX_UNIT_PRECISION]; - - /* better clear full width--double precision */ - mp_init(prod+tohigher(global_precision),0); - - if (testeq(multiplicand,0)) - return 0; /* zero multiplicand means zero product */ - - mp_move(mplicand,multiplicand); /* save it from damage */ - - normalize(multiplier,mprec); - if (!mprec) - return 0; - - make_lsbptr(multiplier,mprec); - bitmask = 1; /* start scan at LSB of multiplier */ - - do { /* UNITSIZE times */ - /* do for bits 0-15 */ - product = prod; - mplier = multiplier; - mprec2 = mprec; - while (mprec2--) { /* do for each word in multiplier */ - - if (sniff_bit(mplier,bitmask)) { - if (mp_addc(product,multiplicand,0)) /* ripple carry */ - { /* After 1st time thru, this is rarely encountered. */ - temp = msbptr(product,global_precision); - pre_higherunit(temp); - /* temp now points to LSU of carry region. */ - unmake_lsbptr(temp,global_precision); - mp_inc(temp); - } /* ripple carry */ - } - pre_higherunit(mplier); - pre_higherunit(product); - } - if (!(bitmask <<= 1)) - break; - mp_shift_left(multiplicand); - - } while (TRUE); - - mp_move(multiplicand,mplicand); /* recover */ - - return 0; /* normal return */ -} /* mp_mult */ - -#endif /* COMB_MULT */ - - -/* Because the "comb" multiply has a bug, we will use the slower - Russian peasant multiply instead. Fortunately, the mp_mult - function is not called from any time-critical code. -*/ - -int mp_mult(register unitptr prod, - register unitptr multiplicand,register unitptr multiplier) -/* - * Computes multiprecision prod = multiplicand * multiplier - * Uses "Russian peasant" multiply algorithm. - */ -{ - int bits; - register unit bitmask; - short mprec; - mp_init(prod,0); - if (testeq(multiplicand,0)) - return 0; /* zero multiplicand means zero product */ - /* normalize and compute number of bits in multiplier first */ - init_bitsniffer(multiplier,bitmask,mprec,bits); - - while (bits--) { - mp_shift_left(prod); - if (sniff_bit(multiplier,bitmask)) - mp_add(prod,multiplicand); - bump_bitsniffer(multiplier,bitmask); - } - return 0; -} /* mp_mult */ - - - -/* - * mp_modmult computes a multiprecision multiply combined with a - * modulo operation. This is the most time-critical function in - * this multiprecision arithmetic library for performing modulo - * exponentiation. We experimented with different versions of modmult, - * depending on the machine architecture and performance requirements. - * We will either use a Russian Peasant modmult (peasant_modmult), - * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's - * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult). - * On machines with a hardware atomic multiply instruction, - * Smith's modmult is fastest. It can utilize assembly subroutines to - * speed up the hardware multiply logic and trial quotient calculation. - * If the machine lacks a fast hardware multiply, Merritt's modmult - * is preferred, which doesn't call any assembly multiply routine. - * We use the alias names mp_modmult, stage_modulus, and modmult_burn - * for the corresponding true names, which depend on what flavor of - * modmult we are using. - * - * Before making the first call to mp_modmult, you must set up the - * modulus-dependant precomputated tables by calling stage_modulus. - * After making all the calls to mp_modmult, you call modmult_burn to - * erase the tables created by stage_modulus that were left in memory. - */ - -#ifdef COUNTMULTS -/* "number of modmults" counters, used for performance studies. */ -static unsigned int tally_modmults = 0; -static unsigned int tally_modsquares = 0; -#endif /* COUNTMULTS */ - - -#ifdef PEASANT -/* Conventional Russian peasant multiply with modulo algorithm. */ - -static unitptr pmodulus = 0; /* used only by mp_modmult */ - -int stage_peasant_modulus(unitptr n) -/* - * Must pass modulus to stage_modulus before calling modmult. - * Assumes that global_precision has already been adjusted to the - * size of the modulus, plus SLOP_BITS. - */ -{ /* For this simple version of modmult, just copy unit pointer. */ - pmodulus = n; - return 0; /* normal return */ -} /* stage_peasant_modulus */ - - -int peasant_modmult(register unitptr prod, - unitptr multiplicand,register unitptr multiplier) -/* "Russian peasant" multiply algorithm, combined with a modulo - * operation. This is a simple naive replacement for Merritt's - * faster modmult algorithm. References global unitptr "modulus". - * Computes: prod = (multiplicand*multiplier) mod modulus - * WARNING: All the arguments must be less than the modulus! - */ -{ - int bits; - register unit bitmask; - short mprec; - mp_init(prod,0); -/* if (testeq(multiplicand,0)) - return 0; */ /* zero multiplicand means zero product */ - /* normalize and compute number of bits in multiplier first */ - init_bitsniffer(multiplier,bitmask,mprec,bits); - - while (bits--) { - mp_shift_left(prod); - msub(prod,pmodulus); /* turns mult into modmult */ - if (sniff_bit(multiplier,bitmask)) { - mp_add(prod,multiplicand); - msub(prod,pmodulus); /* turns mult into modmult */ - } - bump_bitsniffer(multiplier,bitmask); - } - return 0; -} /* peasant_modmult */ - - -/* - * If we are using a version of mp_modmult that uses internal tables - * in memory, we have to call modmult_burn() at the end of mp_modexp. - * This is so that no sensitive data is left in memory after the program - * exits. The Russian peasant method doesn't use any such tables. - */ -void peasant_burn(void) -/* - * Alias for modmult_burn, called only from mp_modexp(). Destroys - * internal modmult tables. This version does nothing because no - * tables are used by the Russian peasant modmult. - */ -{ } /* peasant_burn */ - -#endif /* PEASANT */ - - -#ifdef MERRITT -/*=========================================================================*/ -/* - This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ. - Also refined by Zhahai Stewart to reduce the number of subtracts. - Modified by Raymond Brand to reduce the number of SLOP_BITS by 1. - It performs a multiply combined with a modulo operation, without - going into "double precision". It is faster than the Russian peasant - method, and still works well on machines that lack a fast hardware - multiply instruction. -*/ - -/* The following support functions, macros, and data structures - are used only by Merritt's modmult algorithm... */ - -static void mp_lshift_unit(register unitptr r1) -/* Shift r1 1 whole unit to the left. Used only by modmult function. */ -{ - register short precision; - register unitptr r2; - precision = global_precision; - make_msbptr(r1,precision); - r2 = r1; - while (--precision) - *post_lowerunit(r1) = *pre_lowerunit(r2); - *r1 = 0; -} /* mp_lshift_unit */ - - -/* moduli_buf contains shifted images of the modulus, set by stage_modulus */ -static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0}; -static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */ -{ 0, moduli_buf[ 0], moduli_buf[ 1], moduli_buf[ 2], - moduli_buf[ 3], moduli_buf[ 4], moduli_buf[ 5], moduli_buf[ 6] -#ifndef UNIT8 - ,moduli_buf[ 7] ,moduli_buf[ 8], moduli_buf[ 9], moduli_buf[10], - moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14] -#ifndef UNIT16 /* and not UNIT8 */ - ,moduli_buf[15] ,moduli_buf[16], moduli_buf[17], moduli_buf[18], - moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22], - moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26], - moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30] -#endif /* UNIT16 and UNIT8 not defined */ -#endif /* UNIT8 not defined */ -}; - -/* - * To optimize msubs, need following 2 unit arrays, each filled - * with the most significant unit and next-to-most significant unit - * of the preshifted images of the modulus. - */ -static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */ -static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */ - -/* - * mpdbuf contains preshifted images of the multiplicand, mod n. - * It is used only by mp_modmult. It could be staticly declared - * inside of mp_modmult, but we put it outside mp_modmult so that - * it can be wiped clean by modmult_burn(), which is called at the - * end of mp_modexp. This is so that no sensitive data is left in - * memory after the program exits. - */ -static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0}; - - -static void stage_mp_images(unitptr images[UNITSIZE],unitptr r) -/* - * Computes UNITSIZE images of r, each shifted left 1 more bit. - * Used only by modmult function. - */ -{ - short int i; - - images[0] = r; /* no need to move the first image, just copy ptr */ - for (i=1; i= 0) && \ - (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \ - ) mp_sub(prod,moduli[i]) -*/ - -/* Fully-optimized msubs macro (msubs2) follows... */ -#define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \ - (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \ - (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \ - mp_sub(prod,moduli[i]) - - -int merritt_modmult(register unitptr prod, - unitptr multiplicand,register unitptr multiplier) -/* - * Performs combined multiply/modulo operation. - * Computes: prod = (multiplicand*multiplier) mod modulus - * WARNING: All the arguments must be less than the modulus! - * Assumes the modulus has been predefined by first calling - * stage_modulus. - */ -{ - /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/ - register signedunit p_m; - register unitptr msu_prod; /* ptr to most significant unit of product */ - register unitptr nmsu_prod; /* next-most signif. unit of product */ - short mprec; /* precision of multiplier, in units */ - /* Array mpd contains a list of pointers to preshifted images of - the multiplicand: */ - static unitptr mpd[UNITSIZE] = { - 0, mpdbuf[ 0], mpdbuf[ 1], mpdbuf[ 2], - mpdbuf[ 3], mpdbuf[ 4], mpdbuf[ 5], mpdbuf[ 6] -#ifndef UNIT8 - ,mpdbuf[ 7], mpdbuf[ 8], mpdbuf[ 9], mpdbuf[10], - mpdbuf[11], mpdbuf[12], mpdbuf[13], mpdbuf[14] -#ifndef UNIT16 /* and not UNIT8 */ - ,mpdbuf[15], mpdbuf[16], mpdbuf[17], mpdbuf[18], - mpdbuf[19], mpdbuf[20], mpdbuf[21], mpdbuf[22], - mpdbuf[23], mpdbuf[24], mpdbuf[25], mpdbuf[26], - mpdbuf[27], mpdbuf[28], mpdbuf[29], mpdbuf[30] -#endif /* UNIT16 and UNIT8 not defined */ -#endif /* UNIT8 not defined */ - }; - - /* Compute preshifted images of multiplicand, mod n: */ - stage_mp_images(mpd,multiplicand); - - /* To optimize msubs, set up msu_prod and nmsu_prod: */ - msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */ - nmsu_prod = msu_prod; - post_lowerunit(nmsu_prod); /* Get next-MSU of prod */ - - /* - * To understand this algorithm, it would be helpful to first - * study the conventional Russian peasant modmult algorithm. - * This one does about the same thing as Russian peasant, but - * is organized differently to save some steps. It loops - * through the multiplier a word (unit) at a time, instead of - * a bit at a time. It word-shifts the product instead of - * bit-shifting it, so it should be faster. It also does about - * half as many subtracts as Russian peasant. - */ - - mp_init(prod,0); /* Initialize product to 0. */ - - /* - * The way mp_modmult is actually used in cryptographic - * applications, there will NEVER be a zero multiplier or - * multiplicand. So there is no need to optimize for that - * condition. - */ -/* if (testeq(multiplicand,0)) - return 0; */ /* zero multiplicand means zero product */ - /* Normalize and compute number of units in multiplier first: */ - normalize(multiplier,mprec); - if (mprec==0) /* if precision of multiplier is 0 */ - return 0; /* zero multiplier means zero product */ - make_msbptr(multiplier,mprec); /* start at MSU of multiplier */ - - while (mprec--) { /* Loop for the number of units in the multiplier */ - /* - * Shift the product one whole unit to the left. - * This is part of the multiply phase of modmult. - */ - - mp_lshift_unit(prod); - - /* - * The product may have grown by as many as UNITSIZE - * bits. That's why we have global_precision set to the - * size of the modulus plus UNITSIZE slop bits. - * Now reduce the product back down by conditionally - * subtracting the preshifted images of the modulus. - * This is part of the modulo reduction phase of modmult. - * The following loop is unrolled for speed, using macros... - - for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--) - if (mp_compare(prod,moduli[i]) >= 0) - mp_sub(prod,moduli[i]); - */ - -#ifndef UNIT8 -#ifndef UNIT16 /* and not UNIT8 */ - msubs(31); - msubs(30); - msubs(29); - msubs(28); - msubs(27); - msubs(26); - msubs(25); - msubs(24); - msubs(23); - msubs(22); - msubs(21); - msubs(20); - msubs(19); - msubs(18); - msubs(17); - msubs(16); -#endif /* not UNIT16 and not UNIT8 */ - msubs(15); - msubs(14); - msubs(13); - msubs(12); - msubs(11); - msubs(10); - msubs(9); - msubs(8); -#endif /* not UNIT8 */ - msubs(7); - msubs(6); - msubs(5); -#ifndef UNIT32 - msubs(4); -#ifndef UNIT16 - msubs(3); -#endif -#endif - - /* - * Sniff each bit in the current unit of the multiplier, - * and conditionally add the corresponding preshifted - * image of the multiplicand to the product. - * This is also part of the multiply phase of modmult. - * - * The following loop is unrolled for speed, using macros... - - for (i=UNITSIZE-1; i>=0; i--) - if (*multiplier & power_of_2(i)) - mp_add(prod,mpd[i]); - */ -#ifndef UNIT8 -#ifndef UNIT16 /* and not UNIT8 */ - sniffadd(31); - sniffadd(30); - sniffadd(29); - sniffadd(28); - sniffadd(27); - sniffadd(26); - sniffadd(25); - sniffadd(24); - sniffadd(23); - sniffadd(22); - sniffadd(21); - sniffadd(20); - sniffadd(19); - sniffadd(18); - sniffadd(17); - sniffadd(16); -#endif /* not UNIT16 and not UNIT8 */ - sniffadd(15); - sniffadd(14); - sniffadd(13); - sniffadd(12); - sniffadd(11); - sniffadd(10); - sniffadd(9); - sniffadd(8); -#endif /* not UNIT8 */ - sniffadd(7); - sniffadd(6); - sniffadd(5); - sniffadd(4); - sniffadd(3); - sniffadd(2); - sniffadd(1); - sniffadd(0); - - /* - * The product may have grown by as many as LOG_UNITSIZE+1 - * bits. - * Now reduce the product back down by conditionally - * subtracting LOG_UNITSIZE+1 preshifted images of the - * modulus. This is the modulo reduction phase of modmult. - * - * The following loop is unrolled for speed, using macros... - - for (i=LOG_UNITSIZE; i>=0; i--) - if (mp_compare(prod,moduli[i]) >= 0) - mp_sub(prod,moduli[i]); - */ - -#ifndef UNIT8 -#ifndef UNIT16 - msubs(5); -#endif - msubs(4); -#endif - msubs(3); - msubs(2); - msubs(1); - msubs(0); - - /* Bump pointer to next lower unit of multiplier: */ - post_lowerunit(multiplier); - - } /* Loop for the number of units in the multiplier */ - - return 0; /* normal return */ - -} /* merritt_modmult */ - - -#undef msubs -#undef sniffadd - - -/* - * Merritt's mp_modmult function leaves some internal tables in memory, - * so we have to call modmult_burn() at the end of mp_modexp. - * This is so that no cryptographically sensitive data is left in memory - * after the program exits. - */ -void merritt_burn(void) -/* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */ -{ - unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION); - unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION); - unitfill0(msu_moduli,UNITSIZE); - unitfill0(nmsu_moduli,UNITSIZE); -} /* merritt_burn() */ - -/******* end of Merritt's MODMULT stuff. *******/ -/*=========================================================================*/ -#endif /* MERRITT */ - - -#ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's modmult algorithms */ - -/* - * Jimmy Upton's multiprecision modmult algorithm in C. - * Performs a multiply combined with a modulo operation. - * - * The following support functions and data structures - * are used only by Upton's modmult algorithm... - */ - -short munit_prec; /* global_precision expressed in MULTUNITs */ - -/* - * Note that since the SPARC CPU has no hardware integer multiply - * instruction, there is not that much advantage in having an - * assembly version of mp_smul on that machine. It might be faster - * to use Merritt's modmult instead of Upton's modmult on the SPARC. - */ - -/* - * Multiply the single-word multiplier times the multiprecision integer - * in multiplicand, accumulating result in prod. The resulting - * multiprecision prod will be 1 word longer than the multiplicand. - * multiplicand is munit_prec words long. We add into prod, so caller - * should zero it out first. For best results, this time-critical - * function should be implemented in assembly. - * NOTE: Unlike other functions in the multiprecision arithmetic - * library, both multiplicand and prod are pointing at the LSB, - * regardless of byte order of the machine. On an 80x86, this makes - * no difference. But if this assembly function is implemented - * on a 680x0, it becomes important. - */ - -/* - * Note that this has been modified from the previous version to allow - * better support for Smith's modmult: - * The final carry bit is added to the existing product - * array, rather than simply stored. -*/ - -#ifndef mp_smul -void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier) -{ - short i; - unsigned long p, carry; - - carry = 0; - for (i=0; i> MULTUNITSIZE; - } - /* Add carry to the next higher word of product / dividend */ - *prod += (MULTUNIT)carry; -} /* mp_smul */ -#endif - -/* - * mp_dmul is a double-precision multiply multiplicand times multiplier, - * result into prod. prod must be pointing at a "double precision" - * buffer. E.g. If global_precision is 10 words, prod must be - * pointing at a 20-word buffer. - */ -#ifndef mp_dmul -void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier) -{ - register int i; - register MULTUNIT *p_multiplicand, *p_multiplier; - register MULTUNIT *prodp; - - - unitfill0(prod,global_precision*2); /* Pre-zero prod */ - /* Calculate precision in units of MULTUNIT */ - munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; - p_multiplicand = (MULTUNIT *)multiplicand; - p_multiplier = (MULTUNIT *)multiplier; - prodp = (MULTUNIT *)prod; - make_lsbptr(p_multiplicand,munit_prec); - make_lsbptr(p_multiplier,munit_prec); - make_lsbptr(prodp,munit_prec*2); - /* Multiply multiplicand by each word in multiplier, accumulating prod: */ - for (i=0; i 0) - mp_sub (d,modulus); - - mp_move(prod,d); - return 0; /* normal return */ -} /* upton_modmult */ - - -/* Upton's mp_modmult function leaves some internal arrays in memory, - so we have to call modmult_burn() at the end of mp_modexp. - This is so that no cryptographically sensitive data is left in memory - after the program exits. - upton_burn() is aliased to modmult_burn(). -*/ -void upton_burn(void) -{ - unitfill0(modulus,MAX_UNIT_PRECISION); - unitfill0(reciprocal,MAX_UNIT_PRECISION); - unitfill0(dhi,MAX_UNIT_PRECISION); - unitfill0(d_data,MAX_UNIT_PRECISION*2); - unitfill0(e_data,MAX_UNIT_PRECISION*2); - unitfill0(f_data,MAX_UNIT_PRECISION*2); - nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0; -} /* upton_burn */ - -/******* end of Upton's MODMULT stuff. *******/ -/*=========================================================================*/ -#endif /* UPTON */ - -#ifdef SMITH /* using Thad Smith's modmult algorithm */ - -/* - * Thad Smith's implementation of multiprecision modmult algorithm in C. - * Performs a multiply combined with a modulo operation. - * The multiplication is done with mp_dmul, the same as for Upton's - * modmult. The modulus reduction is done by long division, in - * which a trial quotient "digit" is determined, then the product of - * that digit and the divisor are subtracted from the dividend. - * - * In this case, the digit is MULTUNIT in size and the subtraction - * is done by adding the product to the one's complement of the - * dividend, which allows use of the existing mp_smul routine. - * - * The following support functions and data structures - * are used only by Smith's modmult algorithm... - */ - -/* - * These scratchpad arrays are used only by smith_modmult (mp_modmult). - * Some of them could be statically declared inside of mp_modmult, but we - * put them outside mp_modmult so that they can be wiped clean by - * modmult_burn(), which is called at the end of mp_modexp. This is - * so that no sensitive data is left in memory after the program exits. - */ - -static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2]; - -static unit mod_quotient [4]; -static unit mod_divisor [4]; /* 2 most signif. MULTUNITs of modulus */ - -static MULTUNIT *modmpl; /* ptr to modulus least significant - ** MULTUNIT */ -static int mshift; /* number of bits for - ** recip scaling */ -static MULTUNIT reciph; /* MSunit of scaled recip */ -static MULTUNIT recipl; /* LSunit of scaled recip */ - -static short modlenMULTUNITS; /* length of modulus in MULTUNITs */ -static MULTUNIT mutemp; /* temporary */ - -/* - * The routines mp_smul and mp_dmul are the same as for UPTON and - * should be coded in assembly. Note, however, that the previous - * Upton's mp_smul version has been modified to compatible with - * Smith's modmult. The new version also still works for Upton's - * modmult. - */ - -#ifndef mp_set_recip -#define mp_set_recip(rh,rl,m) /* null */ -#else -/* setup routine for external mp_quo_digit */ -void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m); -#endif -MULTUNIT mp_quo_digit (MULTUNIT *dividend); - -#ifdef MULTUNIT_SIZE_SAME -#define mp_musubb mp_subb /* use existing routine */ -#else /* ! MULTUNIT_SIZE_SAME */ - -/* - * This performs the same function as mp_subb, but with MULTUNITs. - * Note: Processors without alignment requirements may be able to use - * mp_subb, even though MULTUNITs are smaller than units. In that case, - * use mp_subb, since it would be faster if coded in assembly. Note that - * this implementation won't work for MULTUNITs which are long -- use - * mp_subb in that case. - */ -#ifndef mp_musubb -boolean mp_musubb - (register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow) -/* - * multiprecision subtract of MULTUNITs with borrow, r2 from r1, result in r1 - * borrow is incoming borrow flag-- value should be 0 or 1 - */ -{ - register ulint x; /* won't work if sizeof(MULTUNIT)== - sizeof(long) */ - short precision; /* number of MULTUNITs to subtract */ - precision = global_precision * MULTUNITs_perunit; - make_lsbptr(r1,precision); - make_lsbptr(r2,precision); - while (precision--) { - x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow; - *post_higherunit(r1) = x; - borrow = (((1L << MULTUNITSIZE) & x) != 0L); - } - return (borrow); -} /* mp_musubb */ -#endif /* mp_musubb */ -#endif /* MULTUNIT_SIZE_SAME */ - -/* - * The function mp_quo_digit is the heart of Smith's modulo reduction, - * which uses a form of long division. It computes a trial quotient - * "digit" (MULTUNIT-sized digit) by multiplying the three most - * significant MULTUNITs of the dividend by the two most significant - * MULTUNITs of the reciprocal of the modulus. Note that this function - * requires that MULTUNITSIZE * 2 <= sizeof(unsigned long). - * - * An important part of this technique is that the quotient never be - * too small, although it may occasionally be too large. This was - * done to eliminate the need to check and correct for a remainder - * exceeding the divisor. It is easier to check for a negative - * remainder. The following technique rarely needs correction for - * MULTUNITs of at least 16 bits. - * - * The following routine has two implementations: - * - * ASM_PROTOTYPE defined: written to be an executable prototype for - * an efficient assembly language implementation. Note that several - * of the following masks and shifts can be done by directly - * manipulating single precision registers on some architectures. - * - * ASM_PROTOTYPE undefined: a slightly more efficient implementation - * in C. Although this version returns a result larger than the - * optimum (which is corrected in smith_modmult) more often than the - * prototype version, the faster execution in C more than makes up - * for the difference. - * - * Parameter: dividend - points to the most significant MULTUNIT - * of the dividend. Note that dividend actually contains the - * one's complement of the actual dividend value (see comments for - * smith_modmult). - * - * Return: the trial quotient digit resulting from dividing the first - * three MULTUNITs at dividend by the upper two MULTUNITs of the - * modulus. - */ - -/* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */ - -#ifndef mp_quo_digit -MULTUNIT mp_quo_digit (MULTUNIT *dividend) { - unsigned long q, q0, q1, q2; - unsigned short lsb_factor; - -/* - * Compute the least significant product group. - *The last terms of q1 and q2 perform upward rounding, which is - *needed to guarantee that the result not be too small. - */ - q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph - + reciph; - q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl - + (1L << MULTUNITSIZE) ; -#ifdef ASM_PROTOTYPE - lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE); - q = q1 + q2; - - /* The following statement is equivalent to shifting the sum right - one bit while shifting in the carry bit. - */ - q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); -#else /* optimized C version */ - q0 = (q1>>1) + (q2>>1) + 1; -#endif - -/* Compute the middle significant product group. */ - - q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph; - q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl; -#ifdef ASM_PROTOTYPE - q = q1 + q2; - q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); - -/* Add in the most significant word of the first group. - The last term takes care of the carry from adding the lsb's - that were shifted out prior to addition. -*/ - q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2)); -#else /* optimized C version */ - q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1; -#endif - -/* Compute the most significant term and add in the others */ - - q = (q >> (MULTUNITSIZE-2)) + - (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1); - q >>= mshift; - -/* Prevent overflow and then wipe out the intermediate results. */ - - mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1); - q= q0 = q1 = q2 = 0; lsb_factor = 0; (void)lsb_factor; - return mutemp; -} -#endif /* mp_quo_digit */ - -/* - * stage_smith_modulus() - Prepare for a Smith modmult. - * - * Calculate the reciprocal of modulus with a precision of two MULTUNITs. - * Assumes that global_precision has already been adjusted to the - * size of the modulus, plus SLOP_BITS. - * - * Note: This routine was designed to work with large values and - * doesn't have the necessary testing or handling to work with a - * modulus having less than three significant units. For such cases, - * the separate multiply and modulus routines can be used. - * - * stage_smith_modulus() is aliased to stage_modulus(). - */ - -int stage_smith_modulus(unitptr n_modulus) -{ - int original_precision; - int sigmod; /* significant units in modulus */ - unitptr mp; /* modulus most significant pointer */ - MULTUNIT *mpm; /* reciprocal pointer */ - int prec; /* precision of reciprocal calc in units */ - - mp_move(modulus, n_modulus); - modmpl = (MULTUNIT*) modulus; - modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit); - nbits = countbits(modulus); - modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE; - - original_precision = global_precision; - - /* The following code copies the three most significant units of - * modulus to mod_divisor. - */ - mp = modulus; - sigmod = significance (modulus); - rescale (mp, original_precision, sigmod); -/* prec is the unit precision required for 3 MULTUNITs */ - prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit; - set_precision (prec); - - /* set mp = ptr to most significant units of modulus, then move - * the most significant units to mp_divisor - */ - mp = msbptr(mp,sigmod) -tohigher(prec-1); - unmake_lsbptr (mp, prec); - mp_move (mod_divisor, mp); - - /* Keep 2*MULTUNITSIZE bits in mod_divisor. - * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits. - */ - mshift = countbits (mod_divisor) - 2*MULTUNITSIZE; - mp_shift_right_bits (mod_divisor, mshift); - mp_recip(mod_quotient,mod_divisor); - mp_shift_right_bits (mod_quotient,1); - - /* Reduce to: 0 < mshift <= MULTUNITSIZE */ - mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1; - /* round up, rescaling if necessary */ - mp_inc (mod_quotient); - if (countbits (mod_quotient) > 2*MULTUNITSIZE) { - mp_shift_right_bits (mod_quotient,1); - mshift--; /* now 0 <= mshift <= MULTUNITSIZE */ - } - mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit); - recipl = *post_higherunit (mpm); - reciph = *mpm; - mp_set_recip (reciph, recipl, mshift); - set_precision (original_precision); - return 0; /* normal return */ -} /* stage_smith_modulus */ - -/* Smith's algorithm performs a multiply combined with a modulo operation. - Computes: prod = (multiplicand*multiplier) mod modulus - The modulus must first be set by stage_smith_modulus(). - smith_modmult() is aliased to mp_modmult(). -*/ - -int -smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) -{ - unitptr d; /* ptr to product */ - MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first) - * aligned for subtraction */ -/* Note that dmph points one MULTUNIT higher than indicated by - global precision. This allows us to zero out a word one higher than - the normal precision. -*/ - short orig_precision; - short nqd; /* number of quotient digits remaining to - * be generated */ - short dmi; /* number of significant MULTUNITs in product */ - - d = ds_data + 1; /* room for leading MSB if HIGHFIRST */ - orig_precision = global_precision; - mp_dmul(d, multiplicand, multiplier); - - rescale(d, orig_precision * 2, orig_precision * 2 + 1); - set_precision(orig_precision * 2 + 1); - *msbptr(d, global_precision) = 0; /* leading 0 unit */ - -/* We now start working with MULTUNITs. - Determine the most significant MULTUNIT of the product so we don't - have to process leading zeros in our divide loop. -*/ - dmi = significance(d) * MULTUNITs_perunit; - if (dmi >= modlenMULTUNITS) { - /* Make dividend negative. This allows the use of mp_smul to - * "subtract" the product of the modulus and the trial divisor - * by actually adding to a negative dividend. - * The one's complement of the dividend is used, since it causes - * a zero value to be represented as all ones. This facilitates - * testing the result for possible overflow, since a sign bit - * indicates that no adjustment is necessary, and we should not - * attempt to adjust if the result of the addition is zero. - */ - mp_inc(d); - mp_neg(d); - set_precision(orig_precision); - munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; - - /* Set msb, lsb, and normal ptrs of dividend */ - dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) * - MULTUNITs_perunit) + tohigher(dmi); - nqd = dmi + 1 - modlenMULTUNITS; - dmpl = dmph - tohigher(modlenMULTUNITS); - -/* - * Divide loop. - * Each iteration computes the next quotient MULTUNIT digit, then - * multiplies the divisor (modulus) by the quotient digit and adds - * it to the one's complement of the dividend (equivalent to - * subtracting). If the product was greater than the remaining dividend, - * we get a non-negative result, in which case we subtract off the - * modulus to get the proper negative remainder. - */ - for (; nqd; nqd--) { - MULTUNIT q; /* quotient trial digit */ - - q = mp_quo_digit(dmph); - if (q > 0) { - mp_smul(dmpl, modmpl, q); - - /* Perform correction if q too large. - * This rarely occurs. - */ - if (!(*dmph & MULTUNIT_msb)) { - dmp = dmpl; - unmake_lsbptr(dmp, orig_precision * - MULTUNITs_perunit); - if (mp_musubb(dmp, - (MULTUNIT *) modulus, 0)) - (*dmph) --; - } - } - pre_lowerunit(dmph); - pre_lowerunit(dmpl); - } - /* d contains the one's complement of the remainder. */ - rescale(d, orig_precision * 2 + 1, orig_precision); - set_precision(orig_precision); - mp_neg(d); - mp_dec(d); - } else { - /* Product was less than modulus. Return it. */ - rescale(d, orig_precision * 2 + 1, orig_precision); - set_precision(orig_precision); - } - mp_move(prod, d); - return (0); /* normal return */ -} /* smith_modmult */ - - -/* Smith's mp_modmult function leaves some internal arrays in memory, - so we have to call modmult_burn() at the end of mp_modexp. - This is so that no cryptographically sensitive data is left in memory - after the program exits. - smith_burn() is aliased to modmult_burn(). -*/ -void smith_burn(void) -{ - empty_array (modulus); - empty_array (ds_data); - empty_array (mod_quotient); - empty_array (mod_divisor); - modmpl = 0; - mshift =nbits = 0; - reciph = recipl = 0; - modlenMULTUNITS = mutemp = 0; - mp_set_recip (0,0,0); -} /* smith_burn */ - -/* End of Thad Smith's implementation of modmult. */ - -#endif /* SMITH */ - - -int countbits(unitptr r) - /* Returns number of significant bits in r */ -{ - int bits; - short prec; - register unit bitmask; - init_bitsniffer(r,bitmask,prec,bits); - return bits; -} /* countbits */ - - -char *copyright_notice(void) -/* force linker to include copyright notice in the executable object image. */ -{ return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */ - - -int mp_modexp(register unitptr expout,register unitptr expin, - register unitptr exponent,register unitptr modulus) -/* - * Russian peasant combined exponentiation/modulo algorithm. - * Calls modmult instead of mult. - * Computes: expout = (expin**exponent) mod modulus - * WARNING: All the arguments must be less than the modulus! - */ -{ - int bits; - short oldprecision; - register unit bitmask; - unit product[MAX_UNIT_PRECISION]; - short eprec; - -#ifdef COUNTMULTS - tally_modmults = 0; /* clear "number of modmults" counter */ - tally_modsquares = 0; /* clear "number of modsquares" counter */ -#endif /* COUNTMULTS */ - mp_init(expout,1); - if (testeq(exponent,0)) { - if (testeq(expin,0)) - return -1; /* 0 to the 0th power means return error */ - return 0; /* otherwise, zero exponent means expout is 1 */ - } - if (testeq(modulus,0)) - return -2; /* zero modulus means error */ -#if SLOP_BITS > 0 /* if there's room for sign bits */ - if (mp_tstminus(modulus)) - return -2; /* negative modulus means error */ -#endif /* SLOP_BITS > 0 */ - if (mp_compare(expin,modulus) >= 0) - return -3; /* if expin >= modulus, return error */ - if (mp_compare(exponent,modulus) >= 0) - return -4; /* if exponent >= modulus, return error */ - - oldprecision = global_precision; /* save global_precision */ - /* set smallest optimum precision for this modulus */ - set_precision(bits2units(countbits(modulus)+SLOP_BITS)); - /* rescale all these registers to global_precision we just defined */ - rescale(modulus,oldprecision,global_precision); - rescale(expin,oldprecision,global_precision); - rescale(exponent,oldprecision,global_precision); - rescale(expout,oldprecision,global_precision); - - if (stage_modulus(modulus)) { - set_precision(oldprecision); /* restore original precision */ - return -5; /* unstageable modulus (STEWART algorithm) */ - } - - /* normalize and compute number of bits in exponent first */ - init_bitsniffer(exponent,bitmask,eprec,bits); - - /* We can "optimize out" the first modsquare and modmult: */ - bits--; /* We know for sure at this point that bits>0 */ - mp_move(expout,expin); /* expout = (1*1)*expin; */ - bump_bitsniffer(exponent,bitmask); - - while (bits--) { - poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */ -#ifdef COUNTMULTS - tally_modsquares++; /* bump "number of modsquares" counter */ -#endif /* COUNTMULTS */ - mp_modsquare(product,expout); - if (sniff_bit(exponent,bitmask)) { - mp_modmult(expout,product,expin); -#ifdef COUNTMULTS - tally_modmults++; /* bump "number of modmults" counter */ -#endif /* COUNTMULTS */ - } else { - mp_move(expout,product); - } - bump_bitsniffer(exponent,bitmask); - } /* while bits-- */ - mp_burn(product); /* burn the evidence on the stack */ - modmult_burn(); /* ask mp_modmult to also burn its own evidence */ - -#ifdef COUNTMULTS /* diagnostic analysis */ - { - long atomic_mults; - unsigned int unitcount,totalmults; - unitcount = bits2units(countbits(modulus)); - /* calculation assumes modsquare takes as long as a modmult: */ - atomic_mults = (long) tally_modmults * (unitcount * unitcount); - atomic_mults += (long) tally_modsquares * (unitcount * unitcount); - printf("%ld atomic mults for ",atomic_mults); - printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n", - tally_modsquares,tally_modmults, - tally_modsquares+tally_modmults, - countbits(modulus), unitcount); - } -#endif /* COUNTMULTS */ - - set_precision(oldprecision); /* restore original precision */ - - /* Do an explicit reference to the copyright notice so that the linker - will be forced to include it in the executable object image... */ - copyright_notice(); /* has no real effect at run time */ - - return 0; /* normal return */ -} /* mp_modexp */ - -int mp_modexp_crt(unitptr expout, unitptr expin, - unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u) -/* - * This is a faster modexp for moduli with a known factorisation into two - * relatively prime factors p and q, and an input relatively prime to the - * modulus, the Chinese Remainder Theorem to do the computation mod p and - * mod q, and then combine the results. This relies on a number of - * precomputed values, but does not actually require the modulus n or the - * exponent e. - * - * expout = expin ^ e mod (p*q). - * We form this by evaluating - * p2 = (expin ^ e) mod p and - * q2 = (expin ^ e) mod q - * and then combining the two by the CRT. - * - * Two optimisations of this are possible. First, we can reduce expin - * modulo p and q before starting. - * - * Second, since we know the factorisation of p and q (trivially derived - * from the factorisation of n = p*q), and expin is relatively prime to - * both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m), - * to throw away multiples of phi(p) or phi(q) in e. - * Letting ep = e mod phi(p) and - * eq = e mod phi(q) - * then combining these two speedups, we only need to evaluate - * p2 = ((expin mod p) ^ ep) mod p and - * q2 = ((expin mod q) ^ eq) mod q. - * - * Now we need to apply the CRT. Starting with - * expout = p2 (mod p) and - * expout = q2 (mod q) - * we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p, - * then 0 <= expout < p*q for some 0 <= k < q. Since we want expout = q2 - * (mod q), then p*k = q2-p2 (mod q). Since p and q are relatively prime, - * p has a multiplicative inverse u mod q. In other words, u = 1/p (mod q). - * - * Multiplying by u on both sides gives k = u*(q2-p2) (mod q). - * Since we want 0 <= k < q, we can thus find k as - * k = (u * (q2-p2)) mod q. - * - * Once we have k, evaluating p2 + p * k is easy, and - * that gives us the result. - * - * In the detailed implementation, there is a temporary, temp, used to - * hold intermediate results, p2 is held in expout, and q2 is used as a - * temporary in the final step when it is no longer needed. With that, - * you should be able to understand the code below. - */ -{ - unit q2[MAX_UNIT_PRECISION]; - unit temp[MAX_UNIT_PRECISION]; - int status; - -/* First, compiute p2 (physically held in M) */ - -/* p2 = [ (expin mod p) ^ ep ] mod p */ - mp_mod(temp,expin,p); /* temp = expin mod p */ - status = mp_modexp(expout,temp,ep,p); - if (status < 0) { /* mp_modexp returned an error. */ - mp_init(expout,1); - return status; /* error return */ - } - -/* And the same thing for q2 */ - -/* q2 = [ (expin mod q) ^ eq ] mod q */ - mp_mod(temp,expin,q); /* temp = expin mod q */ - status = mp_modexp(q2,temp,eq,q); - if (status < 0) { /* mp_modexp returned an error. */ - mp_init(expout,1); - return status; /* error return */ - } - -/* Now use the multiplicative inverse u to glue together the - two halves. -*/ -#if 0 -/* This optimisation is useful if you commonly get small results, - but for random results and large q, the odds of (1/q) of it - being useful do not warrant the test. -*/ - if (mp_compare(expout,q2) != 0) { -#endif - /* Find q2-p2 mod q */ - if (mp_sub(q2,expout)) /* if the result went negative */ - mp_add(q2,q); /* add q to q2 */ - - /* expout = p2 + ( p * [(q2*u) mod q] ) */ - mp_mult(temp,q2,u); /* q2*u */ - mp_mod(q2,temp,q); /* (q2*u) mod q */ - mp_mult(temp,p,q2); /* p * [(q2*u) mod q] */ - mp_add(expout,temp); /* expout = p2 + p * [...] */ -#if 0 - } -#endif - - mp_burn(q2); /* burn the evidence on the stack...*/ - mp_burn(temp); - return 0; /* normal return */ -} /* mp_modexp_crt */ - - -/****************** end of MPI library ****************************/ +/* C source code for multiprecision arithmetic library routines. + Implemented Nov 86 by Philip Zimmermann + Last revised 27 Nov 91 by PRZ + + Boulder Software Engineering + 3021 Eleventh Street + Boulder, CO 80304 + (303) 541-0140 + + (c) Copyright 1986-1994 by Philip Zimmermann. All rights reserved. + The author assumes no liability for damages resulting from the use + of this software, even if the damage results from defects in this + software. No warranty is expressed or implied. The use of this + cryptographic software for developing weapon systems is expressly + forbidden. + + + These routines implement all of the multiprecision arithmetic + necessary for number-theoretic cryptographic algorithms such as + ElGamal, Diffie-Hellman, Rabin, or factoring studies for large + composite numbers, as well as Rivest-Shamir-Adleman (RSA) public + key cryptography. + + Although originally developed in Microsoft C for the IBM PC, this code + contains few machine dependencies. It assumes 2's complement + arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines, + lowbyte-highbyte order or highbyte-lowbyte order. This version + has been converted to ANSI C. + + + The internal representation for these extended precision integer + "registers" is an array of "units". A unit is a machine word, which + is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit + unsigned integer, depending on the machine's word size. For example, + an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic + on these huge precision integers, we pass pointers to these unit + arrays to various subroutines. A pointer to an array of units is of + type unitptr. This is a pointer to a huge integer "register". + + When calling a subroutine, we always pass a pointer to the BEGINNING + of the array of units, regardless of the byte order of the machine. + On a lowbyte-first machine, such as the Intel 80x86, this unitptr + points to the LEAST significant unit, and the array of units increases + significance to the right. On a highbyte-first machine, such as the + Motorola 680x0, this unitptr points to the MOST significant unit, and + the array of units decreases significance to the right. + + Modified 8 Apr 92 - HAJK + Implement new VAX/VMS primitive support. + + Modified 30 Sep 92 -Castor Fu + Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long) + + Modified 28 Nov 92 - Thad Smith + Added Smith modmult, generalized non-portable support. + */ + +/* #define COUNTMULTS *//* count modmults for performance studies */ + +#ifdef DEBUG +#ifdef MSDOS +#ifdef __GO32__ /* DJGPP */ +#include +#else +#include +#endif /* __GO32__ */ +#define poll_for_break() {while (kbhit()) getch();} +#endif /* MSDOS */ +#endif /* DEBUG */ + +#ifndef poll_for_break +#define poll_for_break() /* stub */ +#endif + +#include "mpilib.h" + +#ifdef mp_smula +#ifdef mp_smul +Error:Both mp_smula and mp_smul cannot be defined. +#else +#define mp_smul mp_smula +#endif +#endif + +/* set macros for MULTUNIT */ +#ifdef MUNIT8 +#define MULTUNITSIZE 8 +typedef unsigned char MULTUNIT; +#ifdef UNIT8 +#define MULTUNIT_SIZE_SAME +#endif +#else /* not MUNIT8 */ +#ifdef MUNIT32 +#define MULTUNITSIZE 32 +typedef unsigned long MULTUNIT; +#ifdef UNIT32 +#define MULTUNIT_SIZE_SAME +#else +/* #error is not portable, this has the same effect */ +#include "UNITSIZE cannot be smaller than MULTUNITSIZE" +#endif +#else /* assume MUNIT16 */ +#define MULTUNITSIZE 16 +typedef unsigned short MULTUNIT; +#ifdef UNIT16 +#define MULTUNIT_SIZE_SAME +#endif /* UNIT16 */ +#ifdef UNIT8 +#include "UNITSIZE cannot be smaller than MULTUNITSIZE" +#endif /* UNIT8 */ +#endif /* MUNIT32 */ +#endif /* MUNIT8 */ + +#define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of + MULTUNIT */ +#define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1)) +#define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1)) +#define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE) + + +void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier); +void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier); + +short global_precision = 0; /* units of precision for all routines */ +/* global_precision is the unit precision last set by set_precision. + Initially, set_precision() should be called to define global_precision + before using any of these other multiprecision library routines. + i.e.: set_precision(MAX_UNIT_PRECISION); + */ + +/*************** multiprecision library primitives ****************/ +/* The following portable C primitives should be recoded in assembly. + The entry point name should be defined, in "mpilib.h" to the external + entry point name. If undefined, the C version will be used. + */ + +typedef unsigned long int ulint; + +#ifndef mp_addc +/* + multiprecision add with carry r2 to r1, result in r1 + carry is incoming carry flag-- value should be 0 or 1 +*/ +boolean mp_addc + (register unitptr r1, register unitptr r2, register boolean carry) +{ + register unit x; + short precision; /* number of units to add */ + precision = global_precision; + make_lsbptr(r1, precision); + make_lsbptr(r2, precision); + while (precision--) { + if (carry) { + x = *r1 + *r2 + 1; + carry = (*r2 >= (unit) (~*r1)); + } else { + x = *r1 + *r2; + carry = (x < *r1); + } + post_higherunit(r2); + *post_higherunit(r1) = x; + } + return carry; /* return the final carry flag bit */ +} /* mp_addc */ +#endif /* mp_addc */ + +#ifndef mp_subb + +/* + multiprecision subtract with borrow, r2 from r1, result in r1 + borrow is incoming borrow flag-- value should be 0 or 1 + */ +boolean mp_subb + (register unitptr r1, register unitptr r2, register boolean borrow) +{ + register unit x; + short precision; /* number of units to subtract */ + precision = global_precision; + make_lsbptr(r1, precision); + make_lsbptr(r2, precision); + while (precision--) { + if (borrow) { + x = *r1 - *r2 - 1; + borrow = (*r1 <= *r2); + } else { + x = *r1 - *r2; + borrow = (*r1 < *r2); + } + post_higherunit(r2); + *post_higherunit(r1) = x; + } + return borrow; /* return the final carry/borrow flag bit */ +} /* mp_subb */ +#endif /* mp_subb */ + +#ifndef mp_rotate_left + +/* + multiprecision rotate left 1 bit with carry, result in r1. + carry is incoming carry flag-- value should be 0 or 1 +*/ +boolean mp_rotate_left(register unitptr r1, register boolean carry) +{ + register int precision; /* number of units to rotate */ + unsigned int mcarry = carry, nextcarry; /* int is supposed to be + * the efficient size for ops*/ + precision = global_precision; + make_lsbptr(r1, precision); + while (precision--) { + nextcarry = (((signedunit) * r1) < 0); + *r1 = (*r1 << 1) | mcarry; + mcarry = nextcarry; + pre_higherunit(r1); + } + return nextcarry; /* return the final carry flag bit */ +} /* mp_rotate_left */ +#endif /* mp_rotate_left */ + +/************** end of primitives that should be in assembly *************/ + +/* The mp_shift_right_bits function is not called in any time-critical + situations in public-key cryptographic functions, so it doesn't + need to be coded in assembly language. + */ + +/* + multiprecision shift right bits, result in r1. + bits is how many bits to shift, must be <= UNITSIZE. +*/ +void mp_shift_right_bits(register unitptr r1, register short bits) +{ + register short precision; /* number of units to shift */ + register unit carry, nextcarry, bitmask; + register short unbits; + if (bits == 0) + return; /* shift zero bits is a no-op */ + carry = 0; + bitmask = power_of_2(bits) - 1; + unbits = UNITSIZE - bits; /* shift bits must be <= UNITSIZE */ + precision = global_precision; + make_msbptr(r1, precision); + if (bits == UNITSIZE) { + while (precision--) { + nextcarry = *r1; + *r1 = carry; + carry = nextcarry; + pre_lowerunit(r1); + } + } else { + while (precision--) { + nextcarry = *r1 & bitmask; + *r1 >>= bits; + *r1 |= carry << unbits; + carry = nextcarry; + pre_lowerunit(r1); + } + } +} /* mp_shift_right_bits */ + +#ifndef mp_compare + +/* + * Compares multiprecision integers *r1, *r2, and returns: + * -1 iff *r1 < *r2 + * 0 iff *r1 == *r2 + * +1 iff *r1 > *r2 + */ +short mp_compare(register unitptr r1, register unitptr r2) +{ + register short precision; /* number of units to compare */ + + precision = global_precision; + make_msbptr(r1, precision); + make_msbptr(r2, precision); + do { + if (*r1 < *r2) + return -1; + if (*post_lowerunit(r1) > *post_lowerunit(r2)) + return 1; + } while (--precision); + return 0; /* *r1 == *r2 */ +} /* mp_compare */ +#endif /* mp_compare */ + +/* Increment multiprecision integer r. */ +boolean mp_inc(register unitptr r) +{ + register short precision; + precision = global_precision; + make_lsbptr(r, precision); + do { + if (++(*r)) + return 0; /* no carry */ + post_higherunit(r); + } while (--precision); + return 1; /* return carry set */ +} /* mp_inc */ + +/* Decrement multiprecision integer r. */ +boolean mp_dec(register unitptr r) +{ + register short precision; + precision = global_precision; + make_lsbptr(r, precision); + do { + if ((signedunit) (--(*r)) != (signedunit) - 1) + return 0; /* no borrow */ + post_higherunit(r); + } while (--precision); + return 1; /* return borrow set */ +} /* mp_dec */ + +/* Compute 2's complement, the arithmetic negative, of r */ +void mp_neg(register unitptr r) +{ + register short precision; /* number of units to negate */ + precision = global_precision; + mp_dec(r); /* 2's complement is 1's complement plus 1 */ + do { /* now do 1's complement */ + *r = ~(*r); + r++; + } while (--precision); +} /* mp_neg */ + +#ifndef mp_move + +void mp_move(register unitptr dst, register unitptr src) +{ + register short precision; /* number of units to move */ + precision = global_precision; + do { + *dst++ = *src++; + } while (--precision); +} /* mp_move */ +#endif /* mp_move */ + +/* Init multiprecision register r with short value. */ +void mp_init(register unitptr r, word16 value) +{ /* Note that mp_init doesn't extend sign bit for >32767 */ + + unitfill0(r, global_precision); + make_lsbptr(r, global_precision); + *post_higherunit(r) = value; +#ifdef UNIT8 + *post_higherunit(r) = value >> UNITSIZE; +#endif +} /* mp_init */ + +/* Returns number of significant units in r */ +short significance(register unitptr r) +{ + register short precision; + precision = global_precision; + make_msbptr(r, precision); + do { + if (*post_lowerunit(r)) + return precision; + } while (--precision); + return precision; +} /* significance */ + + +#ifndef unitfill0 + +/* Zero-fill the unit buffer r. */ +void unitfill0(unitptr r, word16 unitcount) +{ + while (unitcount--) + *r++ = 0; +} /* unitfill0 */ +#endif /* unitfill0 */ + +/* Unsigned divide, treats both operands as positive. */ +int mp_udiv(register unitptr remainder, register unitptr quotient, + register unitptr dividend, register unitptr divisor) +{ + int bits; + short dprec; + register unit bitmask; + + if (testeq(divisor, 0)) + return -1; /* zero divisor means divide error */ + mp_init0(remainder); + mp_init0(quotient); + /* normalize and compute number of bits in dividend first */ + init_bitsniffer(dividend, bitmask, dprec, bits); + /* rescale quotient to same precision (dprec) as dividend */ + rescale(quotient, global_precision, dprec); + make_msbptr(quotient, dprec); + + while (bits--) { + mp_rotate_left(remainder, + (boolean) (sniff_bit(dividend, bitmask) != 0)); + if (mp_compare(remainder, divisor) >= 0) { + mp_sub(remainder, divisor); + stuff_bit(quotient, bitmask); + } + bump_2bitsniffers(dividend, quotient, bitmask); + } + return 0; +} /* mp_udiv */ + +#ifdef UPTON_OR_SMITH + +#define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */ + +/* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */ +int mp_recip(register unitptr quotient, register unitptr divisor) +{ + int bits; + short qprec; + register unit bitmask; + unit remainder[MAX_UNIT_PRECISION]; + if (testeq(divisor, 0)) + return -1; /* zero divisor means divide error */ + mp_init0(remainder); + mp_init0(quotient); + + /* normalize and compute number of bits in quotient first */ + bits = countbits(divisor) + RECIPMARGIN; + bitmask = bitmsk(bits); /* bitmask within a single unit */ + qprec = bits2units(bits + 1); + mp_setbit(remainder, (bits - RECIPMARGIN) - 1); + /* rescale quotient to precision of divisor + RECIPMARGIN bits */ + rescale(quotient, global_precision, qprec); + make_msbptr(quotient, qprec); + + while (bits--) { + mp_shift_left(remainder); + if (mp_compare(remainder, divisor) >= 0) { + mp_sub(remainder, divisor); + stuff_bit(quotient, bitmask); + } + bump_bitsniffer(quotient, bitmask); + } + mp_init0(remainder); /* burn sensitive data left on stack */ + return 0; +} /* mp_recip */ +#endif /* UPTON_OR_SMITH */ + +/* Signed divide, either or both operands may be negative. */ +int mp_div(register unitptr remainder, register unitptr quotient, + register unitptr dividend, register unitptr divisor) +{ + boolean dvdsign, dsign; + int status; + dvdsign = (boolean) (mp_tstminus(dividend) != 0); + dsign = (boolean) (mp_tstminus(divisor) != 0); + if (dvdsign) + mp_neg(dividend); + if (dsign) + mp_neg(divisor); + status = mp_udiv(remainder, quotient, dividend, divisor); + if (dvdsign) + mp_neg(dividend); /* repair caller's dividend */ + if (dsign) + mp_neg(divisor); /* repair caller's divisor */ + if (status < 0) + return status; /* divide error? */ + if (dvdsign) + mp_neg(remainder); + if (dvdsign ^ dsign) + mp_neg(quotient); + return status; +} /* mp_div */ + +/* + * This function does a fast divide and mod on a multiprecision dividend + * using a short integer divisor returning a short integer remainder. + * This is an unsigned divide. It treats both operands as positive. + * It is used mainly for faster printing of large numbers in base 10. + */ +word16 mp_shortdiv(register unitptr quotient, + register unitptr dividend, register word16 divisor) +{ + int bits; + short dprec; + register unit bitmask; + register word16 remainder; + if (!divisor) /* if divisor == 0 */ + return -1; /* zero divisor means divide error */ + remainder = 0; + mp_init0(quotient); + /* normalize and compute number of bits in dividend first */ + init_bitsniffer(dividend, bitmask, dprec, bits); + /* rescale quotient to same precision (dprec) as dividend */ + rescale(quotient, global_precision, dprec); + make_msbptr(quotient, dprec); + + while (bits--) { + remainder <<= 1; + if (sniff_bit(dividend, bitmask)) + remainder++; + if (remainder >= divisor) { + remainder -= divisor; + stuff_bit(quotient, bitmask); + } + bump_2bitsniffers(dividend, quotient, bitmask); + } + return remainder; +} /* mp_shortdiv */ + +/* Unsigned divide, treats both operands as positive. */ +int mp_mod(register unitptr remainder, + register unitptr dividend, register unitptr divisor) +{ + int bits; + short dprec; + register unit bitmask; + if (testeq(divisor, 0)) + return -1; /* zero divisor means divide error */ + mp_init0(remainder); + /* normalize and compute number of bits in dividend first */ + init_bitsniffer(dividend, bitmask, dprec, bits); + + while (bits--) { + mp_rotate_left(remainder, + (boolean) (sniff_bit(dividend, bitmask) != 0)); + msub(remainder, divisor); + bump_bitsniffer(dividend, bitmask); + } + return 0; +} /* mp_mod */ + +/* + * This function does a fast mod operation on a multiprecision dividend + * using a short integer modulus returning a short integer remainder. + * This is an unsigned divide. It treats both operands as positive. + * It is used mainly for fast sieve searches for large primes. + */ +word16 mp_shortmod(register unitptr dividend, register word16 divisor) +{ + int bits; + short dprec; + register unit bitmask; + register word16 remainder; + if (!divisor) /* if divisor == 0 */ + return -1; /* zero divisor means divide error */ + remainder = 0; + /* normalize and compute number of bits in dividend first */ + init_bitsniffer(dividend, bitmask, dprec, bits); + + while (bits--) { + remainder <<= 1; + if (sniff_bit(dividend, bitmask)) + remainder++; + if (remainder >= divisor) + remainder -= divisor; + bump_bitsniffer(dividend, bitmask); + } + return remainder; +} /* mp_shortmod */ + + + +#ifdef COMB_MULT /* use faster "comb" multiply algorithm */ +/* We are skipping this code because it has a bug... */ + +/* + * Computes multiprecision prod = multiplicand * multiplier + * + * Uses interleaved comb multiply algorithm. + * This improved multiply more than twice as fast as a Russian + * peasant multiply, because it does a lot fewer shifts. + * Must have global_precision set to the size of the multiplicand + * plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum + * of the lengths of the multiplier and multiplicand. + * + * BUG ALERT: Unfortunately, this code has a bug. It fails for + * some numbers. One such example: + * x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 + * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D + * 52C8 CDC7 AFB3 61C8 243C 741B + * --which is obviously wrong. The answer should be: + * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A + * C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 + * We'll have to fix this some day. In the meantime, we'll + * just have the compiler skip over this code. + * + * BUG NOTE: Possibly fixed. Needs testing. + */ +int mp_mult(register unitptr prod, + register unitptr multiplicand, register unitptr multiplier) +{ + int bits; + register unit bitmask; + unitptr product, mplier, temp; + short mprec, mprec2; + unit mplicand[MAX_UNIT_PRECISION]; + + /* better clear full width--double precision */ + mp_init(prod + tohigher(global_precision), 0); + + if (testeq(multiplicand, 0)) + return 0; /* zero multiplicand means zero product */ + + mp_move(mplicand, multiplicand); /* save it from damage */ + + normalize(multiplier, mprec); + if (!mprec) + return 0; + + make_lsbptr(multiplier, mprec); + bitmask = 1; /* start scan at LSB of multiplier */ + + do { /* UNITSIZE times */ + /* do for bits 0-15 */ + product = prod; + mplier = multiplier; + mprec2 = mprec; + while (mprec2--) { /* do for each word in multiplier */ + + if (sniff_bit(mplier, bitmask)) { + if (mp_addc(product, multiplicand, 0)) { /* ripple carry */ + /* After 1st time thru, this is rarely encountered. */ + temp = msbptr(product, global_precision); + pre_higherunit(temp); + /* temp now points to LSU of carry region. */ + unmake_lsbptr(temp, global_precision); + mp_inc(temp); + } /* ripple carry */ + } + pre_higherunit(mplier); + pre_higherunit(product); + } + if (!(bitmask <<= 1)) + break; + mp_shift_left(multiplicand); + + } while (TRUE); + + mp_move(multiplicand, mplicand); /* recover */ + + return 0; /* normal return */ +} /* mp_mult */ + +#endif /* COMB_MULT */ + + +/* Because the "comb" multiply has a bug, we will use the slower + Russian peasant multiply instead. Fortunately, the mp_mult + function is not called from any time-critical code. + */ + +/* + * Computes multiprecision prod = multiplicand * multiplier + * Uses "Russian peasant" multiply algorithm. + */ +int mp_mult(register unitptr prod, + register unitptr multiplicand, register unitptr multiplier) +{ + int bits; + register unit bitmask; + short mprec; + mp_init(prod, 0); + if (testeq(multiplicand, 0)) + return 0; /* zero multiplicand means zero product */ + /* normalize and compute number of bits in multiplier first */ + init_bitsniffer(multiplier, bitmask, mprec, bits); + + while (bits--) { + mp_shift_left(prod); + if (sniff_bit(multiplier, bitmask)) + mp_add(prod, multiplicand); + bump_bitsniffer(multiplier, bitmask); + } + return 0; +} /* mp_mult */ + +/* + * mp_modmult computes a multiprecision multiply combined with a + * modulo operation. This is the most time-critical function in + * this multiprecision arithmetic library for performing modulo + * exponentiation. We experimented with different versions of modmult, + * depending on the machine architecture and performance requirements. + * We will either use a Russian Peasant modmult (peasant_modmult), + * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's + * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult). + * On machines with a hardware atomic multiply instruction, + * Smith's modmult is fastest. It can utilize assembly subroutines to + * speed up the hardware multiply logic and trial quotient calculation. + * If the machine lacks a fast hardware multiply, Merritt's modmult + * is preferred, which doesn't call any assembly multiply routine. + * We use the alias names mp_modmult, stage_modulus, and modmult_burn + * for the corresponding true names, which depend on what flavor of + * modmult we are using. + * + * Before making the first call to mp_modmult, you must set up the + * modulus-dependant precomputated tables by calling stage_modulus. + * After making all the calls to mp_modmult, you call modmult_burn to + * erase the tables created by stage_modulus that were left in memory. + */ + +#ifdef COUNTMULTS +/* "number of modmults" counters, used for performance studies. */ +static unsigned int tally_modmults = 0; +static unsigned int tally_modsquares = 0; +#endif /* COUNTMULTS */ + + +#ifdef PEASANT +/* Conventional Russian peasant multiply with modulo algorithm. */ + +static unitptr pmodulus = 0; /* used only by mp_modmult */ + +/* + * Must pass modulus to stage_modulus before calling modmult. + * Assumes that global_precision has already been adjusted to the + * size of the modulus, plus SLOP_BITS. + */ +int stage_peasant_modulus(unitptr n) +{ /* For this simple version of modmult, just copy unit pointer. */ + pmodulus = n; + return 0; /* normal return */ +} /* stage_peasant_modulus */ + +/* "Russian peasant" multiply algorithm, combined with a modulo + * operation. This is a simple naive replacement for Merritt's + * faster modmult algorithm. References global unitptr "modulus". + * Computes: prod = (multiplicand*multiplier) mod modulus + * WARNING: All the arguments must be less than the modulus! + */ +int peasant_modmult(register unitptr prod, + unitptr multiplicand, register unitptr multiplier) +{ + int bits; + register unit bitmask; + short mprec; + mp_init(prod, 0); +/* if (testeq(multiplicand,0)) + return 0; *//* zero multiplicand means zero product */ + /* normalize and compute number of bits in multiplier first */ + init_bitsniffer(multiplier, bitmask, mprec, bits); + + while (bits--) { + mp_shift_left(prod); + msub(prod, pmodulus); /* turns mult into modmult */ + if (sniff_bit(multiplier, bitmask)) { + mp_add(prod, multiplicand); + msub(prod, pmodulus); /* turns mult into modmult */ + } + bump_bitsniffer(multiplier, bitmask); + } + return 0; +} /* peasant_modmult */ + +/* + * If we are using a version of mp_modmult that uses internal tables + * in memory, we have to call modmult_burn() at the end of mp_modexp. + * This is so that no sensitive data is left in memory after the program + * exits. The Russian peasant method doesn't use any such tables. + */ + +/* + * Alias for modmult_burn, called only from mp_modexp(). Destroys + * internal modmult tables. This version does nothing because no + * tables are used by the Russian peasant modmult. + */ +void peasant_burn(void) +{ +} /* peasant_burn */ + +#endif /* PEASANT */ + + +#ifdef MERRITT +/*=========================================================================*/ +/* + This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ. + Also refined by Zhahai Stewart to reduce the number of subtracts. + Modified by Raymond Brand to reduce the number of SLOP_BITS by 1. + It performs a multiply combined with a modulo operation, without + going into "double precision". It is faster than the Russian peasant + method, and still works well on machines that lack a fast hardware + multiply instruction. + */ + +/* The following support functions, macros, and data structures + are used only by Merritt's modmult algorithm... */ + +/* Shift r1 1 whole unit to the left. Used only by modmult function. */ +static void mp_lshift_unit(register unitptr r1) +{ + register short precision; + register unitptr r2; + precision = global_precision; + make_msbptr(r1, precision); + r2 = r1; + while (--precision) + *post_lowerunit(r1) = *pre_lowerunit(r2); + *r1 = 0; +} /* mp_lshift_unit */ + +/* moduli_buf contains shifted images of the modulus, set by stage_modulus */ +static unit moduli_buf[UNITSIZE - 1][MAX_UNIT_PRECISION] = +{0}; +static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */ +{0, moduli_buf[0], moduli_buf[1], moduli_buf[2], + moduli_buf[3], moduli_buf[4], moduli_buf[5], moduli_buf[6] +#ifndef UNIT8 + ,moduli_buf[7], moduli_buf[8], moduli_buf[9], moduli_buf[10], + moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14] +#ifndef UNIT16 /* and not UNIT8 */ + ,moduli_buf[15], moduli_buf[16], moduli_buf[17], moduli_buf[18], + moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22], + moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26], + moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30] +#endif /* UNIT16 and UNIT8 not defined */ +#endif /* UNIT8 not defined */ +}; + +/* + * To optimize msubs, need following 2 unit arrays, each filled + * with the most significant unit and next-to-most significant unit + * of the preshifted images of the modulus. + */ +static unit msu_moduli[UNITSIZE] = +{0}; /* most signif. unit */ +static unit nmsu_moduli[UNITSIZE] = +{0}; /* next-most signif. unit */ + +/* + * mpdbuf contains preshifted images of the multiplicand, mod n. + * It is used only by mp_modmult. It could be staticly declared + * inside of mp_modmult, but we put it outside mp_modmult so that + * it can be wiped clean by modmult_burn(), which is called at the + * end of mp_modexp. This is so that no sensitive data is left in + * memory after the program exits. + */ +static unit mpdbuf[UNITSIZE - 1][MAX_UNIT_PRECISION] = +{0}; + +/* + * Computes UNITSIZE images of r, each shifted left 1 more bit. + * Used only by modmult function. + */ +static void stage_mp_images(unitptr images[UNITSIZE], unitptr r) +{ + short int i; + + images[0] = r; /* no need to move the first image, just copy ptr */ + for (i = 1; i < UNITSIZE; i++) { + mp_move(images[i], images[i - 1]); + mp_shift_left(images[i]); + msub(images[i], moduli[0]); + } +} /* stage_mp_images */ + +/* + * Computes UNITSIZE images of modulus, each shifted left 1 more bit. + * Before calling mp_modmult, you must first stage the moduli images by + * calling stage_modulus. n is the pointer to the modulus. + * Assumes that global_precision has already been adjusted to the + * size of the modulus, plus SLOP_BITS. + */ +int stage_merritt_modulus(unitptr n) +{ + short int i; + unitptr msu; /* ptr to most significant unit, for faster msubs */ + + moduli[0] = n; /* no need to move the first image, just copy ptr */ + + /* used by optimized msubs macro... */ + msu = msbptr(n, global_precision); /* needed by msubs */ + msu_moduli[0] = *post_lowerunit(msu); + nmsu_moduli[0] = *msu; + + for (i = 1; i < UNITSIZE; i++) { + mp_move(moduli[i], moduli[i - 1]); + mp_shift_left(moduli[i]); + + /* used by optimized msubs macro... */ + msu = msbptr(moduli[i], global_precision); /* needed by msubs */ + msu_moduli[i] = *post_lowerunit(msu); /* for faster msubs */ + nmsu_moduli[i] = *msu; + } + return 0; /* normal return */ +} /* stage_merritt_modulus */ + +/* The following macros, sniffadd and msubs, are used by modmult... */ +#define sniffadd(i) if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i]) + +/* Unoptimized msubs macro (msubs0) follows... */ +/* #define msubs0(i) msub(prod,moduli[i]) + */ + +/* + * To optimize msubs, compare the most significant units of the + * product and the shifted modulus before deciding to call the full + * mp_compare routine. Better still, compare the upper two units + * before deciding to call mp_compare. + * Optimization of msubs relies heavily on standard C left-to-right + * parsimonious evaluation of logical expressions. + */ + +/* Partially-optimized msubs macro (msubs1) follows... */ +/* #define msubs1(i) if ( \ + ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \ + (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \ + ) mp_sub(prod,moduli[i]) + */ + +/* Fully-optimized msubs macro (msubs2) follows... */ +#define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \ + (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \ + (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \ + mp_sub(prod,moduli[i]) + +/* + * Performs combined multiply/modulo operation. + * Computes: prod = (multiplicand*multiplier) mod modulus + * WARNING: All the arguments must be less than the modulus! + * Assumes the modulus has been predefined by first calling + * stage_modulus. + */ +int merritt_modmult(register unitptr prod, + unitptr multiplicand, register unitptr multiplier) +{ + /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro... */ + register signedunit p_m; + register unitptr msu_prod; /* ptr to most significant unit of product */ + register unitptr nmsu_prod; /* next-most signif. unit of product */ + short mprec; /* precision of multiplier, in units */ + /* Array mpd contains a list of pointers to preshifted images of + the multiplicand: */ + static unitptr mpd[UNITSIZE] = + { + 0, mpdbuf[0], mpdbuf[1], mpdbuf[2], + mpdbuf[3], mpdbuf[4], mpdbuf[5], mpdbuf[6] +#ifndef UNIT8 + ,mpdbuf[7], mpdbuf[8], mpdbuf[9], mpdbuf[10], + mpdbuf[11], mpdbuf[12], mpdbuf[13], mpdbuf[14] +#ifndef UNIT16 /* and not UNIT8 */ + ,mpdbuf[15], mpdbuf[16], mpdbuf[17], mpdbuf[18], + mpdbuf[19], mpdbuf[20], mpdbuf[21], mpdbuf[22], + mpdbuf[23], mpdbuf[24], mpdbuf[25], mpdbuf[26], + mpdbuf[27], mpdbuf[28], mpdbuf[29], mpdbuf[30] +#endif /* UNIT16 and UNIT8 not defined */ +#endif /* UNIT8 not defined */ + }; + + /* Compute preshifted images of multiplicand, mod n: */ + stage_mp_images(mpd, multiplicand); + + /* To optimize msubs, set up msu_prod and nmsu_prod: */ + msu_prod = msbptr(prod, global_precision); /* Get ptr to MSU of prod */ + nmsu_prod = msu_prod; + post_lowerunit(nmsu_prod); /* Get next-MSU of prod */ + + /* + * To understand this algorithm, it would be helpful to first + * study the conventional Russian peasant modmult algorithm. + * This one does about the same thing as Russian peasant, but + * is organized differently to save some steps. It loops + * through the multiplier a word (unit) at a time, instead of + * a bit at a time. It word-shifts the product instead of + * bit-shifting it, so it should be faster. It also does about + * half as many subtracts as Russian peasant. + */ + + mp_init(prod, 0); /* Initialize product to 0. */ + + /* + * The way mp_modmult is actually used in cryptographic + * applications, there will NEVER be a zero multiplier or + * multiplicand. So there is no need to optimize for that + * condition. + */ +/* if (testeq(multiplicand,0)) + return 0; *//* zero multiplicand means zero product */ + /* Normalize and compute number of units in multiplier first: */ + normalize(multiplier, mprec); + if (mprec == 0) /* if precision of multiplier is 0 */ + return 0; /* zero multiplier means zero product */ + make_msbptr(multiplier, mprec); /* start at MSU of multiplier */ + + while (mprec--) { /* Loop for the number of units in the multiplier */ + /* + * Shift the product one whole unit to the left. + * This is part of the multiply phase of modmult. + */ + + mp_lshift_unit(prod); + + /* + * The product may have grown by as many as UNITSIZE + * bits. That's why we have global_precision set to the + * size of the modulus plus UNITSIZE slop bits. + * Now reduce the product back down by conditionally + * subtracting the preshifted images of the modulus. + * This is part of the modulo reduction phase of modmult. + * The following loop is unrolled for speed, using macros... + + for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--) + if (mp_compare(prod,moduli[i]) >= 0) + mp_sub(prod,moduli[i]); + */ + +#ifndef UNIT8 +#ifndef UNIT16 /* and not UNIT8 */ + msubs(31); + msubs(30); + msubs(29); + msubs(28); + msubs(27); + msubs(26); + msubs(25); + msubs(24); + msubs(23); + msubs(22); + msubs(21); + msubs(20); + msubs(19); + msubs(18); + msubs(17); + msubs(16); +#endif /* not UNIT16 and not UNIT8 */ + msubs(15); + msubs(14); + msubs(13); + msubs(12); + msubs(11); + msubs(10); + msubs(9); + msubs(8); +#endif /* not UNIT8 */ + msubs(7); + msubs(6); + msubs(5); +#ifndef UNIT32 + msubs(4); +#ifndef UNIT16 + msubs(3); +#endif +#endif + + /* + * Sniff each bit in the current unit of the multiplier, + * and conditionally add the corresponding preshifted + * image of the multiplicand to the product. + * This is also part of the multiply phase of modmult. + * + * The following loop is unrolled for speed, using macros... + + for (i=UNITSIZE-1; i>=0; i--) + if (*multiplier & power_of_2(i)) + mp_add(prod,mpd[i]); + */ +#ifndef UNIT8 +#ifndef UNIT16 /* and not UNIT8 */ + sniffadd(31); + sniffadd(30); + sniffadd(29); + sniffadd(28); + sniffadd(27); + sniffadd(26); + sniffadd(25); + sniffadd(24); + sniffadd(23); + sniffadd(22); + sniffadd(21); + sniffadd(20); + sniffadd(19); + sniffadd(18); + sniffadd(17); + sniffadd(16); +#endif /* not UNIT16 and not UNIT8 */ + sniffadd(15); + sniffadd(14); + sniffadd(13); + sniffadd(12); + sniffadd(11); + sniffadd(10); + sniffadd(9); + sniffadd(8); +#endif /* not UNIT8 */ + sniffadd(7); + sniffadd(6); + sniffadd(5); + sniffadd(4); + sniffadd(3); + sniffadd(2); + sniffadd(1); + sniffadd(0); + + /* + * The product may have grown by as many as LOG_UNITSIZE+1 + * bits. + * Now reduce the product back down by conditionally + * subtracting LOG_UNITSIZE+1 preshifted images of the + * modulus. This is the modulo reduction phase of modmult. + * + * The following loop is unrolled for speed, using macros... + + for (i=LOG_UNITSIZE; i>=0; i--) + if (mp_compare(prod,moduli[i]) >= 0) + mp_sub(prod,moduli[i]); + */ + +#ifndef UNIT8 +#ifndef UNIT16 + msubs(5); +#endif + msubs(4); +#endif + msubs(3); + msubs(2); + msubs(1); + msubs(0); + + /* Bump pointer to next lower unit of multiplier: */ + post_lowerunit(multiplier); + + } /* Loop for the number of units in the + multiplier */ + + return 0; /* normal return */ + +} /* merritt_modmult */ + +#undef msubs +#undef sniffadd + +/* + * Merritt's mp_modmult function leaves some internal tables in memory, + * so we have to call modmult_burn() at the end of mp_modexp. + * This is so that no cryptographically sensitive data is left in memory + * after the program exits. + */ + +/* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */ +void merritt_burn(void) +{ + unitfill0(&(mpdbuf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION); + unitfill0(&(moduli_buf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION); + unitfill0(msu_moduli, UNITSIZE); + unitfill0(nmsu_moduli, UNITSIZE); +} /* merritt_burn() */ + +/******* end of Merritt's MODMULT stuff. *******/ +/*=========================================================================*/ +#endif /* MERRITT */ + + +#ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's + modmult algorithms */ + +/* + * Jimmy Upton's multiprecision modmult algorithm in C. + * Performs a multiply combined with a modulo operation. + * + * The following support functions and data structures + * are used only by Upton's modmult algorithm... + */ + +short munit_prec; /* global_precision expressed in MULTUNITs */ + +/* + * Note that since the SPARC CPU has no hardware integer multiply + * instruction, there is not that much advantage in having an + * assembly version of mp_smul on that machine. It might be faster + * to use Merritt's modmult instead of Upton's modmult on the SPARC. + */ + +/* + * Multiply the single-word multiplier times the multiprecision integer + * in multiplicand, accumulating result in prod. The resulting + * multiprecision prod will be 1 word longer than the multiplicand. + * multiplicand is munit_prec words long. We add into prod, so caller + * should zero it out first. For best results, this time-critical + * function should be implemented in assembly. + * NOTE: Unlike other functions in the multiprecision arithmetic + * library, both multiplicand and prod are pointing at the LSB, + * regardless of byte order of the machine. On an 80x86, this makes + * no difference. But if this assembly function is implemented + * on a 680x0, it becomes important. + */ + +/* + * Note that this has been modified from the previous version to allow + * better support for Smith's modmult: + * The final carry bit is added to the existing product + * array, rather than simply stored. + */ + +#ifndef mp_smul +void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier) +{ + short i; + unsigned long p, carry; + + carry = 0; + for (i = 0; i < munit_prec; ++i) { + p = (unsigned long) multiplier **post_higherunit(multiplicand); + p += *prod + carry; + *post_higherunit(prod) = (MULTUNIT) p; + carry = p >> MULTUNITSIZE; + } + /* Add carry to the next higher word of product / dividend */ + *prod += (MULTUNIT) carry; +} /* mp_smul */ + +#endif + +/* + * mp_dmul is a double-precision multiply multiplicand times multiplier, + * result into prod. prod must be pointing at a "double precision" + * buffer. E.g. If global_precision is 10 words, prod must be + * pointing at a 20-word buffer. + */ +#ifndef mp_dmul +void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier) +{ + register int i; + register MULTUNIT *p_multiplicand, *p_multiplier; + register MULTUNIT *prodp; + + + unitfill0(prod, global_precision * 2); /* Pre-zero prod */ + /* Calculate precision in units of MULTUNIT */ + munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; + p_multiplicand = (MULTUNIT *) multiplicand; + p_multiplier = (MULTUNIT *) multiplier; + prodp = (MULTUNIT *) prod; + make_lsbptr(p_multiplicand, munit_prec); + make_lsbptr(p_multiplier, munit_prec); + make_lsbptr(prodp, munit_prec * 2); + /* Multiply multiplicand by each word in multiplier, accumulating prod: */ + for (i = 0; i < munit_prec; ++i) + mp_smul(post_higherunit(prodp), + p_multiplicand, *post_higherunit(p_multiplier)); +} /* mp_dmul */ +#endif /* mp_dmul */ + +static unit ALIGN modulus[MAX_UNIT_PRECISION]; +static short nbits; /* number of modulus significant bits */ +#endif /* UPTON_OR_SMITH */ + +#ifdef UPTON + +/* + * These scratchpad arrays are used only by upton_modmult (mp_modmult). + * Some of them could be staticly declared inside of mp_modmult, but we + * put them outside mp_modmult so that they can be wiped clean by + * modmult_burn(), which is called at the end of mp_modexp. This is + * so that no sensitive data is left in memory after the program exits. + */ + +static unit ALIGN reciprocal[MAX_UNIT_PRECISION]; +static unit ALIGN dhi[MAX_UNIT_PRECISION]; +static unit ALIGN d_data[MAX_UNIT_PRECISION * 2]; /* double-wide + register */ +static unit ALIGN e_data[MAX_UNIT_PRECISION * 2]; /* double-wide + register */ +static unit ALIGN f_data[MAX_UNIT_PRECISION * 2]; /* double-wide + register */ + +static short nbitsDivUNITSIZE; +static short nbitsModUNITSIZE; + +/* + * stage_upton_modulus() is aliased to stage_modulus(). + * Prepare for an Upton modmult. Calculate the reciprocal of modulus, + * and save both. Note that reciprocal will have one more bit than + * modulus. + * Assumes that global_precision has already been adjusted to the + * size of the modulus, plus SLOP_BITS. + */ +int stage_upton_modulus(unitptr n) +{ + mp_move(modulus, n); + mp_recip(reciprocal, modulus); + nbits = countbits(modulus); + nbitsDivUNITSIZE = nbits / UNITSIZE; + nbitsModUNITSIZE = nbits % UNITSIZE; + return 0; /* normal return */ +} /* stage_upton_modulus */ + + +/* + * Upton's algorithm performs a multiply combined with a modulo operation. + * Computes: prod = (multiplicand*multiplier) mod modulus + * WARNING: All the arguments must be less than the modulus! + * References global unitptr modulus and reciprocal. + * The reciprocal of modulus is 1 bit longer than the modulus. + * upton_modmult() is aliased to mp_modmult(). + */ +int upton_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) +{ + unitptr d = d_data; + unitptr d1 = d_data; + unitptr e = e_data; + unitptr f = f_data; + short orig_precision; + + orig_precision = global_precision; + mp_dmul(d, multiplicand, multiplier); + + /* Throw off low nbits of d */ +#ifndef HIGHFIRST + d1 = d + nbitsDivUNITSIZE; +#else + d1 = d + orig_precision - nbitsDivUNITSIZE; +#endif + mp_move(dhi, d1); /* Don't screw up d, we need it later */ + mp_shift_right_bits(dhi, nbitsModUNITSIZE); + + mp_dmul(e, dhi, reciprocal); /* Note - reciprocal has nbits+1 bits */ + +#ifndef HIGHFIRST + e += nbitsDivUNITSIZE; +#else + e += orig_precision - nbitsDivUNITSIZE; +#endif + mp_shift_right_bits(e, nbitsModUNITSIZE); + + mp_dmul(f, e, modulus); + + /* Now for the only double-precision call to mpilib: */ + set_precision(orig_precision * 2); + mp_sub(d, f); + + /* d's precision should be <= orig_precision */ + rescale(d, orig_precision * 2, orig_precision); + set_precision(orig_precision); + + /* Should never have to do this final subtract more than twice: */ + while (mp_compare(d, modulus) > 0) + mp_sub(d, modulus); + + mp_move(prod, d); + return 0; /* normal return */ +} /* upton_modmult */ + +/* Upton's mp_modmult function leaves some internal arrays in memory, + so we have to call modmult_burn() at the end of mp_modexp. + This is so that no cryptographically sensitive data is left in memory + after the program exits. + upton_burn() is aliased to modmult_burn(). + */ +void upton_burn(void) +{ + unitfill0(modulus, MAX_UNIT_PRECISION); + unitfill0(reciprocal, MAX_UNIT_PRECISION); + unitfill0(dhi, MAX_UNIT_PRECISION); + unitfill0(d_data, MAX_UNIT_PRECISION * 2); + unitfill0(e_data, MAX_UNIT_PRECISION * 2); + unitfill0(f_data, MAX_UNIT_PRECISION * 2); + nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0; +} /* upton_burn */ + +/******* end of Upton's MODMULT stuff. *******/ +/*=========================================================================*/ +#endif /* UPTON */ + +#ifdef SMITH /* using Thad Smith's modmult algorithm */ + +/* + * Thad Smith's implementation of multiprecision modmult algorithm in C. + * Performs a multiply combined with a modulo operation. + * The multiplication is done with mp_dmul, the same as for Upton's + * modmult. The modulus reduction is done by long division, in + * which a trial quotient "digit" is determined, then the product of + * that digit and the divisor are subtracted from the dividend. + * + * In this case, the digit is MULTUNIT in size and the subtraction + * is done by adding the product to the one's complement of the + * dividend, which allows use of the existing mp_smul routine. + * + * The following support functions and data structures + * are used only by Smith's modmult algorithm... + */ + +/* + * These scratchpad arrays are used only by smith_modmult (mp_modmult). + * Some of them could be statically declared inside of mp_modmult, but we + * put them outside mp_modmult so that they can be wiped clean by + * modmult_burn(), which is called at the end of mp_modexp. This is + * so that no sensitive data is left in memory after the program exits. + */ + +static unit ALIGN ds_data[MAX_UNIT_PRECISION * 2 + 2]; + +static unit mod_quotient[4]; +static unit mod_divisor[4]; /* 2 most signif. MULTUNITs of modulus */ + +static MULTUNIT *modmpl; /* ptr to modulus least significant + ** MULTUNIT */ +static int mshift; /* number of bits for + ** recip scaling */ +static MULTUNIT reciph; /* MSunit of scaled recip */ +static MULTUNIT recipl; /* LSunit of scaled recip */ + +static short modlenMULTUNITS; /* length of modulus in MULTUNITs */ +static MULTUNIT mutemp; /* temporary */ + +/* + * The routines mp_smul and mp_dmul are the same as for UPTON and + * should be coded in assembly. Note, however, that the previous + * Upton's mp_smul version has been modified to compatible with + * Smith's modmult. The new version also still works for Upton's + * modmult. + */ + +#ifndef mp_set_recip +#define mp_set_recip(rh,rl,m) /* null */ +#else +/* setup routine for external mp_quo_digit */ +void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m); +#endif +MULTUNIT mp_quo_digit(MULTUNIT * dividend); + +#ifdef MULTUNIT_SIZE_SAME +#define mp_musubb mp_subb /* use existing routine */ +#else /* ! MULTUNIT_SIZE_SAME */ + +/* + * This performs the same function as mp_subb, but with MULTUNITs. + * Note: Processors without alignment requirements may be able to use + * mp_subb, even though MULTUNITs are smaller than units. In that case, + * use mp_subb, since it would be faster if coded in assembly. Note that + * this implementation won't work for MULTUNITs which are long -- use + * mp_subb in that case. + */ +#ifndef mp_musubb + +/* + * multiprecision subtract of MULTUNITs with borrow, r2 from r1, result in r1 + * borrow is incoming borrow flag-- value should be 0 or 1 + */ +boolean mp_musubb + (register MULTUNIT * r1, register MULTUNIT * r2, register boolean borrow) +{ + register ulint x; /* won't work if sizeof(MULTUNIT)== + sizeof(long) */ + short precision; /* number of MULTUNITs to subtract */ + precision = global_precision * MULTUNITs_perunit; + make_lsbptr(r1, precision); + make_lsbptr(r2, precision); + while (precision--) { + x = (ulint) * r1 - (ulint) * post_higherunit(r2) - (ulint) borrow; + *post_higherunit(r1) = x; + borrow = (((1L << MULTUNITSIZE) & x) != 0L); + } + return (borrow); +} /* mp_musubb */ +#endif /* mp_musubb */ +#endif /* MULTUNIT_SIZE_SAME */ + +/* + * The function mp_quo_digit is the heart of Smith's modulo reduction, + * which uses a form of long division. It computes a trial quotient + * "digit" (MULTUNIT-sized digit) by multiplying the three most + * significant MULTUNITs of the dividend by the two most significant + * MULTUNITs of the reciprocal of the modulus. Note that this function + * requires that MULTUNITSIZE * 2 <= sizeof(unsigned long). + * + * An important part of this technique is that the quotient never be + * too small, although it may occasionally be too large. This was + * done to eliminate the need to check and correct for a remainder + * exceeding the divisor. It is easier to check for a negative + * remainder. The following technique rarely needs correction for + * MULTUNITs of at least 16 bits. + * + * The following routine has two implementations: + * + * ASM_PROTOTYPE defined: written to be an executable prototype for + * an efficient assembly language implementation. Note that several + * of the following masks and shifts can be done by directly + * manipulating single precision registers on some architectures. + * + * ASM_PROTOTYPE undefined: a slightly more efficient implementation + * in C. Although this version returns a result larger than the + * optimum (which is corrected in smith_modmult) more often than the + * prototype version, the faster execution in C more than makes up + * for the difference. + * + * Parameter: dividend - points to the most significant MULTUNIT + * of the dividend. Note that dividend actually contains the + * one's complement of the actual dividend value (see comments for + * smith_modmult). + * + * Return: the trial quotient digit resulting from dividing the first + * three MULTUNITs at dividend by the upper two MULTUNITs of the + * modulus. + */ + +/* #define ASM_PROTOTYPE *//* undefined: use C-optimized version */ + +#ifndef mp_quo_digit +MULTUNIT mp_quo_digit(MULTUNIT * dividend) +{ + unsigned long q, q0, q1, q2; + unsigned short lsb_factor; + +/* + * Compute the least significant product group. + * The last terms of q1 and q2 perform upward rounding, which is + * needed to guarantee that the result not be too small. + */ + q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long) reciph + + reciph; + q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) recipl + + (1L << MULTUNITSIZE); +#ifdef ASM_PROTOTYPE + lsb_factor = 1 & (q1 >> MULTUNITSIZE) & (q2 >> MULTUNITSIZE); + q = q1 + q2; + + /* The following statement is equivalent to shifting the sum right + one bit while shifting in the carry bit. + */ + q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); +#else /* optimized C version */ + q0 = (q1 >> 1) + (q2 >> 1) + 1; +#endif + +/* Compute the middle significant product group. */ + + q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) reciph; + q2 = (dividend[0] ^ MULTUNIT_mask) * (unsigned long) recipl; +#ifdef ASM_PROTOTYPE + q = q1 + q2; + q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1); + +/* Add in the most significant word of the first group. + The last term takes care of the carry from adding the lsb's + that were shifted out prior to addition. + */ + q = (q0 >> MULTUNITSIZE) + q + (lsb_factor & (q1 ^ q2)); +#else /* optimized C version */ + q = (q0 >> MULTUNITSIZE) + (q1 >> 1) + (q2 >> 1) + 1; +#endif + +/* Compute the most significant term and add in the others */ + + q = (q >> (MULTUNITSIZE - 2)) + + (((dividend[0] ^ MULTUNIT_mask) * (unsigned long) reciph) << 1); + q >>= mshift; + +/* Prevent overflow and then wipe out the intermediate results. */ + + mutemp = (MULTUNIT) min(q, (1L << MULTUNITSIZE) - 1); + q = q0 = q1 = q2 = 0; + lsb_factor = 0; + (void) lsb_factor; + return mutemp; +} +#endif /* mp_quo_digit */ + +/* + * stage_smith_modulus() - Prepare for a Smith modmult. + * + * Calculate the reciprocal of modulus with a precision of two MULTUNITs. + * Assumes that global_precision has already been adjusted to the + * size of the modulus, plus SLOP_BITS. + * + * Note: This routine was designed to work with large values and + * doesn't have the necessary testing or handling to work with a + * modulus having less than three significant units. For such cases, + * the separate multiply and modulus routines can be used. + * + * stage_smith_modulus() is aliased to stage_modulus(). + */ +int stage_smith_modulus(unitptr n_modulus) +{ + int original_precision; + int sigmod; /* significant units in modulus */ + unitptr mp; /* modulus most significant pointer */ + MULTUNIT *mpm; /* reciprocal pointer */ + int prec; /* precision of reciprocal calc in units */ + + mp_move(modulus, n_modulus); + modmpl = (MULTUNIT *) modulus; + modmpl = lsbptr(modmpl, global_precision * MULTUNITs_perunit); + nbits = countbits(modulus); + modlenMULTUNITS = (nbits + MULTUNITSIZE - 1) / MULTUNITSIZE; + + original_precision = global_precision; + + /* The following code copies the three most significant units of + * modulus to mod_divisor. + */ + mp = modulus; + sigmod = significance(modulus); + rescale(mp, original_precision, sigmod); +/* prec is the unit precision required for 3 MULTUNITs */ + prec = (3 + (MULTUNITs_perunit - 1)) / MULTUNITs_perunit; + set_precision(prec); + + /* set mp = ptr to most significant units of modulus, then move + * the most significant units to mp_divisor + */ + mp = msbptr(mp, sigmod) - tohigher(prec - 1); + unmake_lsbptr(mp, prec); + mp_move(mod_divisor, mp); + + /* Keep 2*MULTUNITSIZE bits in mod_divisor. + * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits. + */ + mshift = countbits(mod_divisor) - 2 * MULTUNITSIZE; + mp_shift_right_bits(mod_divisor, mshift); + mp_recip(mod_quotient, mod_divisor); + mp_shift_right_bits(mod_quotient, 1); + + /* Reduce to: 0 < mshift <= MULTUNITSIZE */ + mshift = ((mshift + (MULTUNITSIZE - 1)) % MULTUNITSIZE) + 1; + /* round up, rescaling if necessary */ + mp_inc(mod_quotient); + if (countbits(mod_quotient) > 2 * MULTUNITSIZE) { + mp_shift_right_bits(mod_quotient, 1); + mshift--; /* now 0 <= mshift <= MULTUNITSIZE */ + } + mpm = lsbptr((MULTUNIT *) mod_quotient, prec * MULTUNITs_perunit); + recipl = *post_higherunit(mpm); + reciph = *mpm; + mp_set_recip(reciph, recipl, mshift); + set_precision(original_precision); + return 0; /* normal return */ +} /* stage_smith_modulus */ + +/* Smith's algorithm performs a multiply combined with a modulo operation. + Computes: prod = (multiplicand*multiplier) mod modulus + The modulus must first be set by stage_smith_modulus(). + smith_modmult() is aliased to mp_modmult(). + */ +int smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier) +{ + unitptr d; /* ptr to product */ + MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first) + * aligned for subtraction */ +/* Note that dmph points one MULTUNIT higher than indicated by + global precision. This allows us to zero out a word one higher than + the normal precision. + */ + short orig_precision; + short nqd; /* number of quotient digits remaining to + * be generated */ + short dmi; /* number of significant MULTUNITs in + product */ + + d = ds_data + 1; /* room for leading MSB if HIGHFIRST */ + orig_precision = global_precision; + mp_dmul(d, multiplicand, multiplier); + + rescale(d, orig_precision * 2, orig_precision * 2 + 1); + set_precision(orig_precision * 2 + 1); + *msbptr(d, global_precision) = 0; /* leading 0 unit */ + +/* We now start working with MULTUNITs. + Determine the most significant MULTUNIT of the product so we don't + have to process leading zeros in our divide loop. + */ + dmi = significance(d) * MULTUNITs_perunit; + if (dmi >= modlenMULTUNITS) { + /* Make dividend negative. This allows the use of mp_smul to + * "subtract" the product of the modulus and the trial divisor + * by actually adding to a negative dividend. + * The one's complement of the dividend is used, since it causes + * a zero value to be represented as all ones. This facilitates + * testing the result for possible overflow, since a sign bit + * indicates that no adjustment is necessary, and we should not + * attempt to adjust if the result of the addition is zero. + */ + mp_inc(d); + mp_neg(d); + set_precision(orig_precision); + munit_prec = global_precision * UNITSIZE / MULTUNITSIZE; + + /* Set msb, lsb, and normal ptrs of dividend */ + dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) * + MULTUNITs_perunit) + tohigher(dmi); + nqd = dmi + 1 - modlenMULTUNITS; + dmpl = dmph - tohigher(modlenMULTUNITS); + +/* + * Divide loop. + * Each iteration computes the next quotient MULTUNIT digit, then + * multiplies the divisor (modulus) by the quotient digit and adds + * it to the one's complement of the dividend (equivalent to + * subtracting). If the product was greater than the remaining dividend, + * we get a non-negative result, in which case we subtract off the + * modulus to get the proper negative remainder. + */ + for (; nqd; nqd--) { + MULTUNIT q; /* quotient trial digit */ + + q = mp_quo_digit(dmph); + if (q > 0) { + mp_smul(dmpl, modmpl, q); + + /* Perform correction if q too large. + * This rarely occurs. + */ + if (!(*dmph & MULTUNIT_msb)) { + dmp = dmpl; + unmake_lsbptr(dmp, orig_precision * + MULTUNITs_perunit); + if (mp_musubb(dmp, + (MULTUNIT *) modulus, 0)) + (*dmph)--; + } + } + pre_lowerunit(dmph); + pre_lowerunit(dmpl); + } + /* d contains the one's complement of the remainder. */ + rescale(d, orig_precision * 2 + 1, orig_precision); + set_precision(orig_precision); + mp_neg(d); + mp_dec(d); + } else { + /* Product was less than modulus. Return it. */ + rescale(d, orig_precision * 2 + 1, orig_precision); + set_precision(orig_precision); + } + mp_move(prod, d); + return (0); /* normal return */ +} /* smith_modmult */ + +/* Smith's mp_modmult function leaves some internal arrays in memory, + so we have to call modmult_burn() at the end of mp_modexp. + This is so that no cryptographically sensitive data is left in memory + after the program exits. + smith_burn() is aliased to modmult_burn(). + */ +void smith_burn(void) +{ + empty_array(modulus); + empty_array(ds_data); + empty_array(mod_quotient); + empty_array(mod_divisor); + modmpl = 0; + mshift = nbits = 0; + reciph = recipl = 0; + modlenMULTUNITS = mutemp = 0; + mp_set_recip(0, 0, 0); +} /* smith_burn */ + +/* End of Thad Smith's implementation of modmult. */ + +#endif /* SMITH */ + +/* Returns number of significant bits in r */ +int countbits(unitptr r) +{ + int bits; + short prec; + register unit bitmask; + init_bitsniffer(r, bitmask, prec, bits); + return bits; +} /* countbits */ + +char *copyright_notice(void) +/* force linker to include copyright notice in the executable object image. */ +{ + return ("(c)1986 Philip Zimmermann"); +} /* copyright_notice */ + +/* + * Russian peasant combined exponentiation/modulo algorithm. + * Calls modmult instead of mult. + * Computes: expout = (expin**exponent) mod modulus + * WARNING: All the arguments must be less than the modulus! + */ +int mp_modexp(register unitptr expout, register unitptr expin, + register unitptr exponent, register unitptr modulus) +{ + int bits; + short oldprecision; + register unit bitmask; + unit product[MAX_UNIT_PRECISION]; + short eprec; + +#ifdef COUNTMULTS + tally_modmults = 0; /* clear "number of modmults" counter */ + tally_modsquares = 0; /* clear "number of modsquares" counter */ +#endif /* COUNTMULTS */ + mp_init(expout, 1); + if (testeq(exponent, 0)) { + if (testeq(expin, 0)) + return -1; /* 0 to the 0th power means return error */ + return 0; /* otherwise, zero exponent means expout + is 1 */ + } + if (testeq(modulus, 0)) + return -2; /* zero modulus means error */ +#if SLOP_BITS > 0 /* if there's room for sign bits */ + if (mp_tstminus(modulus)) + return -2; /* negative modulus means error */ +#endif /* SLOP_BITS > 0 */ + if (mp_compare(expin, modulus) >= 0) + return -3; /* if expin >= modulus, return error */ + if (mp_compare(exponent, modulus) >= 0) + return -4; /* if exponent >= modulus, return error */ + + oldprecision = global_precision; /* save global_precision */ + /* set smallest optimum precision for this modulus */ + set_precision(bits2units(countbits(modulus) + SLOP_BITS)); + /* rescale all these registers to global_precision we just defined */ + rescale(modulus, oldprecision, global_precision); + rescale(expin, oldprecision, global_precision); + rescale(exponent, oldprecision, global_precision); + rescale(expout, oldprecision, global_precision); + + if (stage_modulus(modulus)) { + set_precision(oldprecision); /* restore original precision */ + return -5; /* unstageable modulus (STEWART algorithm) */ + } + /* normalize and compute number of bits in exponent first */ + init_bitsniffer(exponent, bitmask, eprec, bits); + + /* We can "optimize out" the first modsquare and modmult: */ + bits--; /* We know for sure at this point that + bits>0 */ + mp_move(expout, expin); /* expout = (1*1)*expin; */ + bump_bitsniffer(exponent, bitmask); + + while (bits--) { + poll_for_break(); /* polls keyboard, allows ctrl-C to + abort program */ +#ifdef COUNTMULTS + tally_modsquares++; /* bump "number of modsquares" counter */ +#endif /* COUNTMULTS */ + mp_modsquare(product, expout); + if (sniff_bit(exponent, bitmask)) { + mp_modmult(expout, product, expin); +#ifdef COUNTMULTS + tally_modmults++; /* bump "number of modmults" counter */ +#endif /* COUNTMULTS */ + } else { + mp_move(expout, product); + } + bump_bitsniffer(exponent, bitmask); + } /* while bits-- */ + mp_burn(product); /* burn the evidence on the stack */ + modmult_burn(); /* ask mp_modmult to also burn its + own evidence */ + +#ifdef COUNTMULTS /* diagnostic analysis */ + { + long atomic_mults; + unsigned int unitcount, totalmults; + unitcount = bits2units(countbits(modulus)); + /* calculation assumes modsquare takes as long as a modmult: */ + atomic_mults = (long) tally_modmults *(unitcount * unitcount); + atomic_mults += (long) tally_modsquares *(unitcount * unitcount); + printf("%ld atomic mults for ", atomic_mults); + printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n", + tally_modsquares, tally_modmults, + tally_modsquares + tally_modmults, + countbits(modulus), unitcount); + } +#endif /* COUNTMULTS */ + + set_precision(oldprecision); /* restore original precision */ + + /* Do an explicit reference to the copyright notice so that the linker + will be forced to include it in the executable object image... */ + copyright_notice(); /* has no real effect at run time */ + + return 0; /* normal return */ +} /* mp_modexp */ + +/* + * This is a faster modexp for moduli with a known factorisation into two + * relatively prime factors p and q, and an input relatively prime to the + * modulus, the Chinese Remainder Theorem to do the computation mod p and + * mod q, and then combine the results. This relies on a number of + * precomputed values, but does not actually require the modulus n or the + * exponent e. + * + * expout = expin ^ e mod (p*q). + * We form this by evaluating + * p2 = (expin ^ e) mod p and + * q2 = (expin ^ e) mod q + * and then combining the two by the CRT. + * + * Two optimisations of this are possible. First, we can reduce expin + * modulo p and q before starting. + * + * Second, since we know the factorisation of p and q (trivially derived + * from the factorisation of n = p*q), and expin is relatively prime to + * both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m), + * to throw away multiples of phi(p) or phi(q) in e. + * Letting ep = e mod phi(p) and + * eq = e mod phi(q) + * then combining these two speedups, we only need to evaluate + * p2 = ((expin mod p) ^ ep) mod p and + * q2 = ((expin mod q) ^ eq) mod q. + * + * Now we need to apply the CRT. Starting with + * expout = p2 (mod p) and + * expout = q2 (mod q) + * we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p, + * then 0 <= expout < p*q for some 0 <= k < q. Since we want expout = q2 + * (mod q), then p*k = q2-p2 (mod q). Since p and q are relatively prime, + * p has a multiplicative inverse u mod q. In other words, u = 1/p (mod q). + * + * Multiplying by u on both sides gives k = u*(q2-p2) (mod q). + * Since we want 0 <= k < q, we can thus find k as + * k = (u * (q2-p2)) mod q. + * + * Once we have k, evaluating p2 + p * k is easy, and + * that gives us the result. + * + * In the detailed implementation, there is a temporary, temp, used to + * hold intermediate results, p2 is held in expout, and q2 is used as a + * temporary in the final step when it is no longer needed. With that, + * you should be able to understand the code below. + */ +int mp_modexp_crt(unitptr expout, unitptr expin, + unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u) +{ + unit q2[MAX_UNIT_PRECISION]; + unit temp[MAX_UNIT_PRECISION]; + int status; + +/* First, compiute p2 (physically held in M) */ + +/* p2 = [ (expin mod p) ^ ep ] mod p */ + mp_mod(temp, expin, p); /* temp = expin mod p */ + status = mp_modexp(expout, temp, ep, p); + if (status < 0) { /* mp_modexp returned an error. */ + mp_init(expout, 1); + return status; /* error return */ + } +/* And the same thing for q2 */ + +/* q2 = [ (expin mod q) ^ eq ] mod q */ + mp_mod(temp, expin, q); /* temp = expin mod q */ + status = mp_modexp(q2, temp, eq, q); + if (status < 0) { /* mp_modexp returned an error. */ + mp_init(expout, 1); + return status; /* error return */ + } +/* Now use the multiplicative inverse u to glue together the + two halves. + */ +#if 0 +/* This optimisation is useful if you commonly get small results, + but for random results and large q, the odds of (1/q) of it + being useful do not warrant the test. + */ + if (mp_compare(expout, q2) != 0) { +#endif + /* Find q2-p2 mod q */ + if (mp_sub(q2, expout)) /* if the result went negative */ + mp_add(q2, q); /* add q to q2 */ + + /* expout = p2 + ( p * [(q2*u) mod q] ) */ + mp_mult(temp, q2, u); /* q2*u */ + mp_mod(q2, temp, q); /* (q2*u) mod q */ + mp_mult(temp, p, q2); /* p * [(q2*u) mod q] */ + mp_add(expout, temp); /* expout = p2 + p * [...] */ +#if 0 + } +#endif + + mp_burn(q2); /* burn the evidence on the stack... */ + mp_burn(temp); + return 0; /* normal return */ +} /* mp_modexp_crt */ + + +/****************** end of MPI library ****************************/