pgp/src/mpilib.c - annotate

Return to mpilib.c CVS log
Up to [PGP] / pgp / src
Annotation of pgp/src/mpilib.c, revision 1.1.1.7

1.1.1.7 ! root        1: /* C source code for multiprecision arithmetic library routines.
        !             2:    Implemented Nov 86 by Philip Zimmermann
        !             3:    Last revised 27 Nov 91 by PRZ
        !             4: 
        !             5:    Boulder Software Engineering
        !             6:    3021 Eleventh Street
        !             7:    Boulder, CO 80304
        !             8:    (303) 541-0140
        !             9: 
        !            10:    (c) Copyright 1986-1994 by Philip Zimmermann.  All rights reserved.
        !            11:    The author assumes no liability for damages resulting from the use
        !            12:    of this software, even if the damage results from defects in this
        !            13:    software.  No warranty is expressed or implied.  The use of this
        !            14:    cryptographic software for developing weapon systems is expressly
        !            15:    forbidden.
        !            16: 
        !            17: 
        !            18:    These routines implement all of the multiprecision arithmetic
        !            19:    necessary for number-theoretic cryptographic algorithms such as
        !            20:    ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
        !            21:    composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
        !            22:    key cryptography.
        !            23: 
        !            24:    Although originally developed in Microsoft C for the IBM PC, this code
        !            25:    contains few machine dependencies.  It assumes 2's complement
        !            26:    arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines,
        !            27:    lowbyte-highbyte order or highbyte-lowbyte order.  This version
        !            28:    has been converted to ANSI C.
        !            29: 
        !            30: 
        !            31:    The internal representation for these extended precision integer
        !            32:    "registers" is an array of "units".  A unit is a machine word, which
        !            33:    is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
        !            34:    unsigned integer, depending on the machine's word size.  For example,
        !            35:    an IBM PC or AT uses a unit size of 16 bits.  To perform arithmetic
        !            36:    on these huge precision integers, we pass pointers to these unit
        !            37:    arrays to various subroutines.  A pointer to an array of units is of
        !            38:    type unitptr.  This is a pointer to a huge integer "register".
        !            39: 
        !            40:    When calling a subroutine, we always pass a pointer to the BEGINNING
        !            41:    of the array of units, regardless of the byte order of the machine.
        !            42:    On a lowbyte-first machine, such as the Intel 80x86, this unitptr
        !            43:    points to the LEAST significant unit, and the array of units increases
        !            44:    significance to the right.  On a highbyte-first machine, such as the
        !            45:    Motorola 680x0, this unitptr points to the MOST significant unit, and
        !            46:    the array of units decreases significance to the right.
        !            47: 
        !            48:    Modified 8 Apr 92 - HAJK
        !            49:    Implement new VAX/VMS primitive support.
1.1.1.6   root       50: 
1.1.1.7 ! root       51:    Modified 30 Sep 92 -Castor Fu <[email protected]>
        !            52:    Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long)
1.1.1.6   root       53: 
1.1.1.7 ! root       54:    Modified 28 Nov 92 - Thad Smith
        !            55:    Added Smith modmult, generalized non-portable support.
        !            56:  */
1.1.1.6   root       57: 
1.1.1.7 ! root       58: /* #define COUNTMULTS *//* count modmults for performance studies */
1.1.1.6   root       59: 
                     60: #ifdef DEBUG
                     61: #ifdef MSDOS
1.1.1.7 ! root       62: #ifdef __GO32__                        /* DJGPP */
1.1.1.6   root       63: #include <pc.h>
                     64: #else
                     65: #include <conio.h>
1.1.1.7 ! root       66: #endif                         /* __GO32__ */
1.1.1.6   root       67: #define poll_for_break() {while (kbhit()) getch();}
1.1.1.7 ! root       68: #endif                         /* MSDOS */
        !            69: #endif                         /* DEBUG */
1.1.1.6   root       70: 
                     71: #ifndef poll_for_break
1.1.1.7 ! root       72: #define poll_for_break()       /* stub */
1.1.1.6   root       73: #endif
                     74: 
                     75: #include "mpilib.h"
                     76: 
                     77: #ifdef mp_smula
                     78: #ifdef mp_smul
1.1.1.7 ! root       79: Error:Both mp_smula and mp_smul cannot be defined.
1.1.1.6   root       80: #else
                     81: #define mp_smul        mp_smula
                     82: #endif
                     83: #endif
                     84: 
                     85: /* set macros for MULTUNIT */
                     86: #ifdef MUNIT8
                     87: #define MULTUNITSIZE   8
                     88: typedef unsigned char MULTUNIT;
                     89: #ifdef UNIT8
                     90: #define MULTUNIT_SIZE_SAME
                     91: #endif
1.1.1.7 ! root       92: #else                          /* not MUNIT8 */
1.1.1.6   root       93: #ifdef MUNIT32
                     94: #define MULTUNITSIZE   32
                     95: typedef unsigned long MULTUNIT;
                     96: #ifdef UNIT32
                     97: #define MULTUNIT_SIZE_SAME
                     98: #else
                     99: /* #error is not portable, this has the same effect */
                    100: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
                    101: #endif
1.1.1.7 ! root      102: #else                          /* assume MUNIT16 */
1.1.1.6   root      103: #define MULTUNITSIZE   16
                    104: typedef unsigned short MULTUNIT;
                    105: #ifdef UNIT16
                    106: #define MULTUNIT_SIZE_SAME
1.1.1.7 ! root      107: #endif                         /* UNIT16 */
1.1.1.6   root      108: #ifdef UNIT8
                    109: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
1.1.1.7 ! root      110: #endif                         /* UNIT8 */
        !           111: #endif                         /* MUNIT32 */
        !           112: #endif                         /* MUNIT8 */
1.1.1.6   root      113: 
1.1.1.7 ! root      114: #define MULTUNIT_msb    ((MULTUNIT)1 << (MULTUNITSIZE-1))      /* msb of
        !           115:                                                                   MULTUNIT */
1.1.1.6   root      116: #define DMULTUNIT_msb   (1L << (2*MULTUNITSIZE-1))
                    117: #define MULTUNIT_mask   ((MULTUNIT)((1L << MULTUNITSIZE)-1))
                    118: #define MULTUNITs_perunit   (UNITSIZE/MULTUNITSIZE)
                    119: 
                    120: 
1.1.1.7 ! root      121: void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier);
        !           122: void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier);
1.1.1.6   root      123: 
1.1.1.7 ! root      124: short global_precision = 0;    /* units of precision for all routines */
        !           125: /*      global_precision is the unit precision last set by set_precision.
        !           126:    Initially, set_precision() should be called to define global_precision
        !           127:    before using any of these other multiprecision library routines.
        !           128:    i.e.:   set_precision(MAX_UNIT_PRECISION);
        !           129:  */
1.1.1.6   root      130: 
                    131: /*************** multiprecision library primitives ****************/
1.1.1.7 ! root      132: /*      The following portable C primitives should be recoded in assembly.
        !           133:    The entry point name should be defined, in "mpilib.h" to the external
        !           134:    entry point name.  If undefined, the C version will be used.
        !           135:  */
1.1.1.6   root      136: 
                    137: typedef unsigned long int ulint;
                    138: 
                    139: #ifndef mp_addc
1.1.1.7 ! root      140: /* 
        !           141:    multiprecision add with carry r2 to r1, result in r1
        !           142:    carry is incoming carry flag-- value should be 0 or 1
        !           143: */
1.1.1.6   root      144: boolean mp_addc
1.1.1.7 ! root      145:  (register unitptr r1, register unitptr r2, register boolean carry)
        !           146: {
        !           147:     register unit x;
        !           148:     short precision;           /* number of units to add */
        !           149:     precision = global_precision;
        !           150:     make_lsbptr(r1, precision);
        !           151:     make_lsbptr(r2, precision);
        !           152:     while (precision--) {
        !           153:        if (carry) {
        !           154:            x = *r1 + *r2 + 1;
        !           155:            carry = (*r2 >= (unit) (~*r1));
        !           156:        } else {
        !           157:            x = *r1 + *r2;
        !           158:            carry = (x < *r1);
1.1.1.6   root      159:        }
1.1.1.7 ! root      160:        post_higherunit(r2);
        !           161:        *post_higherunit(r1) = x;
        !           162:     }
        !           163:     return carry;              /* return the final carry flag bit */
        !           164: }                              /* mp_addc */
        !           165: #endif                         /* mp_addc */
1.1.1.6   root      166: 
                    167: #ifndef mp_subb
1.1.1.7 ! root      168: 
        !           169: /* 
        !           170:    multiprecision subtract with borrow, r2 from r1, result in r1
        !           171:    borrow is incoming borrow flag-- value should be 0 or 1
        !           172:  */
1.1.1.6   root      173: boolean mp_subb
1.1.1.7 ! root      174:  (register unitptr r1, register unitptr r2, register boolean borrow)
        !           175: {
        !           176:     register unit x;
        !           177:     short precision;           /* number of units to subtract */
        !           178:     precision = global_precision;
        !           179:     make_lsbptr(r1, precision);
        !           180:     make_lsbptr(r2, precision);
        !           181:     while (precision--) {
        !           182:        if (borrow) {
        !           183:            x = *r1 - *r2 - 1;
        !           184:            borrow = (*r1 <= *r2);
        !           185:        } else {
        !           186:            x = *r1 - *r2;
        !           187:            borrow = (*r1 < *r2);
1.1.1.6   root      188:        }
1.1.1.7 ! root      189:        post_higherunit(r2);
        !           190:        *post_higherunit(r1) = x;
        !           191:     }
        !           192:     return borrow;             /* return the final carry/borrow flag bit */
        !           193: }                              /* mp_subb */
        !           194: #endif                         /* mp_subb */
1.1.1.6   root      195: 
                    196: #ifndef mp_rotate_left
1.1.1.7 ! root      197: 
        !           198: /*
        !           199:    multiprecision rotate left 1 bit with carry, result in r1.
        !           200:    carry is incoming carry flag-- value should be 0 or 1
        !           201: */
        !           202: boolean mp_rotate_left(register unitptr r1, register boolean carry)
1.1.1.6   root      203: {
1.1.1.7 ! root      204:     register int precision;    /* number of units to rotate */
        !           205:     unsigned int mcarry = carry, nextcarry;    /* int is supposed to be
1.1.1.6   root      206:                                                 * the efficient size for ops*/
1.1.1.7 ! root      207:     precision = global_precision;
        !           208:     make_lsbptr(r1, precision);
        !           209:     while (precision--) {
        !           210:        nextcarry = (((signedunit) * r1) < 0);
        !           211:        *r1 = (*r1 << 1) | mcarry;
        !           212:        mcarry = nextcarry;
        !           213:        pre_higherunit(r1);
        !           214:     }
        !           215:     return nextcarry;          /* return the final carry flag bit */
        !           216: }                              /* mp_rotate_left */
        !           217: #endif                         /* mp_rotate_left */
1.1.1.6   root      218: 
                    219: /************** end of primitives that should be in assembly *************/
                    220: 
1.1.1.7 ! root      221: /* The mp_shift_right_bits function is not called in any time-critical
        !           222:    situations in public-key cryptographic functions, so it doesn't
        !           223:    need to be coded in assembly language.
        !           224:  */
1.1.1.6   root      225: 
1.1.1.7 ! root      226: /*
        !           227:    multiprecision shift right bits, result in r1.
        !           228:    bits is how many bits to shift, must be <= UNITSIZE.
1.1.1.6   root      229: */
1.1.1.7 ! root      230: void mp_shift_right_bits(register unitptr r1, register short bits)
        !           231: {
        !           232:     register short precision;  /* number of units to shift */
        !           233:     register unit carry, nextcarry, bitmask;
        !           234:     register short unbits;
        !           235:     if (bits == 0)
        !           236:        return;                 /* shift zero bits is a no-op */
        !           237:     carry = 0;
        !           238:     bitmask = power_of_2(bits) - 1;
        !           239:     unbits = UNITSIZE - bits;  /* shift bits must be <= UNITSIZE */
        !           240:     precision = global_precision;
        !           241:     make_msbptr(r1, precision);
        !           242:     if (bits == UNITSIZE) {
        !           243:        while (precision--) {
        !           244:            nextcarry = *r1;
        !           245:            *r1 = carry;
        !           246:            carry = nextcarry;
        !           247:            pre_lowerunit(r1);
1.1.1.6   root      248:        }
1.1.1.7 ! root      249:     } else {
        !           250:        while (precision--) {
        !           251:            nextcarry = *r1 & bitmask;
        !           252:            *r1 >>= bits;
        !           253:            *r1 |= carry << unbits;
        !           254:            carry = nextcarry;
        !           255:            pre_lowerunit(r1);
        !           256:        }
        !           257:     }
        !           258: }                              /* mp_shift_right_bits */
1.1.1.6   root      259: 
                    260: #ifndef mp_compare
1.1.1.7 ! root      261: 
1.1.1.6   root      262: /*
                    263:  * Compares multiprecision integers *r1, *r2, and returns:
                    264:  * -1 iff *r1 < *r2
                    265:  *  0 iff *r1 == *r2
                    266:  * +1 iff *r1 > *r2
                    267:  */
1.1.1.7 ! root      268: short mp_compare(register unitptr r1, register unitptr r2)
1.1.1.6   root      269: {
1.1.1.7 ! root      270:     register short precision;  /* number of units to compare */
1.1.1.6   root      271: 
1.1.1.7 ! root      272:     precision = global_precision;
        !           273:     make_msbptr(r1, precision);
        !           274:     make_msbptr(r2, precision);
        !           275:     do {
        !           276:        if (*r1 < *r2)
        !           277:            return -1;
        !           278:        if (*post_lowerunit(r1) > *post_lowerunit(r2))
        !           279:            return 1;
        !           280:     } while (--precision);
        !           281:     return 0;                  /*  *r1 == *r2  */
        !           282: }                              /* mp_compare */
        !           283: #endif                         /* mp_compare */
1.1.1.6   root      284: 
                    285: /* Increment multiprecision integer r. */
1.1.1.7 ! root      286: boolean mp_inc(register unitptr r)
1.1.1.6   root      287: {
1.1.1.7 ! root      288:     register short precision;
        !           289:     precision = global_precision;
        !           290:     make_lsbptr(r, precision);
        !           291:     do {
        !           292:        if (++(*r))
        !           293:            return 0;           /* no carry */
        !           294:        post_higherunit(r);
        !           295:     } while (--precision);
        !           296:     return 1;                  /* return carry set */
        !           297: }                              /* mp_inc */
1.1.1.6   root      298: 
                    299: /* Decrement multiprecision integer r. */
1.1.1.7 ! root      300: boolean mp_dec(register unitptr r)
1.1.1.6   root      301: {
1.1.1.7 ! root      302:     register short precision;
        !           303:     precision = global_precision;
        !           304:     make_lsbptr(r, precision);
        !           305:     do {
        !           306:        if ((signedunit) (--(*r)) != (signedunit) - 1)
        !           307:            return 0;           /* no borrow */
        !           308:        post_higherunit(r);
        !           309:     } while (--precision);
        !           310:     return 1;                  /* return borrow set */
        !           311: }                              /* mp_dec */
1.1.1.6   root      312: 
1.1.1.7 ! root      313: /* Compute 2's complement, the arithmetic negative, of r */
1.1.1.6   root      314: void mp_neg(register unitptr r)
                    315: {
1.1.1.7 ! root      316:     register short precision;  /* number of units to negate */
        !           317:     precision = global_precision;
        !           318:     mp_dec(r);                 /* 2's complement is 1's complement plus 1 */
        !           319:     do {                       /* now do 1's complement */
        !           320:        *r = ~(*r);
        !           321:        r++;
        !           322:     } while (--precision);
        !           323: }                              /* mp_neg */
1.1.1.6   root      324: 
                    325: #ifndef mp_move
1.1.1.7 ! root      326: 
        !           327: void mp_move(register unitptr dst, register unitptr src)
1.1.1.6   root      328: {
1.1.1.7 ! root      329:     register short precision;  /* number of units to move */
        !           330:     precision = global_precision;
        !           331:     do {
        !           332:        *dst++ = *src++;
        !           333:     } while (--precision);
        !           334: }                              /* mp_move */
        !           335: #endif                         /* mp_move */
1.1.1.6   root      336: 
1.1.1.7 ! root      337: /* Init multiprecision register r with short value. */
1.1.1.6   root      338: void mp_init(register unitptr r, word16 value)
                    339: {      /* Note that mp_init doesn't extend sign bit for >32767 */
                    340: 
1.1.1.7 ! root      341:     unitfill0(r, global_precision);
        !           342:     make_lsbptr(r, global_precision);
        !           343:     *post_higherunit(r) = value;
1.1.1.6   root      344: #ifdef UNIT8
1.1.1.7 ! root      345:     *post_higherunit(r) = value >> UNITSIZE;
1.1.1.6   root      346: #endif
1.1.1.7 ! root      347: }                              /* mp_init */
1.1.1.6   root      348: 
1.1.1.7 ! root      349: /* Returns number of significant units in r */
1.1.1.6   root      350: short significance(register unitptr r)
                    351: {
1.1.1.7 ! root      352:     register short precision;
        !           353:     precision = global_precision;
        !           354:     make_msbptr(r, precision);
        !           355:     do {
        !           356:        if (*post_lowerunit(r))
        !           357:            return precision;
        !           358:     } while (--precision);
        !           359:     return precision;
        !           360: }                              /* significance */
1.1.1.6   root      361: 
                    362: 
                    363: #ifndef unitfill0
                    364: 
1.1.1.7 ! root      365: /* Zero-fill the unit buffer r. */
        !           366: void unitfill0(unitptr r, word16 unitcount)
        !           367: {
        !           368:     while (unitcount--)
        !           369:        *r++ = 0;
        !           370: }                              /* unitfill0 */
        !           371: #endif                         /* unitfill0 */
1.1.1.6   root      372: 
1.1.1.7 ! root      373: /* Unsigned divide, treats both operands as positive. */
        !           374: int mp_udiv(register unitptr remainder, register unitptr quotient,
        !           375:            register unitptr dividend, register unitptr divisor)
1.1.1.6   root      376: {
1.1.1.7 ! root      377:     int bits;
        !           378:     short dprec;
        !           379:     register unit bitmask;
        !           380: 
        !           381:     if (testeq(divisor, 0))
        !           382:        return -1;              /* zero divisor means divide error */
        !           383:     mp_init0(remainder);
        !           384:     mp_init0(quotient);
        !           385:     /* normalize and compute number of bits in dividend first */
        !           386:     init_bitsniffer(dividend, bitmask, dprec, bits);
        !           387:     /* rescale quotient to same precision (dprec) as dividend */
        !           388:     rescale(quotient, global_precision, dprec);
        !           389:     make_msbptr(quotient, dprec);
        !           390: 
        !           391:     while (bits--) {
        !           392:        mp_rotate_left(remainder,
        !           393:                       (boolean) (sniff_bit(dividend, bitmask) != 0));
        !           394:        if (mp_compare(remainder, divisor) >= 0) {
        !           395:            mp_sub(remainder, divisor);
        !           396:            stuff_bit(quotient, bitmask);
        !           397:        }
        !           398:        bump_2bitsniffers(dividend, quotient, bitmask);
        !           399:     }
        !           400:     return 0;
        !           401: }                              /* mp_udiv */
1.1.1.6   root      402: 
1.1.1.7 ! root      403: #ifdef UPTON_OR_SMITH
1.1.1.6   root      404: 
1.1.1.7 ! root      405: #define RECIPMARGIN 0          /* extra margin bits used by mp_recip() */
1.1.1.6   root      406: 
1.1.1.7 ! root      407: /* Compute reciprocal (quotient) as 1/divisor.  Used by faster modmult. */
        !           408: int mp_recip(register unitptr quotient, register unitptr divisor)
        !           409: {
        !           410:     int bits;
        !           411:     short qprec;
        !           412:     register unit bitmask;
        !           413:     unit remainder[MAX_UNIT_PRECISION];
        !           414:     if (testeq(divisor, 0))
        !           415:        return -1;              /* zero divisor means divide error */
        !           416:     mp_init0(remainder);
        !           417:     mp_init0(quotient);
        !           418: 
        !           419:     /* normalize and compute number of bits in quotient first */
        !           420:     bits = countbits(divisor) + RECIPMARGIN;
        !           421:     bitmask = bitmsk(bits);    /* bitmask within a single unit */
        !           422:     qprec = bits2units(bits + 1);
        !           423:     mp_setbit(remainder, (bits - RECIPMARGIN) - 1);
        !           424:     /* rescale quotient to precision of divisor + RECIPMARGIN bits */
        !           425:     rescale(quotient, global_precision, qprec);
        !           426:     make_msbptr(quotient, qprec);
        !           427: 
        !           428:     while (bits--) {
        !           429:        mp_shift_left(remainder);
        !           430:        if (mp_compare(remainder, divisor) >= 0) {
        !           431:            mp_sub(remainder, divisor);
        !           432:            stuff_bit(quotient, bitmask);
        !           433:        }
        !           434:        bump_bitsniffer(quotient, bitmask);
        !           435:     }
        !           436:     mp_init0(remainder);       /* burn sensitive data left on stack */
        !           437:     return 0;
        !           438: }                              /* mp_recip */
        !           439: #endif                         /* UPTON_OR_SMITH */
        !           440: 
        !           441: /* Signed divide, either or both operands may be negative. */
        !           442: int mp_div(register unitptr remainder, register unitptr quotient,
        !           443:           register unitptr dividend, register unitptr divisor)
        !           444: {
        !           445:     boolean dvdsign, dsign;
        !           446:     int status;
        !           447:     dvdsign = (boolean) (mp_tstminus(dividend) != 0);
        !           448:     dsign = (boolean) (mp_tstminus(divisor) != 0);
        !           449:     if (dvdsign)
        !           450:        mp_neg(dividend);
        !           451:     if (dsign)
        !           452:        mp_neg(divisor);
        !           453:     status = mp_udiv(remainder, quotient, dividend, divisor);
        !           454:     if (dvdsign)
        !           455:        mp_neg(dividend);       /* repair caller's dividend */
        !           456:     if (dsign)
        !           457:        mp_neg(divisor);        /* repair caller's divisor */
        !           458:     if (status < 0)
        !           459:        return status;          /* divide error? */
        !           460:     if (dvdsign)
        !           461:        mp_neg(remainder);
        !           462:     if (dvdsign ^ dsign)
        !           463:        mp_neg(quotient);
        !           464:     return status;
        !           465: }                              /* mp_div */
1.1.1.6   root      466: 
                    467: /*
                    468:  * This function does a fast divide and mod on a multiprecision dividend
                    469:  * using a short integer divisor returning a short integer remainder.
                    470:  * This is an unsigned divide.  It treats both operands as positive.
                    471:  * It is used mainly for faster printing of large numbers in base 10.
                    472:  */
1.1.1.7 ! root      473: word16 mp_shortdiv(register unitptr quotient,
        !           474:                   register unitptr dividend, register word16 divisor)
1.1.1.6   root      475: {
1.1.1.7 ! root      476:     int bits;
        !           477:     short dprec;
        !           478:     register unit bitmask;
        !           479:     register word16 remainder;
        !           480:     if (!divisor)              /* if divisor == 0 */
        !           481:        return -1;              /* zero divisor means divide error */
        !           482:     remainder = 0;
        !           483:     mp_init0(quotient);
        !           484:     /* normalize and compute number of bits in dividend first */
        !           485:     init_bitsniffer(dividend, bitmask, dprec, bits);
        !           486:     /* rescale quotient to same precision (dprec) as dividend */
        !           487:     rescale(quotient, global_precision, dprec);
        !           488:     make_msbptr(quotient, dprec);
        !           489: 
        !           490:     while (bits--) {
        !           491:        remainder <<= 1;
        !           492:        if (sniff_bit(dividend, bitmask))
        !           493:            remainder++;
        !           494:        if (remainder >= divisor) {
        !           495:            remainder -= divisor;
        !           496:            stuff_bit(quotient, bitmask);
        !           497:        }
        !           498:        bump_2bitsniffers(dividend, quotient, bitmask);
        !           499:     }
        !           500:     return remainder;
        !           501: }                              /* mp_shortdiv */
1.1.1.6   root      502: 
                    503: /* Unsigned divide, treats both operands as positive. */
1.1.1.7 ! root      504: int mp_mod(register unitptr remainder,
        !           505:           register unitptr dividend, register unitptr divisor)
1.1.1.6   root      506: {
1.1.1.7 ! root      507:     int bits;
        !           508:     short dprec;
        !           509:     register unit bitmask;
        !           510:     if (testeq(divisor, 0))
        !           511:        return -1;              /* zero divisor means divide error */
        !           512:     mp_init0(remainder);
        !           513:     /* normalize and compute number of bits in dividend first */
        !           514:     init_bitsniffer(dividend, bitmask, dprec, bits);
        !           515: 
        !           516:     while (bits--) {
        !           517:        mp_rotate_left(remainder,
        !           518:                       (boolean) (sniff_bit(dividend, bitmask) != 0));
        !           519:        msub(remainder, divisor);
        !           520:        bump_bitsniffer(dividend, bitmask);
        !           521:     }
        !           522:     return 0;
        !           523: }                              /* mp_mod */
1.1.1.6   root      524: 
                    525: /*
                    526:  * This function does a fast mod operation on a multiprecision dividend
                    527:  * using a short integer modulus returning a short integer remainder.
                    528:  * This is an unsigned divide.  It treats both operands as positive.
                    529:  * It is used mainly for fast sieve searches for large primes.
                    530:  */
1.1.1.7 ! root      531: word16 mp_shortmod(register unitptr dividend, register word16 divisor)
        !           532: {
        !           533:     int bits;
        !           534:     short dprec;
        !           535:     register unit bitmask;
        !           536:     register word16 remainder;
        !           537:     if (!divisor)              /* if divisor == 0 */
        !           538:        return -1;              /* zero divisor means divide error */
        !           539:     remainder = 0;
        !           540:     /* normalize and compute number of bits in dividend first */
        !           541:     init_bitsniffer(dividend, bitmask, dprec, bits);
        !           542: 
        !           543:     while (bits--) {
        !           544:        remainder <<= 1;
        !           545:        if (sniff_bit(dividend, bitmask))
        !           546:            remainder++;
        !           547:        if (remainder >= divisor)
        !           548:            remainder -= divisor;
        !           549:        bump_bitsniffer(dividend, bitmask);
        !           550:     }
        !           551:     return remainder;
        !           552: }                              /* mp_shortmod */
1.1.1.6   root      553: 
                    554: 
                    555: 
1.1.1.7 ! root      556: #ifdef COMB_MULT               /* use faster "comb" multiply algorithm */
        !           557: /* We are skipping this code because it has a bug... */
1.1.1.6   root      558: 
                    559: /*
                    560:  * Computes multiprecision prod = multiplicand * multiplier
                    561:  *
                    562:  * Uses interleaved comb multiply algorithm.
                    563:  * This improved multiply more than twice as fast as a Russian
                    564:  * peasant multiply, because it does a lot fewer shifts.
                    565:  * Must have global_precision set to the size of the multiplicand
                    566:  * plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum
                    567:  * of the lengths of the multiplier and multiplicand.
                    568:  *
                    569:  * BUG ALERT:  Unfortunately, this code has a bug.  It fails for
                    570:  * some numbers.  One such example:
                    571:  * x=   59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
                    572:  * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
                    573:  *      52C8 CDC7 AFB3 61C8 243C 741B
                    574:  * --which is obviously wrong.  The answer should be:
                    575:  * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
                    576:  *      C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
                    577:  * We'll have to fix this some day.  In the meantime, we'll
                    578:  * just have the compiler skip over this code.
                    579:  *
                    580:  * BUG NOTE: Possibly fixed.  Needs testing.
                    581:  */
1.1.1.7 ! root      582: int mp_mult(register unitptr prod,
        !           583:            register unitptr multiplicand, register unitptr multiplier)
1.1.1.6   root      584: {
1.1.1.7 ! root      585:     int bits;
        !           586:     register unit bitmask;
        !           587:     unitptr product, mplier, temp;
        !           588:     short mprec, mprec2;
        !           589:     unit mplicand[MAX_UNIT_PRECISION];
1.1.1.6   root      590: 
1.1.1.7 ! root      591:     /* better clear full width--double precision */
        !           592:     mp_init(prod + tohigher(global_precision), 0);
1.1.1.6   root      593: 
1.1.1.7 ! root      594:     if (testeq(multiplicand, 0))
        !           595:        return 0;               /* zero multiplicand means zero product */
1.1.1.6   root      596: 
1.1.1.7 ! root      597:     mp_move(mplicand, multiplicand);   /* save it from damage */
1.1.1.6   root      598: 
1.1.1.7 ! root      599:     normalize(multiplier, mprec);
        !           600:     if (!mprec)
        !           601:        return 0;
1.1.1.6   root      602: 
1.1.1.7 ! root      603:     make_lsbptr(multiplier, mprec);
        !           604:     bitmask = 1;               /* start scan at LSB of multiplier */
1.1.1.6   root      605: 
1.1.1.7 ! root      606:     do {                       /* UNITSIZE times */
        !           607:        /* do for bits 0-15 */
        !           608:        product = prod;
        !           609:        mplier = multiplier;
        !           610:        mprec2 = mprec;
        !           611:        while (mprec2--) {      /* do for each word in multiplier */
        !           612: 
        !           613:            if (sniff_bit(mplier, bitmask)) {
        !           614:                if (mp_addc(product, multiplicand, 0)) {  /* ripple carry */
        !           615:                  /* After 1st time thru, this is rarely encountered. */
        !           616:                    temp = msbptr(product, global_precision);
        !           617:                    pre_higherunit(temp);
        !           618:                    /* temp now points to LSU of carry region. */
        !           619:                    unmake_lsbptr(temp, global_precision);
        !           620:                    mp_inc(temp);
        !           621:                }               /* ripple carry */
        !           622:            }
        !           623:            pre_higherunit(mplier);
        !           624:            pre_higherunit(product);
        !           625:        }
        !           626:        if (!(bitmask <<= 1))
        !           627:            break;
        !           628:        mp_shift_left(multiplicand);
        !           629: 
        !           630:     } while (TRUE);
        !           631: 
        !           632:     mp_move(multiplicand, mplicand);   /* recover */
        !           633: 
        !           634:     return 0;                  /* normal return */
        !           635: }                              /* mp_mult */
        !           636: 
        !           637: #endif                         /* COMB_MULT */
        !           638: 
        !           639: 
        !           640: /* Because the "comb" multiply has a bug, we will use the slower
        !           641:    Russian peasant multiply instead.  Fortunately, the mp_mult
        !           642:    function is not called from any time-critical code.
        !           643:  */
1.1.1.6   root      644: 
                    645: /*
                    646:  * Computes multiprecision prod = multiplicand * multiplier
                    647:  * Uses "Russian peasant" multiply algorithm.
                    648:  */
1.1.1.7 ! root      649: int mp_mult(register unitptr prod,
        !           650:            register unitptr multiplicand, register unitptr multiplier)
1.1.1.6   root      651: {
1.1.1.7 ! root      652:     int bits;
        !           653:     register unit bitmask;
        !           654:     short mprec;
        !           655:     mp_init(prod, 0);
        !           656:     if (testeq(multiplicand, 0))
        !           657:        return 0;               /* zero multiplicand means zero product */
        !           658:     /* normalize and compute number of bits in multiplier first */
        !           659:     init_bitsniffer(multiplier, bitmask, mprec, bits);
        !           660: 
        !           661:     while (bits--) {
        !           662:        mp_shift_left(prod);
        !           663:        if (sniff_bit(multiplier, bitmask))
        !           664:            mp_add(prod, multiplicand);
        !           665:        bump_bitsniffer(multiplier, bitmask);
        !           666:     }
        !           667:     return 0;
        !           668: }                              /* mp_mult */
1.1.1.6   root      669: 
                    670: /*
                    671:  * mp_modmult computes a multiprecision multiply combined with a
                    672:  * modulo operation.  This is the most time-critical function in
                    673:  * this multiprecision arithmetic library for performing modulo
                    674:  * exponentiation.  We experimented with different versions of modmult,
                    675:  * depending on the machine architecture and performance requirements.
                    676:  * We will either use a Russian Peasant modmult (peasant_modmult), 
                    677:  * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
                    678:  * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
                    679:  * On machines with a hardware atomic multiply instruction,
                    680:  * Smith's modmult is fastest.  It can utilize assembly subroutines to
                    681:  * speed up the hardware multiply logic and trial quotient calculation.
                    682:  * If the machine lacks a fast hardware multiply, Merritt's modmult
                    683:  * is preferred, which doesn't call any assembly multiply routine.
                    684:  * We use the alias names mp_modmult, stage_modulus, and modmult_burn
                    685:  * for the corresponding true names, which depend on what flavor of
                    686:  * modmult we are using.
                    687:  * 
                    688:  * Before making the first call to mp_modmult, you must set up the
                    689:  * modulus-dependant precomputated tables by calling stage_modulus.
                    690:  * After making all the calls to mp_modmult, you call modmult_burn to
                    691:  * erase the tables created by stage_modulus that were left in memory.
                    692:  */
                    693: 
                    694: #ifdef COUNTMULTS
                    695: /* "number of modmults" counters, used for performance studies. */
                    696: static unsigned int tally_modmults = 0;
                    697: static unsigned int tally_modsquares = 0;
1.1.1.7 ! root      698: #endif                         /* COUNTMULTS */
1.1.1.6   root      699: 
                    700: 
                    701: #ifdef PEASANT
                    702: /* Conventional Russian peasant multiply with modulo algorithm. */
                    703: 
                    704: static unitptr pmodulus = 0;   /* used only by mp_modmult */
                    705: 
                    706: /*
                    707:  * Must pass modulus to stage_modulus before calling modmult.
                    708:  * Assumes that global_precision has already been adjusted to the
                    709:  * size of the modulus, plus SLOP_BITS.
                    710:  */
1.1.1.7 ! root      711: int stage_peasant_modulus(unitptr n)
1.1.1.6   root      712: {      /* For this simple version of modmult, just copy unit pointer. */
1.1.1.7 ! root      713:     pmodulus = n;
        !           714:     return 0;                  /* normal return */
        !           715: }                              /* stage_peasant_modulus */
1.1.1.6   root      716: 
                    717: /* "Russian peasant" multiply algorithm, combined with a modulo
                    718:  * operation.  This is a simple naive replacement for Merritt's
                    719:  * faster modmult algorithm.  References global unitptr "modulus".
                    720:  * Computes:  prod = (multiplicand*multiplier) mod modulus
                    721:  * WARNING: All the arguments must be less than the modulus!
                    722:  */
1.1.1.7 ! root      723: int peasant_modmult(register unitptr prod,
        !           724:                    unitptr multiplicand, register unitptr multiplier)
1.1.1.6   root      725: {
1.1.1.7 ! root      726:     int bits;
        !           727:     register unit bitmask;
        !           728:     short mprec;
        !           729:     mp_init(prod, 0);
        !           730: /*      if (testeq(multiplicand,0))
        !           731:        return 0; *//* zero multiplicand means zero product */
        !           732:     /* normalize and compute number of bits in multiplier first */
        !           733:     init_bitsniffer(multiplier, bitmask, mprec, bits);
        !           734: 
        !           735:     while (bits--) {
        !           736:        mp_shift_left(prod);
        !           737:        msub(prod, pmodulus);   /* turns mult into modmult */
        !           738:        if (sniff_bit(multiplier, bitmask)) {
        !           739:            mp_add(prod, multiplicand);
        !           740:            msub(prod, pmodulus);       /* turns mult into modmult */
        !           741:        }
        !           742:        bump_bitsniffer(multiplier, bitmask);
        !           743:     }
        !           744:     return 0;
        !           745: }                              /* peasant_modmult */
1.1.1.6   root      746: 
                    747: /*
                    748:  * If we are using a version of mp_modmult that uses internal tables
                    749:  * in memory, we have to call modmult_burn() at the end of mp_modexp.
                    750:  * This is so that no sensitive data is left in memory after the program
                    751:  * exits.  The Russian peasant method doesn't use any such tables.
                    752:  */
1.1.1.7 ! root      753: 
1.1.1.6   root      754: /*
                    755:  * Alias for modmult_burn, called only from mp_modexp().  Destroys
                    756:  * internal modmult tables.  This version does nothing because no
                    757:  * tables are used by the Russian peasant modmult.
                    758:  */
1.1.1.7 ! root      759: void peasant_burn(void)
        !           760: {
        !           761: }                              /* peasant_burn */
1.1.1.6   root      762: 
1.1.1.7 ! root      763: #endif                         /* PEASANT */
1.1.1.6   root      764: 
                    765: 
                    766: #ifdef MERRITT
                    767: /*=========================================================================*/
                    768: /*
1.1.1.7 ! root      769:    This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
        !           770:    Also refined by Zhahai Stewart to reduce the number of subtracts.
        !           771:    Modified by Raymond Brand to reduce the number of SLOP_BITS by 1.
        !           772:    It performs a multiply combined with a modulo operation, without
        !           773:    going into "double precision".  It is faster than the Russian peasant
        !           774:    method, and still works well on machines that lack a fast hardware
        !           775:    multiply instruction.
        !           776:  */
1.1.1.6   root      777: 
1.1.1.7 ! root      778: /* The following support functions, macros, and data structures
        !           779:    are used only by Merritt's modmult algorithm... */
1.1.1.6   root      780: 
1.1.1.7 ! root      781: /*      Shift r1 1 whole unit to the left.  Used only by modmult function. */
1.1.1.6   root      782: static void mp_lshift_unit(register unitptr r1)
                    783: {
1.1.1.7 ! root      784:     register short precision;
        !           785:     register unitptr r2;
        !           786:     precision = global_precision;
        !           787:     make_msbptr(r1, precision);
        !           788:     r2 = r1;
        !           789:     while (--precision)
        !           790:        *post_lowerunit(r1) = *pre_lowerunit(r2);
        !           791:     *r1 = 0;
        !           792: }                              /* mp_lshift_unit */
1.1.1.6   root      793: 
                    794: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
1.1.1.7 ! root      795: static unit moduli_buf[UNITSIZE - 1][MAX_UNIT_PRECISION] =
        !           796: {0};
        !           797: static unitptr moduli[UNITSIZE] =      /* contains pointers into moduli_buf */
        !           798: {0, moduli_buf[0], moduli_buf[1], moduli_buf[2],
        !           799:  moduli_buf[3], moduli_buf[4], moduli_buf[5], moduli_buf[6]
1.1.1.6   root      800: #ifndef UNIT8
1.1.1.7 ! root      801:  ,moduli_buf[7], moduli_buf[8], moduli_buf[9], moduli_buf[10],
        !           802:  moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14]
        !           803: #ifndef UNIT16                 /* and not UNIT8 */
        !           804:  ,moduli_buf[15], moduli_buf[16], moduli_buf[17], moduli_buf[18],
        !           805:  moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22],
        !           806:  moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26],
        !           807:  moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30]
        !           808: #endif                         /* UNIT16 and UNIT8 not defined */
        !           809: #endif                         /* UNIT8 not defined */
1.1.1.6   root      810: };
                    811: 
                    812: /*
                    813:  * To optimize msubs, need following 2 unit arrays, each filled
                    814:  * with the most significant unit and next-to-most significant unit
                    815:  * of the preshifted images of the modulus.
                    816:  */
1.1.1.7 ! root      817: static unit msu_moduli[UNITSIZE] =
        !           818: {0};                           /* most signif. unit */
        !           819: static unit nmsu_moduli[UNITSIZE] =
        !           820: {0};                           /* next-most signif. unit */
1.1.1.6   root      821: 
                    822: /*
                    823:  * mpdbuf contains preshifted images of the multiplicand, mod n.
                    824:  * It is used only by mp_modmult.  It could be staticly declared
                    825:  * inside of mp_modmult, but we put it outside mp_modmult so that
                    826:  * it can be wiped clean by modmult_burn(), which is called at the
                    827:  * end of mp_modexp.  This is so that no sensitive data is left in
                    828:  * memory after the program exits.
                    829:  */
1.1.1.7 ! root      830: static unit mpdbuf[UNITSIZE - 1][MAX_UNIT_PRECISION] =
        !           831: {0};
1.1.1.6   root      832: 
                    833: /*
                    834:  * Computes UNITSIZE images of r, each shifted left 1 more bit.
                    835:  * Used only by modmult function.
                    836:  */
1.1.1.7 ! root      837: static void stage_mp_images(unitptr images[UNITSIZE], unitptr r)
1.1.1.6   root      838: {
1.1.1.7 ! root      839:     short int i;
1.1.1.6   root      840: 
1.1.1.7 ! root      841:     images[0] = r;     /* no need to move the first image, just copy ptr */
        !           842:     for (i = 1; i < UNITSIZE; i++) {
        !           843:        mp_move(images[i], images[i - 1]);
        !           844:        mp_shift_left(images[i]);
        !           845:        msub(images[i], moduli[0]);
        !           846:     }
        !           847: }                              /* stage_mp_images */
1.1.1.6   root      848: 
                    849: /*
                    850:  * Computes UNITSIZE images of modulus, each shifted left 1 more bit.
                    851:  * Before calling mp_modmult, you must first stage the moduli images by
                    852:  * calling stage_modulus.  n is the pointer to the modulus.
                    853:  * Assumes that global_precision has already been adjusted to the
                    854:  * size of the modulus, plus SLOP_BITS.
                    855:  */
1.1.1.7 ! root      856: int stage_merritt_modulus(unitptr n)
1.1.1.6   root      857: {
1.1.1.7 ! root      858:     short int i;
        !           859:     unitptr msu;       /* ptr to most significant unit, for faster msubs */
1.1.1.6   root      860: 
1.1.1.7 ! root      861:     moduli[0] = n;     /* no need to move the first image, just copy ptr */
1.1.1.6   root      862: 
1.1.1.7 ! root      863:     /* used by optimized msubs macro... */
        !           864:     msu = msbptr(n, global_precision); /* needed by msubs */
        !           865:     msu_moduli[0] = *post_lowerunit(msu);
        !           866:     nmsu_moduli[0] = *msu;
        !           867: 
        !           868:     for (i = 1; i < UNITSIZE; i++) {
        !           869:        mp_move(moduli[i], moduli[i - 1]);
        !           870:        mp_shift_left(moduli[i]);
1.1.1.6   root      871: 
1.1.1.7 ! root      872:        /* used by optimized msubs macro... */
        !           873:        msu = msbptr(moduli[i], global_precision);      /* needed by msubs */
        !           874:        msu_moduli[i] = *post_lowerunit(msu);   /* for faster msubs */
        !           875:        nmsu_moduli[i] = *msu;
        !           876:     }
        !           877:     return 0;                  /* normal return */
        !           878: }                              /* stage_merritt_modulus */
1.1.1.6   root      879: 
                    880: /* The following macros, sniffadd and msubs, are used by modmult... */
                    881: #define sniffadd(i) if (*multiplier & power_of_2(i))  mp_add(prod,mpd[i])
                    882: 
                    883: /* Unoptimized msubs macro (msubs0) follows... */
                    884: /* #define msubs0(i) msub(prod,moduli[i])
1.1.1.7 ! root      885:  */
1.1.1.6   root      886: 
                    887: /*
                    888:  * To optimize msubs, compare the most significant units of the
                    889:  * product and the shifted modulus before deciding to call the full
                    890:  * mp_compare routine.  Better still, compare the upper two units
                    891:  * before deciding to call mp_compare.
                    892:  * Optimization of msubs relies heavily on standard C left-to-right
                    893:  * parsimonious evaluation of logical expressions.
                    894:  */
                    895: 
                    896: /* Partially-optimized msubs macro (msubs1) follows... */
                    897: /* #define msubs1(i) if ( \
1.1.1.7 ! root      898:    ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
        !           899:    (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
        !           900:    ) mp_sub(prod,moduli[i])
        !           901:  */
1.1.1.6   root      902: 
                    903: /* Fully-optimized msubs macro (msubs2) follows... */
                    904: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
                    905:  (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
                    906:  (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
                    907:  mp_sub(prod,moduli[i])
                    908: 
                    909: /*
                    910:  * Performs combined multiply/modulo operation.
                    911:  * Computes:  prod = (multiplicand*multiplier) mod modulus
                    912:  * WARNING: All the arguments must be less than the modulus!
                    913:  * Assumes the modulus has been predefined by first calling
                    914:  * stage_modulus.
                    915:  */
1.1.1.7 ! root      916: int merritt_modmult(register unitptr prod,
        !           917:                    unitptr multiplicand, register unitptr multiplier)
1.1.1.6   root      918: {
1.1.1.7 ! root      919:     /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro... */
        !           920:     register signedunit p_m;
        !           921:     register unitptr msu_prod; /* ptr to most significant unit of product */
        !           922:     register unitptr nmsu_prod;        /* next-most signif. unit of product */
        !           923:     short mprec;               /* precision of multiplier, in units */
        !           924:     /*      Array mpd contains a list of pointers to preshifted images of
        !           925:        the multiplicand: */
        !           926:     static unitptr mpd[UNITSIZE] =
        !           927:     {
        !           928:        0, mpdbuf[0], mpdbuf[1], mpdbuf[2],
        !           929:        mpdbuf[3], mpdbuf[4], mpdbuf[5], mpdbuf[6]
1.1.1.6   root      930: #ifndef UNIT8
1.1.1.7 ! root      931:        ,mpdbuf[7], mpdbuf[8], mpdbuf[9], mpdbuf[10],
        !           932:        mpdbuf[11], mpdbuf[12], mpdbuf[13], mpdbuf[14]
        !           933: #ifndef UNIT16                 /* and not UNIT8 */
        !           934:        ,mpdbuf[15], mpdbuf[16], mpdbuf[17], mpdbuf[18],
        !           935:        mpdbuf[19], mpdbuf[20], mpdbuf[21], mpdbuf[22],
        !           936:        mpdbuf[23], mpdbuf[24], mpdbuf[25], mpdbuf[26],
        !           937:        mpdbuf[27], mpdbuf[28], mpdbuf[29], mpdbuf[30]
        !           938: #endif                         /* UNIT16 and UNIT8 not defined */
        !           939: #endif                         /* UNIT8 not defined */
        !           940:     };
        !           941: 
        !           942:     /* Compute preshifted images of multiplicand, mod n: */
        !           943:     stage_mp_images(mpd, multiplicand);
        !           944: 
        !           945:     /* To optimize msubs, set up msu_prod and nmsu_prod: */
        !           946:     msu_prod = msbptr(prod, global_precision); /* Get ptr to MSU of prod */
        !           947:     nmsu_prod = msu_prod;
        !           948:     post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
        !           949: 
        !           950:     /*
        !           951:      * To understand this algorithm, it would be helpful to first
        !           952:      * study the conventional Russian peasant modmult algorithm.
        !           953:      * This one does about the same thing as Russian peasant, but
        !           954:      * is organized differently to save some steps.  It loops
        !           955:      * through the multiplier a word (unit) at a time, instead of
        !           956:      * a bit at a time.  It word-shifts the product instead of
        !           957:      * bit-shifting it, so it should be faster.  It also does about
        !           958:      * half as many subtracts as Russian peasant.
        !           959:      */
        !           960: 
        !           961:     mp_init(prod, 0);          /* Initialize product to 0. */
        !           962: 
        !           963:     /*
        !           964:      * The way mp_modmult is actually used in cryptographic
        !           965:      * applications, there will NEVER be a zero multiplier or
        !           966:      * multiplicand.  So there is no need to optimize for that
        !           967:      * condition.
        !           968:      */
        !           969: /*      if (testeq(multiplicand,0))
        !           970:        return 0; *//* zero multiplicand means zero product */
        !           971:     /* Normalize and compute number of units in multiplier first: */
        !           972:     normalize(multiplier, mprec);
        !           973:     if (mprec == 0)            /* if precision of multiplier is 0 */
        !           974:        return 0;               /* zero multiplier means zero product */
        !           975:     make_msbptr(multiplier, mprec);    /* start at MSU of multiplier */
1.1.1.6   root      976: 
1.1.1.7 ! root      977:     while (mprec--) { /* Loop for the number of units in the multiplier */
1.1.1.6   root      978:        /*
1.1.1.7 ! root      979:         * Shift the product one whole unit to the left.
        !           980:         * This is part of the multiply phase of modmult.
1.1.1.6   root      981:         */
                    982: 
1.1.1.7 ! root      983:        mp_lshift_unit(prod);
1.1.1.6   root      984: 
1.1.1.7 ! root      985:        /* 
        !           986:         * The product may have grown by as many as UNITSIZE
        !           987:         * bits.  That's why we have global_precision set to the
        !           988:         * size of the modulus plus UNITSIZE slop bits.
        !           989:         * Now reduce the product back down by conditionally
        !           990:         * subtracting the preshifted images of the modulus.
        !           991:         * This is part of the modulo reduction phase of modmult. 
        !           992:         * The following loop is unrolled for speed, using macros...
        !           993: 
        !           994:         for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--)
        !           995:         if (mp_compare(prod,moduli[i]) >= 0)
        !           996:         mp_sub(prod,moduli[i]);
1.1.1.6   root      997:         */
                    998: 
                    999: #ifndef UNIT8
1.1.1.7 ! root     1000: #ifndef UNIT16                 /* and not UNIT8 */
        !          1001:        msubs(31);
        !          1002:        msubs(30);
        !          1003:        msubs(29);
        !          1004:        msubs(28);
        !          1005:        msubs(27);
        !          1006:        msubs(26);
        !          1007:        msubs(25);
        !          1008:        msubs(24);
        !          1009:        msubs(23);
        !          1010:        msubs(22);
        !          1011:        msubs(21);
        !          1012:        msubs(20);
        !          1013:        msubs(19);
        !          1014:        msubs(18);
        !          1015:        msubs(17);
        !          1016:        msubs(16);
        !          1017: #endif                         /* not UNIT16 and not UNIT8 */
        !          1018:        msubs(15);
        !          1019:        msubs(14);
        !          1020:        msubs(13);
        !          1021:        msubs(12);
        !          1022:        msubs(11);
        !          1023:        msubs(10);
        !          1024:        msubs(9);
        !          1025:        msubs(8);
        !          1026: #endif                         /* not UNIT8 */
        !          1027:        msubs(7);
        !          1028:        msubs(6);
        !          1029:        msubs(5);
1.1.1.6   root     1030: #ifndef UNIT32
1.1.1.7 ! root     1031:        msubs(4);
1.1.1.6   root     1032: #ifndef UNIT16
1.1.1.7 ! root     1033:        msubs(3);
1.1.1.6   root     1034: #endif
                   1035: #endif
                   1036: 
1.1.1.7 ! root     1037:        /*
        !          1038:         * Sniff each bit in the current unit of the multiplier, 
        !          1039:         * and conditionally add the corresponding preshifted 
        !          1040:         * image of the multiplicand to the product.
        !          1041:         * This is also part of the multiply phase of modmult.
        !          1042:         *
        !          1043:         * The following loop is unrolled for speed, using macros...
        !          1044: 
        !          1045:         for (i=UNITSIZE-1; i>=0; i--)
        !          1046:         if (*multiplier & power_of_2(i))
        !          1047:         mp_add(prod,mpd[i]);
        !          1048:         */
1.1.1.6   root     1049: #ifndef UNIT8
1.1.1.7 ! root     1050: #ifndef UNIT16                 /* and not UNIT8 */
        !          1051:        sniffadd(31);
        !          1052:        sniffadd(30);
        !          1053:        sniffadd(29);
        !          1054:        sniffadd(28);
        !          1055:        sniffadd(27);
        !          1056:        sniffadd(26);
        !          1057:        sniffadd(25);
        !          1058:        sniffadd(24);
        !          1059:        sniffadd(23);
        !          1060:        sniffadd(22);
        !          1061:        sniffadd(21);
        !          1062:        sniffadd(20);
        !          1063:        sniffadd(19);
        !          1064:        sniffadd(18);
        !          1065:        sniffadd(17);
        !          1066:        sniffadd(16);
        !          1067: #endif                         /* not UNIT16 and not UNIT8 */
        !          1068:        sniffadd(15);
        !          1069:        sniffadd(14);
        !          1070:        sniffadd(13);
        !          1071:        sniffadd(12);
        !          1072:        sniffadd(11);
        !          1073:        sniffadd(10);
        !          1074:        sniffadd(9);
        !          1075:        sniffadd(8);
        !          1076: #endif                         /* not UNIT8 */
        !          1077:        sniffadd(7);
        !          1078:        sniffadd(6);
        !          1079:        sniffadd(5);
        !          1080:        sniffadd(4);
        !          1081:        sniffadd(3);
        !          1082:        sniffadd(2);
        !          1083:        sniffadd(1);
        !          1084:        sniffadd(0);
        !          1085: 
        !          1086:        /*
        !          1087:         * The product may have grown by as many as LOG_UNITSIZE+1
        !          1088:         * bits.
        !          1089:         * Now reduce the product back down by conditionally 
        !          1090:         * subtracting LOG_UNITSIZE+1 preshifted images of the 
        !          1091:         * modulus.  This is the modulo reduction phase of modmult.
        !          1092:         *
        !          1093:         * The following loop is unrolled for speed, using macros...
        !          1094: 
        !          1095:         for (i=LOG_UNITSIZE; i>=0; i--)
        !          1096:         if (mp_compare(prod,moduli[i]) >= 0) 
        !          1097:         mp_sub(prod,moduli[i]); 
        !          1098:         */
1.1.1.6   root     1099: 
                   1100: #ifndef UNIT8
                   1101: #ifndef UNIT16
1.1.1.7 ! root     1102:        msubs(5);
1.1.1.6   root     1103: #endif
1.1.1.7 ! root     1104:        msubs(4);
1.1.1.6   root     1105: #endif
1.1.1.7 ! root     1106:        msubs(3);
        !          1107:        msubs(2);
        !          1108:        msubs(1);
        !          1109:        msubs(0);
1.1.1.6   root     1110: 
1.1.1.7 ! root     1111:        /* Bump pointer to next lower unit of multiplier: */
        !          1112:        post_lowerunit(multiplier);
1.1.1.6   root     1113: 
1.1.1.7 ! root     1114:     }                          /* Loop for the number of units in the
        !          1115:                                   multiplier */
1.1.1.6   root     1116: 
1.1.1.7 ! root     1117:     return 0;                  /* normal return */
1.1.1.6   root     1118: 
1.1.1.7 ! root     1119: }                              /* merritt_modmult */
1.1.1.6   root     1120: 
                   1121: #undef msubs
                   1122: #undef sniffadd
                   1123: 
                   1124: /*
                   1125:  * Merritt's mp_modmult function leaves some internal tables in memory,
                   1126:  * so we have to call modmult_burn() at the end of mp_modexp.
                   1127:  * This is so that no cryptographically sensitive data is left in memory
                   1128:  * after the program exits.
                   1129:  */
1.1.1.7 ! root     1130: 
1.1.1.6   root     1131: /* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
1.1.1.7 ! root     1132: void merritt_burn(void)
1.1.1.6   root     1133: {
1.1.1.7 ! root     1134:     unitfill0(&(mpdbuf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION);
        !          1135:     unitfill0(&(moduli_buf[0][0]), (UNITSIZE - 1) * MAX_UNIT_PRECISION);
        !          1136:     unitfill0(msu_moduli, UNITSIZE);
        !          1137:     unitfill0(nmsu_moduli, UNITSIZE);
        !          1138: }                              /* merritt_burn() */
1.1.1.6   root     1139: 
                   1140: /******* end of Merritt's MODMULT stuff. *******/
                   1141: /*=========================================================================*/
1.1.1.7 ! root     1142: #endif                         /* MERRITT */
1.1.1.6   root     1143: 
                   1144: 
1.1.1.7 ! root     1145: #ifdef UPTON_OR_SMITH          /* Used by Upton's and Smith's
        !          1146:                                   modmult algorithms */
1.1.1.6   root     1147: 
                   1148: /*
                   1149:  * Jimmy Upton's multiprecision modmult algorithm in C.
                   1150:  * Performs a multiply combined with a modulo operation.
                   1151:  *
                   1152:  * The following support functions and data structures
                   1153:  * are used only by Upton's modmult algorithm...
                   1154:  */
                   1155: 
1.1.1.7 ! root     1156: short munit_prec;              /* global_precision expressed in MULTUNITs */
1.1.1.6   root     1157: 
                   1158: /*
                   1159:  * Note that since the SPARC CPU has no hardware integer multiply
                   1160:  * instruction, there is not that much advantage in having an
                   1161:  * assembly version of mp_smul on that machine.  It might be faster
                   1162:  * to use Merritt's modmult instead of Upton's modmult on the SPARC.
                   1163:  */
                   1164: 
                   1165: /*
                   1166:  * Multiply the single-word multiplier times the multiprecision integer
                   1167:  * in multiplicand, accumulating result in prod.  The resulting
                   1168:  * multiprecision prod will be 1 word longer than the multiplicand.
                   1169:  * multiplicand is munit_prec words long.  We add into prod, so caller
                   1170:  * should zero it out first.  For best results, this time-critical
                   1171:  * function should be implemented in assembly.
                   1172:  * NOTE:  Unlike other functions in the multiprecision arithmetic
                   1173:  * library, both multiplicand and prod are pointing at the LSB,
                   1174:  * regardless of byte order of the machine.  On an 80x86, this makes
                   1175:  * no difference.  But if this assembly function is implemented
                   1176:  * on a 680x0, it becomes important.
                   1177:  */
                   1178: 
                   1179: /*
                   1180:  * Note that this has been modified from the previous version to allow
                   1181:  * better support for Smith's modmult:
1.1.1.7 ! root     1182:  *      The final carry bit is added to the existing product
        !          1183:  *      array, rather than simply stored.
        !          1184:  */
1.1.1.6   root     1185: 
                   1186: #ifndef mp_smul
1.1.1.7 ! root     1187: void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier)
1.1.1.6   root     1188: {
1.1.1.7 ! root     1189:     short i;
        !          1190:     unsigned long p, carry;
        !          1191: 
        !          1192:     carry = 0;
        !          1193:     for (i = 0; i < munit_prec; ++i) {
        !          1194:        p = (unsigned long) multiplier **post_higherunit(multiplicand);
        !          1195:        p += *prod + carry;
        !          1196:        *post_higherunit(prod) = (MULTUNIT) p;
        !          1197:        carry = p >> MULTUNITSIZE;
        !          1198:     }
        !          1199:     /* Add carry to the next higher word of product / dividend */
        !          1200:     *prod += (MULTUNIT) carry;
        !          1201: }                              /* mp_smul */
1.1.1.6   root     1202: 
                   1203: #endif
                   1204: 
                   1205: /*
                   1206:  * mp_dmul is a double-precision multiply multiplicand times multiplier,
                   1207:  * result into prod.  prod must be pointing at a "double precision"
                   1208:  * buffer.  E.g. If global_precision is 10 words, prod must be
                   1209:  * pointing at a 20-word buffer.
                   1210:  */
                   1211: #ifndef mp_dmul
1.1.1.7 ! root     1212: void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier)
1.1.1.6   root     1213: {
1.1.1.7 ! root     1214:     register int i;
        !          1215:     register MULTUNIT *p_multiplicand, *p_multiplier;
        !          1216:     register MULTUNIT *prodp;
        !          1217: 
        !          1218: 
        !          1219:     unitfill0(prod, global_precision * 2);     /* Pre-zero prod */
        !          1220:     /* Calculate precision in units of MULTUNIT */
        !          1221:     munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
        !          1222:     p_multiplicand = (MULTUNIT *) multiplicand;
        !          1223:     p_multiplier = (MULTUNIT *) multiplier;
        !          1224:     prodp = (MULTUNIT *) prod;
        !          1225:     make_lsbptr(p_multiplicand, munit_prec);
        !          1226:     make_lsbptr(p_multiplier, munit_prec);
        !          1227:     make_lsbptr(prodp, munit_prec * 2);
        !          1228:     /* Multiply multiplicand by each word in multiplier, accumulating prod: */
        !          1229:     for (i = 0; i < munit_prec; ++i)
        !          1230:        mp_smul(post_higherunit(prodp),
        !          1231:                p_multiplicand, *post_higherunit(p_multiplier));
        !          1232: }                              /* mp_dmul */
        !          1233: #endif                         /* mp_dmul */
1.1.1.6   root     1234: 
                   1235: static unit ALIGN modulus[MAX_UNIT_PRECISION];
1.1.1.7 ! root     1236: static short nbits;            /* number of modulus significant bits */
        !          1237: #endif                         /* UPTON_OR_SMITH */
1.1.1.6   root     1238: 
                   1239: #ifdef UPTON
                   1240: 
                   1241: /*
                   1242:  * These scratchpad arrays are used only by upton_modmult (mp_modmult).
                   1243:  * Some of them could be staticly declared inside of mp_modmult, but we
                   1244:  * put them outside mp_modmult so that they can be wiped clean by
                   1245:  * modmult_burn(), which is called at the end of mp_modexp.  This is
                   1246:  * so that no sensitive data is left in memory after the program exits.
                   1247:  */
                   1248: 
                   1249: static unit ALIGN reciprocal[MAX_UNIT_PRECISION];
                   1250: static unit ALIGN dhi[MAX_UNIT_PRECISION];
1.1.1.7 ! root     1251: static unit ALIGN d_data[MAX_UNIT_PRECISION * 2];      /* double-wide
        !          1252:                                                           register */
        !          1253: static unit ALIGN e_data[MAX_UNIT_PRECISION * 2];      /* double-wide
        !          1254:                                                           register */
        !          1255: static unit ALIGN f_data[MAX_UNIT_PRECISION * 2];      /* double-wide
        !          1256:                                                           register */
1.1.1.6   root     1257: 
                   1258: static short nbitsDivUNITSIZE;
                   1259: static short nbitsModUNITSIZE;
                   1260: 
                   1261: /*
                   1262:  * stage_upton_modulus() is aliased to stage_modulus().
                   1263:  * Prepare for an Upton modmult.  Calculate the reciprocal of modulus,
                   1264:  * and save both.  Note that reciprocal will have one more bit than
                   1265:  * modulus.
                   1266:  * Assumes that global_precision has already been adjusted to the
                   1267:  * size of the modulus, plus SLOP_BITS.
                   1268:  */
                   1269: int stage_upton_modulus(unitptr n)
                   1270: {
1.1.1.7 ! root     1271:     mp_move(modulus, n);
        !          1272:     mp_recip(reciprocal, modulus);
        !          1273:     nbits = countbits(modulus);
        !          1274:     nbitsDivUNITSIZE = nbits / UNITSIZE;
        !          1275:     nbitsModUNITSIZE = nbits % UNITSIZE;
        !          1276:     return 0;                  /* normal return */
        !          1277: }                              /* stage_upton_modulus */
1.1.1.6   root     1278: 
                   1279: 
                   1280: /*
                   1281:  * Upton's algorithm performs a multiply combined with a modulo operation.
                   1282:  * Computes:  prod = (multiplicand*multiplier) mod modulus
                   1283:  * WARNING: All the arguments must be less than the modulus!
                   1284:  * References global unitptr modulus and reciprocal.
                   1285:  * The reciprocal of modulus is 1 bit longer than the modulus.
                   1286:  * upton_modmult() is aliased to mp_modmult().
                   1287:  */
1.1.1.7 ! root     1288: int upton_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
1.1.1.6   root     1289: {
1.1.1.7 ! root     1290:     unitptr d = d_data;
        !          1291:     unitptr d1 = d_data;
        !          1292:     unitptr e = e_data;
        !          1293:     unitptr f = f_data;
        !          1294:     short orig_precision;
1.1.1.6   root     1295: 
1.1.1.7 ! root     1296:     orig_precision = global_precision;
        !          1297:     mp_dmul(d, multiplicand, multiplier);
1.1.1.6   root     1298: 
1.1.1.7 ! root     1299:     /* Throw off low nbits of d */
1.1.1.6   root     1300: #ifndef HIGHFIRST
1.1.1.7 ! root     1301:     d1 = d + nbitsDivUNITSIZE;
1.1.1.6   root     1302: #else
1.1.1.7 ! root     1303:     d1 = d + orig_precision - nbitsDivUNITSIZE;
1.1.1.6   root     1304: #endif
1.1.1.7 ! root     1305:     mp_move(dhi, d1);          /* Don't screw up d, we need it later */
        !          1306:     mp_shift_right_bits(dhi, nbitsModUNITSIZE);
1.1.1.6   root     1307: 
1.1.1.7 ! root     1308:     mp_dmul(e, dhi, reciprocal); /* Note - reciprocal has nbits+1 bits */
1.1.1.6   root     1309: 
                   1310: #ifndef HIGHFIRST
1.1.1.7 ! root     1311:     e += nbitsDivUNITSIZE;
1.1.1.6   root     1312: #else
1.1.1.7 ! root     1313:     e += orig_precision - nbitsDivUNITSIZE;
1.1.1.6   root     1314: #endif
1.1.1.7 ! root     1315:     mp_shift_right_bits(e, nbitsModUNITSIZE);
1.1.1.6   root     1316: 
1.1.1.7 ! root     1317:     mp_dmul(f, e, modulus);
1.1.1.6   root     1318: 
1.1.1.7 ! root     1319:     /* Now for the only double-precision call to mpilib: */
        !          1320:     set_precision(orig_precision * 2);
        !          1321:     mp_sub(d, f);
        !          1322: 
        !          1323:     /* d's precision should be <= orig_precision */
        !          1324:     rescale(d, orig_precision * 2, orig_precision);
        !          1325:     set_precision(orig_precision);
        !          1326: 
        !          1327:     /* Should never have to do this final subtract more than twice: */
        !          1328:     while (mp_compare(d, modulus) > 0)
        !          1329:        mp_sub(d, modulus);
        !          1330: 
        !          1331:     mp_move(prod, d);
        !          1332:     return 0;                  /* normal return */
        !          1333: }                              /* upton_modmult */
        !          1334: 
        !          1335: /* Upton's mp_modmult function leaves some internal arrays in memory,
        !          1336:    so we have to call modmult_burn() at the end of mp_modexp.
        !          1337:    This is so that no cryptographically sensitive data is left in memory
        !          1338:    after the program exits.
        !          1339:    upton_burn() is aliased to modmult_burn().
        !          1340:  */
1.1.1.6   root     1341: void upton_burn(void)
                   1342: {
1.1.1.7 ! root     1343:     unitfill0(modulus, MAX_UNIT_PRECISION);
        !          1344:     unitfill0(reciprocal, MAX_UNIT_PRECISION);
        !          1345:     unitfill0(dhi, MAX_UNIT_PRECISION);
        !          1346:     unitfill0(d_data, MAX_UNIT_PRECISION * 2);
        !          1347:     unitfill0(e_data, MAX_UNIT_PRECISION * 2);
        !          1348:     unitfill0(f_data, MAX_UNIT_PRECISION * 2);
        !          1349:     nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
        !          1350: }                              /* upton_burn */
1.1.1.6   root     1351: 
                   1352: /******* end of Upton's MODMULT stuff. *******/
                   1353: /*=========================================================================*/
1.1.1.7 ! root     1354: #endif                         /* UPTON */
1.1.1.6   root     1355: 
1.1.1.7 ! root     1356: #ifdef SMITH                   /* using Thad Smith's modmult algorithm */
1.1.1.6   root     1357: 
                   1358: /*
                   1359:  * Thad Smith's implementation of multiprecision modmult algorithm in C.
                   1360:  * Performs a multiply combined with a modulo operation.
                   1361:  * The multiplication is done with mp_dmul, the same as for Upton's
                   1362:  * modmult.  The modulus reduction is done by long division, in
                   1363:  * which a trial quotient "digit" is determined, then the product of
                   1364:  * that digit and the divisor are subtracted from the dividend.
                   1365:  *
                   1366:  * In this case, the digit is MULTUNIT in size and the subtraction
                   1367:  * is done by adding the product to the one's complement of the
                   1368:  * dividend, which allows use of the existing mp_smul routine.
                   1369:  *
                   1370:  * The following support functions and data structures
                   1371:  * are used only by Smith's modmult algorithm...
                   1372:  */
                   1373: 
                   1374: /*
                   1375:  * These scratchpad arrays are used only by smith_modmult (mp_modmult).
                   1376:  * Some of them could be statically declared inside of mp_modmult, but we
                   1377:  * put them outside mp_modmult so that they can be wiped clean by
                   1378:  * modmult_burn(), which is called at the end of mp_modexp.  This is
                   1379:  * so that no sensitive data is left in memory after the program exits.
                   1380:  */
                   1381: 
1.1.1.7 ! root     1382: static unit ALIGN ds_data[MAX_UNIT_PRECISION * 2 + 2];
1.1.1.6   root     1383: 
1.1.1.7 ! root     1384: static unit mod_quotient[4];
        !          1385: static unit mod_divisor[4];    /* 2 most signif. MULTUNITs of modulus */
1.1.1.6   root     1386: 
1.1.1.7 ! root     1387: static MULTUNIT *modmpl;       /* ptr to modulus least significant
        !          1388:                                   ** MULTUNIT                      */
        !          1389: static int mshift;             /* number of bits for
        !          1390:                                   ** recip scaling       */
        !          1391: static MULTUNIT reciph;                /* MSunit of scaled recip */
        !          1392: static MULTUNIT recipl;                /* LSunit of scaled recip */
1.1.1.6   root     1393: 
1.1.1.7 ! root     1394: static short modlenMULTUNITS;  /* length of modulus in MULTUNITs */
        !          1395: static MULTUNIT mutemp;                /* temporary */
1.1.1.6   root     1396: 
                   1397: /*
                   1398:  * The routines mp_smul and mp_dmul are the same as for UPTON and
                   1399:  * should be coded in assembly.  Note, however, that the previous
                   1400:  * Upton's mp_smul version has been modified to compatible with
                   1401:  * Smith's modmult.  The new version also still works for Upton's
                   1402:  * modmult.
                   1403:  */
                   1404: 
                   1405: #ifndef mp_set_recip
1.1.1.7 ! root     1406: #define mp_set_recip(rh,rl,m)  /* null */
1.1.1.6   root     1407: #else
                   1408: /* setup routine for external mp_quo_digit */
                   1409: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m);
                   1410: #endif
1.1.1.7 ! root     1411: MULTUNIT mp_quo_digit(MULTUNIT * dividend);
1.1.1.6   root     1412: 
                   1413: #ifdef MULTUNIT_SIZE_SAME
                   1414: #define mp_musubb mp_subb      /* use existing routine */
1.1.1.7 ! root     1415: #else                          /* ! MULTUNIT_SIZE_SAME */
1.1.1.6   root     1416: 
                   1417: /*
                   1418:  * This performs the same function as mp_subb, but with MULTUNITs.
                   1419:  * Note: Processors without alignment requirements may be able to use
                   1420:  * mp_subb, even though MULTUNITs are smaller than units.  In that case,
                   1421:  * use mp_subb, since it would be faster if coded in assembly.  Note that
                   1422:  * this implementation won't work for MULTUNITs which are long -- use
                   1423:  * mp_subb in that case.
                   1424:  */
                   1425: #ifndef mp_musubb
1.1.1.7 ! root     1426: 
1.1.1.6   root     1427: /*
                   1428:  * multiprecision subtract of MULTUNITs with borrow, r2 from r1, result in r1
                   1429:  * borrow is incoming borrow flag-- value should be 0 or 1
                   1430:  */
1.1.1.7 ! root     1431: boolean mp_musubb
        !          1432:  (register MULTUNIT * r1, register MULTUNIT * r2, register boolean borrow)
1.1.1.6   root     1433: {
1.1.1.7 ! root     1434:     register ulint x;          /* won't work if sizeof(MULTUNIT)==
        !          1435:                                   sizeof(long)     */
        !          1436:     short precision;           /* number of MULTUNITs to subtract */
        !          1437:     precision = global_precision * MULTUNITs_perunit;
        !          1438:     make_lsbptr(r1, precision);
        !          1439:     make_lsbptr(r2, precision);
        !          1440:     while (precision--) {
        !          1441:        x = (ulint) * r1 - (ulint) * post_higherunit(r2) - (ulint) borrow;
        !          1442:        *post_higherunit(r1) = x;
        !          1443:        borrow = (((1L << MULTUNITSIZE) & x) != 0L);
        !          1444:     }
        !          1445:     return (borrow);
        !          1446: }                              /* mp_musubb */
        !          1447: #endif                         /* mp_musubb */
        !          1448: #endif                         /* MULTUNIT_SIZE_SAME */
1.1.1.6   root     1449: 
                   1450: /*
                   1451:  * The function mp_quo_digit is the heart of Smith's modulo reduction,
                   1452:  * which uses a form of long division.  It computes a trial quotient
                   1453:  * "digit" (MULTUNIT-sized digit) by multiplying the three most
                   1454:  * significant MULTUNITs of the dividend by the two most significant
                   1455:  * MULTUNITs of the reciprocal of the modulus.  Note that this function
                   1456:  * requires that MULTUNITSIZE * 2 <= sizeof(unsigned long).
                   1457:  *
                   1458:  * An important part of this technique is that the quotient never be
                   1459:  * too small, although it may occasionally be too large.  This was
                   1460:  * done to eliminate the need to check and correct for a remainder
1.1.1.7 ! root     1461:  * exceeding the divisor.       It is easier to check for a negative
1.1.1.6   root     1462:  * remainder.  The following technique rarely needs correction for
                   1463:  * MULTUNITs of at least 16 bits.
                   1464:  *
                   1465:  * The following routine has two implementations:
                   1466:  *
                   1467:  * ASM_PROTOTYPE defined: written to be an executable prototype for
                   1468:  * an efficient assembly language implementation.  Note that several
                   1469:  * of the following masks and shifts can be done by directly
                   1470:  * manipulating single precision registers on some architectures.
                   1471:  *
                   1472:  * ASM_PROTOTYPE undefined: a slightly more efficient implementation
                   1473:  * in C.  Although this version returns a result larger than the
                   1474:  * optimum (which is corrected in smith_modmult) more often than the
                   1475:  * prototype version, the faster execution in C more than makes up
                   1476:  * for the difference.
                   1477:  * 
                   1478:  * Parameter: dividend - points to the most significant MULTUNIT
1.1.1.7 ! root     1479:  *      of the dividend.  Note that dividend actually contains the 
        !          1480:  *      one's complement of the actual dividend value (see comments for 
        !          1481:  *      smith_modmult).
1.1.1.6   root     1482:  * 
                   1483:  *  Return: the trial quotient digit resulting from dividing the first
1.1.1.7 ! root     1484:  *      three MULTUNITs at dividend by the upper two MULTUNITs of the 
        !          1485:  *      modulus.
1.1.1.6   root     1486:  */
                   1487: 
1.1.1.7 ! root     1488: /* #define ASM_PROTOTYPE *//* undefined: use C-optimized version */
1.1.1.6   root     1489: 
                   1490: #ifndef mp_quo_digit
1.1.1.7 ! root     1491: MULTUNIT mp_quo_digit(MULTUNIT * dividend)
        !          1492: {
        !          1493:     unsigned long q, q0, q1, q2;
        !          1494:     unsigned short lsb_factor;
1.1.1.6   root     1495: 
                   1496: /*
                   1497:  * Compute the least significant product group.
1.1.1.7 ! root     1498:  * The last terms of q1 and q2 perform upward rounding, which is
        !          1499:  * needed to guarantee that the result not be too small.
1.1.1.6   root     1500:  */
1.1.1.7 ! root     1501:     q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long) reciph
        !          1502:        + reciph;
        !          1503:     q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) recipl
        !          1504:        + (1L << MULTUNITSIZE);
1.1.1.6   root     1505: #ifdef ASM_PROTOTYPE
1.1.1.7 ! root     1506:     lsb_factor = 1 & (q1 >> MULTUNITSIZE) & (q2 >> MULTUNITSIZE);
        !          1507:     q = q1 + q2;
1.1.1.6   root     1508: 
1.1.1.7 ! root     1509:     /* The following statement is equivalent to shifting the sum right
        !          1510:        one bit while shifting in the carry bit.
        !          1511:      */
        !          1512:     q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
        !          1513: #else                          /* optimized C version */
        !          1514:     q0 = (q1 >> 1) + (q2 >> 1) + 1;
1.1.1.6   root     1515: #endif
                   1516: 
1.1.1.7 ! root     1517: /*      Compute the middle significant product group.   */
1.1.1.6   root     1518: 
1.1.1.7 ! root     1519:     q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long) reciph;
        !          1520:     q2 = (dividend[0] ^ MULTUNIT_mask) * (unsigned long) recipl;
1.1.1.6   root     1521: #ifdef ASM_PROTOTYPE
1.1.1.7 ! root     1522:     q = q1 + q2;
        !          1523:     q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
1.1.1.6   root     1524: 
1.1.1.7 ! root     1525: /*      Add in the most significant word of the first group.
        !          1526:    The last term takes care of the carry from adding the lsb's
        !          1527:    that were shifted out prior to addition.
        !          1528:  */
        !          1529:     q = (q0 >> MULTUNITSIZE) + q + (lsb_factor & (q1 ^ q2));
        !          1530: #else                          /* optimized C version */
        !          1531:     q = (q0 >> MULTUNITSIZE) + (q1 >> 1) + (q2 >> 1) + 1;
1.1.1.6   root     1532: #endif
                   1533: 
1.1.1.7 ! root     1534: /*      Compute the most significant term and add in the others */
1.1.1.6   root     1535: 
1.1.1.7 ! root     1536:     q = (q >> (MULTUNITSIZE - 2)) +
        !          1537:        (((dividend[0] ^ MULTUNIT_mask) * (unsigned long) reciph) << 1);
        !          1538:     q >>= mshift;
        !          1539: 
        !          1540: /*      Prevent overflow and then wipe out the intermediate results. */
        !          1541: 
        !          1542:     mutemp = (MULTUNIT) min(q, (1L << MULTUNITSIZE) - 1);
        !          1543:     q = q0 = q1 = q2 = 0;
        !          1544:     lsb_factor = 0;
        !          1545:     (void) lsb_factor;
        !          1546:     return mutemp;
1.1.1.6   root     1547: }
1.1.1.7 ! root     1548: #endif                         /* mp_quo_digit */
1.1.1.6   root     1549: 
                   1550: /*
                   1551:  * stage_smith_modulus() - Prepare for a Smith modmult.
                   1552:  * 
                   1553:  * Calculate the reciprocal of modulus with a precision of two MULTUNITs.
                   1554:  * Assumes that global_precision has already been adjusted to the
                   1555:  * size of the modulus, plus SLOP_BITS.
                   1556:  * 
                   1557:  * Note: This routine was designed to work with large values and
                   1558:  * doesn't have the necessary testing or handling to work with a
                   1559:  * modulus having less than three significant units.  For such cases,
                   1560:  * the separate multiply and modulus routines can be used.
                   1561:  * 
                   1562:  * stage_smith_modulus() is aliased to stage_modulus().
                   1563:  */
                   1564: int stage_smith_modulus(unitptr n_modulus)
                   1565: {
1.1.1.7 ! root     1566:     int original_precision;
        !          1567:     int sigmod;                        /* significant units in modulus */
        !          1568:     unitptr mp;                        /* modulus most significant pointer */
        !          1569:     MULTUNIT *mpm;             /* reciprocal pointer */
        !          1570:     int prec;                  /* precision of reciprocal calc in units */
        !          1571: 
        !          1572:     mp_move(modulus, n_modulus);
        !          1573:     modmpl = (MULTUNIT *) modulus;
        !          1574:     modmpl = lsbptr(modmpl, global_precision * MULTUNITs_perunit);
        !          1575:     nbits = countbits(modulus);
        !          1576:     modlenMULTUNITS = (nbits + MULTUNITSIZE - 1) / MULTUNITSIZE;
        !          1577: 
        !          1578:     original_precision = global_precision;
        !          1579: 
        !          1580:     /* The following code copies the three most significant units of
        !          1581:      * modulus to mod_divisor.
        !          1582:      */
        !          1583:     mp = modulus;
        !          1584:     sigmod = significance(modulus);
        !          1585:     rescale(mp, original_precision, sigmod);
1.1.1.6   root     1586: /* prec is the unit precision required for 3 MULTUNITs */
1.1.1.7 ! root     1587:     prec = (3 + (MULTUNITs_perunit - 1)) / MULTUNITs_perunit;
        !          1588:     set_precision(prec);
1.1.1.6   root     1589: 
1.1.1.7 ! root     1590:     /* set mp = ptr to most significant units of modulus, then move
        !          1591:      * the most significant units to mp_divisor 
        !          1592:      */
        !          1593:     mp = msbptr(mp, sigmod) - tohigher(prec - 1);
        !          1594:     unmake_lsbptr(mp, prec);
        !          1595:     mp_move(mod_divisor, mp);
        !          1596: 
        !          1597:     /* Keep 2*MULTUNITSIZE bits in mod_divisor.
        !          1598:      * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits.
        !          1599:      */
        !          1600:     mshift = countbits(mod_divisor) - 2 * MULTUNITSIZE;
        !          1601:     mp_shift_right_bits(mod_divisor, mshift);
        !          1602:     mp_recip(mod_quotient, mod_divisor);
        !          1603:     mp_shift_right_bits(mod_quotient, 1);
        !          1604: 
        !          1605:     /* Reduce to:   0 < mshift <= MULTUNITSIZE */
        !          1606:     mshift = ((mshift + (MULTUNITSIZE - 1)) % MULTUNITSIZE) + 1;
        !          1607:     /* round up, rescaling if necessary */
        !          1608:     mp_inc(mod_quotient);
        !          1609:     if (countbits(mod_quotient) > 2 * MULTUNITSIZE) {
        !          1610:        mp_shift_right_bits(mod_quotient, 1);
        !          1611:        mshift--;               /* now  0 <= mshift <= MULTUNITSIZE */
        !          1612:     }
        !          1613:     mpm = lsbptr((MULTUNIT *) mod_quotient, prec * MULTUNITs_perunit);
        !          1614:     recipl = *post_higherunit(mpm);
        !          1615:     reciph = *mpm;
        !          1616:     mp_set_recip(reciph, recipl, mshift);
        !          1617:     set_precision(original_precision);
        !          1618:     return 0;                  /* normal return */
        !          1619: }                              /* stage_smith_modulus */
        !          1620: 
        !          1621: /* Smith's algorithm performs a multiply combined with a modulo operation.
        !          1622:    Computes:  prod = (multiplicand*multiplier) mod modulus
        !          1623:    The modulus must first be set by stage_smith_modulus().
        !          1624:    smith_modmult() is aliased to mp_modmult().
        !          1625:  */
        !          1626: int smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
        !          1627: {
        !          1628:     unitptr d;                 /* ptr to product */
        !          1629:     MULTUNIT *dmph, *dmpl, *dmp;       /* ptrs to dividend (high, low, first)
        !          1630:                                         * aligned for subtraction         */
        !          1631: /* Note that dmph points one MULTUNIT higher than indicated by
        !          1632:    global precision.  This allows us to zero out a word one higher than
        !          1633:    the normal precision.
        !          1634:  */
        !          1635:     short orig_precision;
        !          1636:     short nqd;                 /* number of quotient digits remaining to
        !          1637:                                 * be generated                            */
        !          1638:     short dmi;                 /* number of significant MULTUNITs in
        !          1639:                                   product */
        !          1640: 
        !          1641:     d = ds_data + 1;           /* room for leading MSB if HIGHFIRST */
        !          1642:     orig_precision = global_precision;
        !          1643:     mp_dmul(d, multiplicand, multiplier);
        !          1644: 
        !          1645:     rescale(d, orig_precision * 2, orig_precision * 2 + 1);
        !          1646:     set_precision(orig_precision * 2 + 1);
        !          1647:     *msbptr(d, global_precision) = 0;  /* leading 0 unit */
        !          1648: 
        !          1649: /*      We now start working with MULTUNITs.
        !          1650:    Determine the most significant MULTUNIT of the product so we don't
        !          1651:    have to process leading zeros in our divide loop.
        !          1652:  */
        !          1653:     dmi = significance(d) * MULTUNITs_perunit;
        !          1654:     if (dmi >= modlenMULTUNITS) {
        !          1655:        /* Make dividend negative.  This allows the use of mp_smul to
        !          1656:         * "subtract" the product of the modulus and the trial divisor
        !          1657:         * by actually adding to a negative dividend.
        !          1658:         * The one's complement of the dividend is used, since it causes
        !          1659:         * a zero value to be represented as all ones.  This facilitates
        !          1660:         * testing the result for possible overflow, since a sign bit
        !          1661:         * indicates that no adjustment is necessary, and we should not
        !          1662:         * attempt to adjust if the result of the addition is zero.
        !          1663:         */
        !          1664:        mp_inc(d);
        !          1665:        mp_neg(d);
        !          1666:        set_precision(orig_precision);
        !          1667:        munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
        !          1668: 
        !          1669:        /* Set msb, lsb, and normal ptrs of dividend */
        !          1670:        dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) *
        !          1671:                      MULTUNITs_perunit) + tohigher(dmi);
        !          1672:        nqd = dmi + 1 - modlenMULTUNITS;
        !          1673:        dmpl = dmph - tohigher(modlenMULTUNITS);
1.1.1.6   root     1674: 
1.1.1.7 ! root     1675: /*      
1.1.1.6   root     1676:  * Divide loop.
                   1677:  * Each iteration computes the next quotient MULTUNIT digit, then
                   1678:  * multiplies the divisor (modulus) by the quotient digit and adds
                   1679:  * it to the one's complement of the dividend (equivalent to
                   1680:  * subtracting).  If the product was greater than the remaining dividend,
                   1681:  * we get a non-negative result, in which case we subtract off the
                   1682:  * modulus to get the proper negative remainder.
                   1683:  */
1.1.1.7 ! root     1684:        for (; nqd; nqd--) {
        !          1685:            MULTUNIT q;         /* quotient trial digit */
1.1.1.6   root     1686: 
1.1.1.7 ! root     1687:            q = mp_quo_digit(dmph);
        !          1688:            if (q > 0) {
        !          1689:                mp_smul(dmpl, modmpl, q);
1.1.1.6   root     1690: 
1.1.1.7 ! root     1691:                /* Perform correction if q too large.
        !          1692:                   *  This rarely occurs.
        !          1693:                 */
        !          1694:                if (!(*dmph & MULTUNIT_msb)) {
        !          1695:                    dmp = dmpl;
        !          1696:                    unmake_lsbptr(dmp, orig_precision *
        !          1697:                                  MULTUNITs_perunit);
        !          1698:                    if (mp_musubb(dmp,
        !          1699:                                  (MULTUNIT *) modulus, 0))
        !          1700:                        (*dmph)--;
        !          1701:                }
        !          1702:            }
        !          1703:            pre_lowerunit(dmph);
        !          1704:            pre_lowerunit(dmpl);
        !          1705:        }
        !          1706:        /* d contains the one's complement of the remainder. */
        !          1707:        rescale(d, orig_precision * 2 + 1, orig_precision);
        !          1708:        set_precision(orig_precision);
        !          1709:        mp_neg(d);
        !          1710:        mp_dec(d);
        !          1711:     } else {
        !          1712:        /* Product was less than modulus.  Return it. */
        !          1713:        rescale(d, orig_precision * 2 + 1, orig_precision);
        !          1714:        set_precision(orig_precision);
        !          1715:     }
        !          1716:     mp_move(prod, d);
        !          1717:     return (0);                        /* normal return */
        !          1718: }                              /* smith_modmult */
        !          1719: 
        !          1720: /* Smith's mp_modmult function leaves some internal arrays in memory,
        !          1721:    so we have to call modmult_burn() at the end of mp_modexp.
        !          1722:    This is so that no cryptographically sensitive data is left in memory
        !          1723:    after the program exits.
        !          1724:    smith_burn() is aliased to modmult_burn().
        !          1725:  */
1.1.1.6   root     1726: void smith_burn(void)
                   1727: {
1.1.1.7 ! root     1728:     empty_array(modulus);
        !          1729:     empty_array(ds_data);
        !          1730:     empty_array(mod_quotient);
        !          1731:     empty_array(mod_divisor);
        !          1732:     modmpl = 0;
        !          1733:     mshift = nbits = 0;
        !          1734:     reciph = recipl = 0;
        !          1735:     modlenMULTUNITS = mutemp = 0;
        !          1736:     mp_set_recip(0, 0, 0);
        !          1737: }                              /* smith_burn */
1.1.1.6   root     1738: 
1.1.1.7 ! root     1739: /*      End of Thad Smith's implementation of modmult. */
1.1.1.6   root     1740: 
1.1.1.7 ! root     1741: #endif                         /* SMITH */
1.1.1.6   root     1742: 
1.1.1.7 ! root     1743: /* Returns number of significant bits in r */
1.1.1.6   root     1744: int countbits(unitptr r)
                   1745: {
1.1.1.7 ! root     1746:     int bits;
        !          1747:     short prec;
        !          1748:     register unit bitmask;
        !          1749:     init_bitsniffer(r, bitmask, prec, bits);
        !          1750:     return bits;
        !          1751: }                              /* countbits */
1.1.1.6   root     1752: 
                   1753: char *copyright_notice(void)
                   1754: /* force linker to include copyright notice in the executable object image. */
1.1.1.7 ! root     1755: {
        !          1756:     return ("(c)1986 Philip Zimmermann");
        !          1757: }                              /* copyright_notice */
1.1.1.6   root     1758: 
                   1759: /*
                   1760:  * Russian peasant combined exponentiation/modulo algorithm.
                   1761:  * Calls modmult instead of mult.
                   1762:  * Computes:  expout = (expin**exponent) mod modulus
                   1763:  * WARNING: All the arguments must be less than the modulus!
                   1764:  */
1.1.1.7 ! root     1765: int mp_modexp(register unitptr expout, register unitptr expin,
        !          1766:              register unitptr exponent, register unitptr modulus)
1.1.1.6   root     1767: {
1.1.1.7 ! root     1768:     int bits;
        !          1769:     short oldprecision;
        !          1770:     register unit bitmask;
        !          1771:     unit product[MAX_UNIT_PRECISION];
        !          1772:     short eprec;
1.1.1.6   root     1773: 
                   1774: #ifdef COUNTMULTS
1.1.1.7 ! root     1775:     tally_modmults = 0;                /* clear "number of modmults" counter */
        !          1776:     tally_modsquares = 0;      /* clear "number of modsquares" counter */
        !          1777: #endif                         /* COUNTMULTS */
        !          1778:     mp_init(expout, 1);
        !          1779:     if (testeq(exponent, 0)) {
        !          1780:        if (testeq(expin, 0))
        !          1781:            return -1;          /* 0 to the 0th power means return error */
        !          1782:        return 0;               /* otherwise, zero exponent means expout
        !          1783:                                   is 1 */
        !          1784:     }
        !          1785:     if (testeq(modulus, 0))
        !          1786:        return -2;              /* zero modulus means error */
        !          1787: #if SLOP_BITS > 0              /* if there's room for sign bits */
        !          1788:     if (mp_tstminus(modulus))
        !          1789:        return -2;              /* negative modulus means error */
        !          1790: #endif                         /* SLOP_BITS > 0 */
        !          1791:     if (mp_compare(expin, modulus) >= 0)
        !          1792:        return -3;              /* if expin >= modulus, return error */
        !          1793:     if (mp_compare(exponent, modulus) >= 0)
        !          1794:        return -4;              /* if exponent >= modulus, return error */
        !          1795: 
        !          1796:     oldprecision = global_precision;   /* save global_precision */
        !          1797:     /* set smallest optimum precision for this modulus */
        !          1798:     set_precision(bits2units(countbits(modulus) + SLOP_BITS));
        !          1799:     /* rescale all these registers to global_precision we just defined */
        !          1800:     rescale(modulus, oldprecision, global_precision);
        !          1801:     rescale(expin, oldprecision, global_precision);
        !          1802:     rescale(exponent, oldprecision, global_precision);
        !          1803:     rescale(expout, oldprecision, global_precision);
1.1.1.6   root     1804: 
1.1.1.7 ! root     1805:     if (stage_modulus(modulus)) {
        !          1806:        set_precision(oldprecision);    /* restore original precision */
        !          1807:        return -5;              /* unstageable modulus (STEWART algorithm) */
        !          1808:     }
        !          1809:     /* normalize and compute number of bits in exponent first */
        !          1810:     init_bitsniffer(exponent, bitmask, eprec, bits);
        !          1811: 
        !          1812:     /* We can "optimize out" the first modsquare and modmult: */
        !          1813:     bits--;                    /* We know for sure at this point that
        !          1814:                                   bits>0 */
        !          1815:     mp_move(expout, expin);    /*  expout = (1*1)*expin; */
        !          1816:     bump_bitsniffer(exponent, bitmask);
        !          1817: 
        !          1818:     while (bits--) {
        !          1819:        poll_for_break();       /* polls keyboard, allows ctrl-C to
        !          1820:                                   abort program */
1.1.1.6   root     1821: #ifdef COUNTMULTS
1.1.1.7 ! root     1822:        tally_modsquares++;     /* bump "number of modsquares" counter */
        !          1823: #endif                         /* COUNTMULTS */
        !          1824:        mp_modsquare(product, expout);
        !          1825:        if (sniff_bit(exponent, bitmask)) {
        !          1826:            mp_modmult(expout, product, expin);
1.1.1.6   root     1827: #ifdef COUNTMULTS
1.1.1.7 ! root     1828:            tally_modmults++;   /* bump "number of modmults" counter */
        !          1829: #endif                         /* COUNTMULTS */
        !          1830:        } else {
        !          1831:            mp_move(expout, product);
1.1.1.6   root     1832:        }
1.1.1.7 ! root     1833:        bump_bitsniffer(exponent, bitmask);
        !          1834:     }                          /* while bits-- */
        !          1835:     mp_burn(product);          /* burn the evidence on the stack */
        !          1836:     modmult_burn();            /* ask mp_modmult to also burn its
        !          1837:                                   own evidence */
        !          1838: 
        !          1839: #ifdef COUNTMULTS              /* diagnostic analysis */
        !          1840:     {
        !          1841:        long atomic_mults;
        !          1842:        unsigned int unitcount, totalmults;
        !          1843:        unitcount = bits2units(countbits(modulus));
        !          1844:        /* calculation assumes modsquare takes as long as a modmult: */
        !          1845:        atomic_mults = (long) tally_modmults *(unitcount * unitcount);
        !          1846:        atomic_mults += (long) tally_modsquares *(unitcount * unitcount);
        !          1847:        printf("%ld atomic mults for ", atomic_mults);
        !          1848:        printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
        !          1849:               tally_modsquares, tally_modmults,
        !          1850:               tally_modsquares + tally_modmults,
        !          1851:               countbits(modulus), unitcount);
        !          1852:     }
        !          1853: #endif                         /* COUNTMULTS */
        !          1854: 
        !          1855:     set_precision(oldprecision);       /* restore original precision */
        !          1856: 
        !          1857:     /* Do an explicit reference to the copyright notice so that the linker
        !          1858:        will be forced to include it in the executable object image... */
        !          1859:     copyright_notice();                /* has no real effect at run time */
1.1.1.6   root     1860: 
1.1.1.7 ! root     1861:     return 0;                  /* normal return */
        !          1862: }                              /* mp_modexp */
1.1.1.6   root     1863: 
                   1864: /*
                   1865:  * This is a faster modexp for moduli with a known factorisation into two
                   1866:  * relatively prime factors p and q, and an input relatively prime to the
                   1867:  * modulus, the Chinese Remainder Theorem to do the computation mod p and
                   1868:  * mod q, and then combine the results.  This relies on a number of
                   1869:  * precomputed values, but does not actually require the modulus n or the
                   1870:  * exponent e.
                   1871:  * 
                   1872:  * expout = expin ^ e mod (p*q).
                   1873:  * We form this by evaluating
                   1874:  * p2 = (expin ^ e) mod p and
                   1875:  * q2 = (expin ^ e) mod q
                   1876:  * and then combining the two by the CRT.
                   1877:  * 
                   1878:  * Two optimisations of this are possible.  First, we can reduce expin
                   1879:  * modulo p and q before starting.
                   1880:  * 
                   1881:  * Second, since we know the factorisation of p and q (trivially derived
                   1882:  * from the factorisation of n = p*q), and expin is relatively prime to
                   1883:  * both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m),
                   1884:  * to throw away multiples of phi(p) or phi(q) in e.
                   1885:  * Letting ep = e mod phi(p) and
                   1886:  *         eq = e mod phi(q)
                   1887:  * then combining these two speedups, we only need to evaluate
                   1888:  * p2 = ((expin mod p) ^ ep) mod p and
                   1889:  * q2 = ((expin mod q) ^ eq) mod q.
                   1890:  * 
                   1891:  * Now we need to apply the CRT.  Starting with
                   1892:  * expout = p2 (mod p) and
                   1893:  * expout = q2 (mod q)
                   1894:  * we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p,
                   1895:  * then 0 <= expout < p*q for some 0 <= k < q.  Since we want expout = q2
                   1896:  * (mod q), then p*k = q2-p2 (mod q).  Since p and q are relatively prime,
                   1897:  * p has a multiplicative inverse u mod q.  In other words, u = 1/p (mod q).
                   1898:  *
                   1899:  * Multiplying by u on both sides gives k = u*(q2-p2) (mod q).
                   1900:  * Since we want 0 <= k < q, we can thus find k as
                   1901:  * k = (u * (q2-p2)) mod q.
                   1902:  * 
                   1903:  * Once we have k, evaluating p2 + p * k is easy, and
                   1904:  * that gives us the result.
                   1905:  * 
                   1906:  * In the detailed implementation, there is a temporary, temp, used to
                   1907:  * hold intermediate results, p2 is held in expout, and q2 is used as a
                   1908:  * temporary in the final step when it is no longer needed.  With that,
                   1909:  * you should be able to understand the code below.
                   1910:  */
1.1.1.7 ! root     1911: int mp_modexp_crt(unitptr expout, unitptr expin,
        !          1912:                  unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u)
1.1.1.6   root     1913: {
1.1.1.7 ! root     1914:     unit q2[MAX_UNIT_PRECISION];
        !          1915:     unit temp[MAX_UNIT_PRECISION];
        !          1916:     int status;
        !          1917: 
        !          1918: /*      First, compiute p2 (physically held in M) */
        !          1919: 
        !          1920: /*      p2 = [ (expin mod p) ^ ep ] mod p               */
        !          1921:     mp_mod(temp, expin, p);    /* temp = expin mod p  */
        !          1922:     status = mp_modexp(expout, temp, ep, p);
        !          1923:     if (status < 0) {          /* mp_modexp returned an error. */
        !          1924:        mp_init(expout, 1);
        !          1925:        return status;          /* error return */
        !          1926:     }
        !          1927: /*      And the same thing for q2 */
        !          1928: 
        !          1929: /*      q2 = [ (expin mod q) ^ eq ] mod q               */
        !          1930:     mp_mod(temp, expin, q);    /* temp = expin mod q  */
        !          1931:     status = mp_modexp(q2, temp, eq, q);
        !          1932:     if (status < 0) {          /* mp_modexp returned an error. */
        !          1933:        mp_init(expout, 1);
        !          1934:        return status;          /* error return */
        !          1935:     }
        !          1936: /*      Now use the multiplicative inverse u to glue together the
        !          1937:    two halves.
        !          1938:  */
1.1.1.6   root     1939: #if 0
                   1940: /* This optimisation is useful if you commonly get small results,
                   1941:    but for random results and large q, the odds of (1/q) of it
                   1942:    being useful do not warrant the test.
1.1.1.7 ! root     1943:  */
        !          1944:     if (mp_compare(expout, q2) != 0) {
1.1.1.6   root     1945: #endif
1.1.1.7 ! root     1946:        /*      Find q2-p2 mod q */
        !          1947:        if (mp_sub(q2, expout)) /* if the result went negative */
        !          1948:            mp_add(q2, q);      /* add q to q2 */
        !          1949: 
        !          1950:        /*      expout = p2 + ( p * [(q2*u) mod q] )    */
        !          1951:        mp_mult(temp, q2, u);   /* q2*u  */
        !          1952:        mp_mod(q2, temp, q);    /* (q2*u) mod q */
        !          1953:        mp_mult(temp, p, q2);   /* p * [(q2*u) mod q] */
        !          1954:        mp_add(expout, temp);   /* expout = p2 + p * [...] */
1.1.1.6   root     1955: #if 0
1.1.1.7 ! root     1956:     }
1.1.1.6   root     1957: #endif
                   1958: 
1.1.1.7 ! root     1959:     mp_burn(q2);               /* burn the evidence on the stack... */
        !          1960:     mp_burn(temp);
        !          1961:     return 0;                  /* normal return */
        !          1962: }                              /* mp_modexp_crt */
1.1.1.6   root     1963: 
                   1964: 
                   1965: /****************** end of MPI library ****************************/
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.