Annotation of pgp/src/mpilib.c, revision 1.1.1.3

1.1.1.2   root        1: /*     C source code for multiprecision arithmetic library routines.
                      2:        Implemented Nov 86 by Philip Zimmermann
                      3:        Last revised 27 Nov 91 by PRZ
                      4: 
                      5:        Boulder Software Engineering
                      6:        3021 Eleventh Street
                      7:        Boulder, CO 80304
                      8:        (303) 541-0140
                      9: 
                     10:        (c) Copyright 1986-92 by Philip Zimmermann.  All rights reserved.
                     11:        The author assumes no liability for damages resulting from the use
                     12:        of this software, even if the damage results from defects in this
                     13:        software.  No warranty is expressed or implied.  The use of this
                     14:        cryptographic software for developing weapon systems is expressly
                     15:        forbidden.
                     16: 
                     17:        These routines implement all of the multiprecision arithmetic
                     18:        necessary for number-theoretic cryptographic algorithms such as
                     19:        ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
                     20:        composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
                     21:        key cryptography.
                     22: 
                     23:        Although originally developed in Microsoft C for the IBM PC, this code
                     24:        contains few machine dependencies.  It assumes 2's complement
                     25:        arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines,
                     26:        lowbyte-highbyte order or highbyte-lowbyte order.  This version
                     27:        has been converted to ANSI C.
                     28: 
                     29: 
                     30:        The internal representation for these extended precision integer
                     31:        "registers" is an array of "units".  A unit is a machine word, which
                     32:        is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
                     33:        unsigned integer, depending on the machine's word size.  For example,
                     34:        an IBM PC or AT uses a unit size of 16 bits.  To perform arithmetic
                     35:        on these huge precision integers, we pass pointers to these unit
                     36:        arrays to various subroutines.  A pointer to an array of units is of
                     37:        type unitptr.  This is a pointer to a huge integer "register".
                     38: 
                     39:        When calling a subroutine, we always pass a pointer to the BEGINNING
                     40:        of the array of units, regardless of the byte order of the machine.
                     41:        On a lowbyte-first machine, such as the Intel 80x86, this unitptr
                     42:        points to the LEAST significant unit, and the array of units increases
                     43:        significance to the right.  On a highbyte-first machine, such as the
                     44:        Motorola 680x0, this unitptr points to the MOST significant unit, and
                     45:        the array of units decreases significance to the right.
                     46: 
                     47:        Modified 8 Apr 92 - HAJK
                     48:        Implement new VAX/VMS primitive support.
                     49: 
                     50:        Modified 30 Sep 92 -Castor Fu <[email protected]>
                     51:        Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long)
                     52: 
                     53:        Modified 28 Nov 92 - Thad Smith
                     54:        Added Smith modmult, generalized non-portable support.
                     55: */
                     56: 
                     57: /* #define COUNTMULTS */ /* count modmults for performance studies */
                     58: 
                     59: #ifdef DEBUG
                     60: #ifdef MSDOS
1.1.1.3 ! root       61: #ifdef __GO32__ /* DJGPP */
        !            62: #include <pc.h>
        !            63: #else
1.1.1.2   root       64: #include <conio.h>
1.1.1.3 ! root       65: #endif  /* __GO32__ */
1.1.1.2   root       66: #define poll_for_break() {while (kbhit()) getch();}
                     67: #endif /* MSDOS */
                     68: #endif /* DEBUG */
                     69: 
                     70: #ifndef poll_for_break
                     71: #define poll_for_break()  /* stub */
                     72: #endif
                     73: 
                     74: #include "mpilib.h"
                     75: 
                     76: #ifdef mp_smula
                     77: #ifdef mp_smul
                     78:        Error: Both mp_smula and mp_smul cannot be defined.
                     79: #else
                     80: #define mp_smul        mp_smula
                     81: #endif
                     82: #endif
                     83: 
                     84: /* set macros for MULTUNIT */
                     85: #ifdef MUNIT8
                     86: #define MULTUNITSIZE   8
                     87: typedef unsigned char MULTUNIT;
                     88: #ifdef UNIT8
                     89: #define MULTUNIT_SIZE_SAME
                     90: #endif
                     91: #else  /* not MUNIT8 */
                     92: #ifdef MUNIT32
                     93: #define MULTUNITSIZE   32
                     94: typedef unsigned long MULTUNIT;
                     95: #ifdef UNIT32
                     96: #define MULTUNIT_SIZE_SAME
                     97: #else
                     98: /* #error is not portable, this has the same effect */
                     99: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
                    100: #endif
                    101: #else   /* assume MUNIT16 */
                    102: #define MULTUNITSIZE   16
                    103: typedef unsigned short MULTUNIT;
                    104: #ifdef UNIT16
                    105: #define MULTUNIT_SIZE_SAME
                    106: #endif /* UNIT16 */
                    107: #ifdef UNIT8
                    108: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
                    109: #endif /* UNIT8 */
                    110: #endif /* MUNIT32 */
                    111: #endif /* MUNIT8 */
                    112: 
                    113: #define MULTUNIT_msb    ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */
                    114: #define DMULTUNIT_msb   (1L << (2*MULTUNITSIZE-1))
                    115: #define MULTUNIT_mask   ((MULTUNIT)((1L << MULTUNITSIZE)-1))
                    116: #define MULTUNITs_perunit   (UNITSIZE/MULTUNITSIZE)
                    117: 
                    118: 
                    119: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier);
                    120: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
                    121: 
                    122: short global_precision = 0; /* units of precision for all routines */
                    123: /*     global_precision is the unit precision last set by set_precision.
                    124:        Initially, set_precision() should be called to define global_precision
                    125:        before using any of these other multiprecision library routines.
                    126:        i.e.:   set_precision(MAX_UNIT_PRECISION);
                    127: */
                    128: 
                    129: /*************** multiprecision library primitives ****************/
                    130: /*     The following portable C primitives should be recoded in assembly.
                    131:        The entry point name should be defined, in "mpilib.h" to the external
                    132:        entry point name.  If undefined, the C version will be used.
                    133: */
                    134: 
                    135: typedef unsigned long int ulint;
                    136: 
                    137: #ifndef mp_addc
                    138: boolean mp_addc
                    139:        (register unitptr r1,register unitptr r2,register boolean carry)
                    140:        /* multiprecision add with carry r2 to r1, result in r1 */
                    141:        /* carry is incoming carry flag-- value should be 0 or 1 */
                    142: {      register unit x;
                    143:        short precision;        /* number of units to add */
                    144:        precision = global_precision;
                    145:        make_lsbptr(r1,precision);
                    146:        make_lsbptr(r2,precision);
                    147:        while (precision--)
                    148:        {
                    149:                if (carry)
                    150:                {       x = *r1 + *r2 + 1;
                    151:                        carry = (*r2  >= (unit)(~ *r1));
                    152:                } else
                    153:                {       x = *r1 + *r2;
                    154:                        carry = (x < *r1) ;
                    155:                }
                    156:                post_higherunit(r2);
                    157:                *post_higherunit(r1) = x;
                    158:        }
                    159:        return(carry);          /* return the final carry flag bit */
                    160: }      /* mp_addc */
                    161: #endif  /* mp_addc */
                    162: 
                    163: #ifndef mp_subb
                    164: boolean mp_subb
                    165:        (register unitptr r1,register unitptr r2,register boolean borrow)
                    166:        /* multiprecision subtract with borrow, r2 from r1, result in r1 */
                    167:        /* borrow is incoming borrow flag-- value should be 0 or 1 */
                    168: {      register unit x;
                    169:        short precision;        /* number of units to subtract */
                    170:        precision = global_precision;
                    171:        make_lsbptr(r1,precision);
                    172:        make_lsbptr(r2,precision);
                    173:        while (precision--)
                    174:        {       if (borrow)
                    175:                {       x = *r1 - *r2 - 1;
                    176:                        borrow = (*r1 <= *r2);
                    177:                } else
                    178:                {       x = *r1 - *r2;
                    179:                        borrow = (*r1 < *r2);
                    180:                }
                    181:                post_higherunit(r2);
                    182:                *post_higherunit(r1) = x;
                    183:        }
                    184:        return(borrow); /* return the final carry/borrow flag bit */
                    185: }      /* mp_subb */
                    186: #endif  /* mp_subb */
                    187: 
                    188: #ifndef mp_rotate_left
                    189: boolean mp_rotate_left(register unitptr r1,register boolean carry)
                    190:        /* multiprecision rotate left 1 bit with carry, result in r1. */
                    191:        /* carry is incoming carry flag-- value should be 0 or 1 */
                    192: {      register int precision; /* number of units to rotate */
                    193:        unsigned int mcarry = carry, nextcarry; /* int is supposed to be
                    194:                                                 * the efficient size for ops*/
                    195:        precision = global_precision;
                    196:        make_lsbptr(r1,precision);
                    197:        while (precision--)
                    198:        {
                    199:                nextcarry = (((signedunit) *r1) < 0);
                    200:                *r1 = (*r1 << 1) | mcarry;
                    201:                mcarry = nextcarry;
                    202:                pre_higherunit(r1);
                    203:        }
                    204:        return(nextcarry);      /* return the final carry flag bit */
                    205: }      /* mp_rotate_left */
                    206: #endif  /* mp_rotate_left */
                    207: 
                    208: /************** end of primitives that should be in assembly *************/
                    209: 
                    210: 
                    211: /*     The mp_shift_right_bits function is not called in any time-critical
                    212:        situations in public-key cryptographic functions, so it doesn't
                    213:        need to be coded in assembly language.
                    214: */
                    215: void mp_shift_right_bits(register unitptr r1,register short bits)
                    216:        /*      multiprecision shift right bits, result in r1.
                    217:                bits is how many bits to shift, must be <= UNITSIZE.
                    218:        */
                    219: {      register short precision;       /* number of units to shift */
                    220:        register unit carry,nextcarry,bitmask;
                    221:        register short unbits;
                    222:        if (bits==0) return;    /* shift zero bits is a no-op */
                    223:        carry = 0;
                    224:        bitmask = power_of_2(bits)-1;
                    225:        unbits = UNITSIZE-bits;         /* shift bits must be <= UNITSIZE */
                    226:        precision = global_precision;
                    227:        make_msbptr(r1,precision);
                    228:        if (bits == UNITSIZE) {
                    229:                while (precision--)  {
                    230:                        nextcarry = *r1;
                    231:                        *r1 = carry;
                    232:                        carry = nextcarry;
                    233:                        pre_lowerunit(r1);
                    234:                }
                    235:        } else {
                    236:                while (precision--)
                    237:                {       nextcarry = *r1 & bitmask;
                    238:                        *r1 >>= bits;
                    239:                        *r1 |= carry << unbits;
                    240:                        carry = nextcarry;
                    241:                        pre_lowerunit(r1);
                    242:                }
                    243:        }
                    244: }      /* mp_shift_right_bits */
                    245: 
                    246: 
                    247: #ifndef mp_compare
                    248: short mp_compare(register unitptr r1,register unitptr r2)
                    249: /*     Compares multiprecision integers *r1, *r2, and returns:
                    250:        -1 iff *r1 < *r2
                    251:         0 iff *r1 == *r2
                    252:        +1 iff *r1 > *r2
                    253: */
                    254: {      register short precision;       /* number of units to compare */
                    255: 
                    256:        precision = global_precision;
                    257:        make_msbptr(r1,precision);
                    258:        make_msbptr(r2,precision);
                    259:        do
                    260:        {       if (*r1 < *r2)
                    261:                        return(-1);
                    262:                if (*post_lowerunit(r1) > *post_lowerunit(r2))
                    263:                        return(1);
                    264:        } while (--precision);
                    265:        return(0);      /*  *r1 == *r2  */
                    266: }      /* mp_compare */
                    267: #endif /* mp_compare */
                    268: 
                    269: 
                    270: boolean mp_inc(register unitptr r)
                    271:        /* Increment multiprecision integer r. */
                    272: {      register short precision;
                    273:        precision = global_precision;
                    274:        make_lsbptr(r,precision);
                    275:        do
                    276:        {       if ( ++(*r) ) return(0);        /* no carry */
                    277:                post_higherunit(r);
                    278:        } while (--precision);
                    279:        return(1);              /* return carry set */
                    280: }      /* mp_inc */
                    281: 
                    282: 
                    283: boolean mp_dec(register unitptr r)
                    284:        /* Decrement multiprecision integer r. */
                    285: {      register short precision;
                    286:        precision = global_precision;
                    287:        make_lsbptr(r,precision);
                    288:        do
                    289:        {       if ( (signedunit) (--(*r)) != (signedunit) -1 )
                    290:                        return(0);      /* no borrow */
                    291:                post_higherunit(r);
                    292:        } while (--precision);
                    293:        return(1);              /* return borrow set */
                    294: }      /* mp_dec */
                    295: 
                    296: 
                    297: void mp_neg(register unitptr r)
                    298:        /* Compute 2's complement, the arithmetic negative, of r */
                    299: {      register short precision;       /* number of units to negate */
                    300:        precision = global_precision;
                    301:        mp_dec(r);      /* 2's complement is 1's complement plus 1 */
                    302:        do      /* now do 1's complement */
                    303:        {       *r = ~(*r);
                    304:                r++;
                    305:        } while (--precision);
                    306: }      /* mp_neg */
                    307: 
                    308: #ifndef mp_move
                    309: void mp_move(register unitptr dst,register unitptr src)
                    310: {      register short precision;       /* number of units to move */
                    311:        precision = global_precision;
                    312:        do { *dst++ = *src++; } while (--precision);
                    313: }      /* mp_move */
                    314: #endif /* mp_move */
                    315: 
                    316: void mp_init(register unitptr r, word16 value)
                    317:        /* Init multiprecision register r with short value. */
                    318: {      /* Note that mp_init doesn't extend sign bit for >32767 */
                    319: 
                    320:        unitfill0( r, global_precision);
                    321:        make_lsbptr(r,global_precision);
                    322:        *post_higherunit(r) = value;
                    323: #ifdef UNIT8
                    324:        *post_higherunit(r) = value >> UNITSIZE;
                    325: #endif
                    326: }      /* mp_init */
                    327: 
                    328: 
                    329: short significance(register unitptr r)
                    330:        /* Returns number of significant units in r */
                    331: {      register short precision;
                    332:        precision = global_precision;
                    333:        make_msbptr(r,precision);
                    334:        do
                    335:        {       if (*post_lowerunit(r))
                    336:                        return(precision);
                    337:        } while (--precision);
                    338:        return(precision);
                    339: }      /* significance */
                    340: 
                    341: 
                    342: #ifndef unitfill0
                    343: void unitfill0(unitptr r,word16 unitcount)
                    344:        /* Zero-fill the unit buffer r. */
                    345: {      while (unitcount--) *r++ = 0;
                    346: }      /* unitfill0 */
                    347: #endif  /* unitfill0 */
                    348: 
                    349: 
                    350: int mp_udiv(register unitptr remainder,register unitptr quotient,
                    351:        register unitptr dividend,register unitptr divisor)
                    352:        /* Unsigned divide, treats both operands as positive. */
                    353: {      int bits;
                    354:        short dprec;
                    355:        register unit bitmask;
                    356:        if (testeq(divisor,0))
                    357:                return(-1);     /* zero divisor means divide error */
                    358:        mp_init0(remainder);
                    359:        mp_init0(quotient);
                    360:        /* normalize and compute number of bits in dividend first */
                    361:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    362:        /* rescale quotient to same precision (dprec) as dividend */
                    363:        rescale(quotient,global_precision,dprec);
                    364:        make_msbptr(quotient,dprec);
                    365: 
                    366:        while (bits--)
                    367:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
                    368:                if (mp_compare(remainder,divisor) >= 0)
                    369:                {       mp_sub(remainder,divisor);
                    370:                        stuff_bit(quotient,bitmask);
                    371:                }
                    372:                bump_2bitsniffers(dividend,quotient,bitmask);
                    373:        }
                    374:        return(0);
                    375: } /* mp_udiv */
                    376: 
                    377: 
                    378: #ifdef UPTON_OR_SMITH
                    379: #define RECIPMARGIN 0  /* extra margin bits used by mp_recip() */
                    380: 
                    381: int mp_recip(register unitptr quotient,register unitptr divisor)
                    382:        /* Compute reciprocal (quotient) as 1/divisor.  Used by faster modmult. */
                    383: {      int bits;
                    384:        short qprec;
                    385:        register unit bitmask;
                    386:        unit remainder[MAX_UNIT_PRECISION];
                    387:        if (testeq(divisor,0))
                    388:                return(-1);     /* zero divisor means divide error */
                    389:        mp_init0(remainder);
                    390:        mp_init0(quotient);
                    391: 
                    392:        /* normalize and compute number of bits in quotient first */
                    393:        bits = countbits(divisor) + RECIPMARGIN;
                    394:        bitmask = bitmsk(bits);         /* bitmask within a single unit */
                    395:        qprec = bits2units(bits+1);
                    396:        mp_setbit(remainder,(bits-RECIPMARGIN)-1);
                    397:        /* rescale quotient to precision of divisor + RECIPMARGIN bits */
                    398:        rescale(quotient,global_precision,qprec);
                    399:        make_msbptr(quotient,qprec);
                    400: 
                    401:        while (bits--)
                    402:        {       mp_shift_left(remainder);
                    403:                if (mp_compare(remainder,divisor) >= 0)
                    404:                {       mp_sub(remainder,divisor);
                    405:                        stuff_bit(quotient,bitmask);
                    406:                }
                    407:                bump_bitsniffer(quotient,bitmask);
                    408:        }
                    409:        mp_init0(remainder);    /* burn sensitive data left on stack */
                    410:        return(0);
                    411: } /* mp_recip */
                    412: #endif /* UPTON_OR_SMITH */
                    413: 
                    414: 
                    415: int mp_div(register unitptr remainder,register unitptr quotient,
                    416:        register unitptr dividend,register unitptr divisor)
                    417:        /* Signed divide, either or both operands may be negative. */
                    418: {      boolean dvdsign,dsign;
                    419:        int status;
                    420:        dvdsign = (boolean)(mp_tstminus(dividend)!=0);
                    421:        dsign = (boolean)(mp_tstminus(divisor)!=0);
                    422:        if (dvdsign) mp_neg(dividend);
                    423:        if (dsign) mp_neg(divisor);
                    424:        status = mp_udiv(remainder,quotient,dividend,divisor);
                    425:        if (dvdsign) mp_neg(dividend);          /* repair caller's dividend */
                    426:        if (dsign) mp_neg(divisor);             /* repair caller's divisor */
                    427:        if (status<0) return(status);           /* divide error? */
                    428:        if (dvdsign) mp_neg(remainder);
                    429:        if (dvdsign ^ dsign) mp_neg(quotient);
                    430:        return(status);
                    431: } /* mp_div */
                    432: 
                    433: 
                    434: word16 mp_shortdiv(register unitptr quotient,
                    435:        register unitptr dividend,register word16 divisor)
                    436: /*     This function does a fast divide and mod on a multiprecision dividend
                    437:        using a short integer divisor returning a short integer remainder.
                    438:        This is an unsigned divide.  It treats both operands as positive.
                    439:        It is used mainly for faster printing of large numbers in base 10.
                    440: */
                    441: {      int bits;
                    442:        short dprec;
                    443:        register unit bitmask;
                    444:        register word16 remainder;
                    445:        if (!divisor)   /* if divisor == 0 */
                    446:                return(-1);     /* zero divisor means divide error */
                    447:        remainder=0;
                    448:        mp_init0(quotient);
                    449:        /* normalize and compute number of bits in dividend first */
                    450:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    451:        /* rescale quotient to same precision (dprec) as dividend */
                    452:        rescale(quotient,global_precision,dprec);
                    453:        make_msbptr(quotient,dprec);
                    454: 
                    455:        while (bits--)
                    456:        {       remainder <<= 1;
                    457:                if (sniff_bit(dividend,bitmask))
                    458:                        remainder++;
                    459:                if (remainder >= divisor)
                    460:                {       remainder -= divisor;
                    461:                        stuff_bit(quotient,bitmask);
                    462:                }
                    463:                bump_2bitsniffers(dividend,quotient,bitmask);
                    464:        }
                    465:        return(remainder);
                    466: } /* mp_shortdiv */
                    467: 
                    468: 
                    469: int mp_mod(register unitptr remainder,
                    470:        register unitptr dividend,register unitptr divisor)
                    471:        /* Unsigned divide, treats both operands as positive. */
                    472: {      int bits;
                    473:        short dprec;
                    474:        register unit bitmask;
                    475:        if (testeq(divisor,0))
                    476:                return(-1);     /* zero divisor means divide error */
                    477:        mp_init0(remainder);
                    478:        /* normalize and compute number of bits in dividend first */
                    479:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    480: 
                    481:        while (bits--)
                    482:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
                    483:                msub(remainder,divisor);
                    484:                bump_bitsniffer(dividend,bitmask);
                    485:        }
                    486:        return(0);
                    487: } /* mp_mod */
                    488: 
                    489: 
                    490: word16 mp_shortmod(register unitptr dividend,register word16 divisor)
                    491: /*     This function does a fast mod operation on a multiprecision dividend
                    492:        using a short integer modulus returning a short integer remainder.
                    493:        This is an unsigned divide.  It treats both operands as positive.
                    494:        It is used mainly for fast sieve searches for large primes.
                    495: */
                    496: {      int bits;
                    497:        short dprec;
                    498:        register unit bitmask;
                    499:        register word16 remainder;
                    500:        if (!divisor)   /* if divisor == 0 */
                    501:                return(-1);     /* zero divisor means divide error */
                    502:        remainder=0;
                    503:        /* normalize and compute number of bits in dividend first */
                    504:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    505: 
                    506:        while (bits--)
                    507:        {       remainder <<= 1;
                    508:                if (sniff_bit(dividend,bitmask))
                    509:                        remainder++;
                    510:                if (remainder >= divisor) remainder -= divisor;
                    511:                bump_bitsniffer(dividend,bitmask);
                    512:        }
                    513:        return(remainder);
                    514: } /* mp_shortmod */
                    515: 
                    516: 
                    517: 
                    518: #ifdef COMB_MULT       /* use faster "comb" multiply algorithm */
                    519:        /* We are skipping this code because it has a bug... */
                    520: 
                    521: int mp_mult(register unitptr prod,
                    522:        register unitptr multiplicand, register unitptr multiplier)
                    523:        /*      Computes multiprecision prod = multiplicand * multiplier */
                    524: {      /*      Uses interleaved comb multiply algorithm.
                    525:                This improved multiply more than twice as fast as a Russian
                    526:                peasant multiply, because it does a lot fewer shifts.
                    527:                Must have global_precision set to the size of the multiplicand
                    528:                plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum
                    529:                of the lengths of the multiplier and multiplicand.
                    530: 
                    531:                BUG ALERT:  Unfortunately, this code has a bug.  It fails for
                    532:                some numbers.  One such example:
                    533:                x=   59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
                    534:                x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
                    535:                     52C8 CDC7 AFB3 61C8 243C 741B
                    536:                --which is obviously wrong.  The answer should be:
                    537:                x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
                    538:                     C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
                    539:                We'll have to fix this some day.  In the meantime, we'll
                    540:                just have the compiler skip over this code.
                    541:        */
                    542:        int bits;
                    543:        register unit bitmask;
                    544:        unitptr product, mplier, temp;
                    545:        short mprec,mprec2;
                    546:        unit mplicand[MAX_UNIT_PRECISION];
                    547:        mp_init(prod,0);                /* better clear full width--double precision */
                    548:        if (testeq(multiplicand,0))
                    549:                return(0);      /* zero multiplicand means zero product */
                    550: 
                    551:        mp_move(mplicand,multiplicand); /* save it from damage */
                    552: 
                    553:        normalize(multiplier,mprec);
                    554:        if (!mprec)
                    555:                return(0);
                    556: 
                    557:        make_lsbptr(multiplier,mprec);
                    558:        bitmask = 1;    /* start scan at LSB of multiplier */
                    559: 
                    560:        do      /* UNITSIZE times */
                    561:        {       /* do for bits 0-15 */
                    562:                product = prod;
                    563:                mplier = multiplier;
                    564:                mprec2 = mprec;
                    565:                while (mprec2--)        /* do for each word in multiplier */
                    566:                {
                    567:                        if (sniff_bit(mplier,bitmask))
                    568:                        {       if (mp_addc(product,multiplicand,0))    /* ripple carry */
                    569:                                {       /* After 1st time thru, this is rarely encountered. */
                    570:                                        temp = msbptr(product,global_precision);
                    571:                                        pre_higherunit(temp);
                    572:                                        /* temp now points to LSU of carry region. */
                    573:                                        unmake_lsbptr(temp,global_precision);
                    574:                                        mp_inc(temp);
                    575:                                }       /* ripple carry */
                    576:                        }
                    577:                        pre_higherunit(mplier);
                    578:                        pre_higherunit(product);
                    579:                }
                    580:                if (!(bitmask <<= 1))
                    581:                        break;
                    582:                mp_shift_left(multiplicand);
                    583: 
                    584:        } while (TRUE);
                    585: 
                    586:        mp_move(multiplicand,mplicand); /* recover */
                    587: 
                    588:        return(0);      /* normal return */
                    589: }      /* mp_mult */
                    590: 
                    591: #endif /* COMB_MULT */
                    592: 
                    593: 
                    594: /*     Because the "comb" multiply has a bug, we will use the slower
                    595:        Russian peasant multiply instead.  Fortunately, the mp_mult
                    596:        function is not called from any time-critical code.
                    597: */
                    598: 
                    599: int mp_mult(register unitptr prod,
                    600:        register unitptr multiplicand,register unitptr multiplier)
                    601:        /* Computes multiprecision prod = multiplicand * multiplier */
                    602: {      /* Uses "Russian peasant" multiply algorithm. */
                    603:        int bits;
                    604:        register unit bitmask;
                    605:        short mprec;
                    606:        mp_init(prod,0);
                    607:        if (testeq(multiplicand,0))
                    608:                return(0);      /* zero multiplicand means zero product */
                    609:        /* normalize and compute number of bits in multiplier first */
                    610:        init_bitsniffer(multiplier,bitmask,mprec,bits);
                    611: 
                    612:        while (bits--)
                    613:        {       mp_shift_left(prod);
                    614:                if (sniff_bit(multiplier,bitmask))
                    615:                        mp_add(prod,multiplicand);
                    616:                bump_bitsniffer(multiplier,bitmask);
                    617:        }
                    618:        return(0);
                    619: }      /* mp_mult */
                    620: 
                    621: 
                    622: 
                    623: /*     mp_modmult computes a multiprecision multiply combined with a
                    624:        modulo operation.  This is the most time-critical function in
                    625:        this multiprecision arithmetic library for performing modulo
                    626:        exponentiation.  We experimented with different versions of modmult,
                    627:        depending on the machine architecture and performance requirements.
                    628:        We will either use a Russian Peasant modmult (peasant_modmult), 
                    629:        Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
                    630:        modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
                    631:        On machines with a hardware atomic multiply instruction,
                    632:        Smith's modmult is fastest.  It can utilize assembly subroutines to
                    633:        speed up the hardware multiply logic and trial quotient calculation.
                    634:        If the machine lacks a fast hardware multiply, Merritt's modmult
                    635:        is preferred, which doesn't call any assembly multiply routine.
                    636:        We use the alias names mp_modmult, stage_modulus, and modmult_burn
                    637:        for the corresponding true names, which depend on what flavor of
                    638:        modmult we are using.
                    639: 
                    640:        Before making the first call to mp_modmult, you must set up the
                    641:        modulus-dependant precomputated tables by calling stage_modulus.
                    642:        After making all the calls to mp_modmult, you call modmult_burn to
                    643:        erase the tables created by stage_modulus that were left in memory.
                    644: */
                    645: 
                    646: #ifdef COUNTMULTS
                    647: /* "number of modmults" counters, used for performance studies. */
                    648: static unsigned int tally_modmults = 0;
                    649: static unsigned int tally_modsquares = 0;
                    650: #endif /* COUNTMULTS */
                    651: 
                    652: 
                    653: #ifdef PEASANT
                    654: /* Conventional Russian peasant multiply with modulo algorithm. */
                    655: 
                    656: static unitptr pmodulus = 0;   /* used only by mp_modmult */
                    657: 
                    658: int stage_peasant_modulus(unitptr n)
                    659: /*     Must pass modulus to stage_modulus before calling modmult.
                    660:        Assumes that global_precision has already been adjusted to the
                    661:        size of the modulus, plus SLOP_BITS.
                    662: */
                    663: {      /* For this simple version of modmult, just copy unit pointer. */
                    664:        pmodulus = n;
                    665:        return(0);      /* normal return */
                    666: }      /* stage_peasant_modulus */
                    667: 
                    668: 
                    669: int peasant_modmult(register unitptr prod,
                    670:        unitptr multiplicand,register unitptr multiplier)
                    671: {      /*      "Russian peasant" multiply algorithm, combined with a modulo
                    672:                operation.  This is a simple naive replacement for Merritt's
                    673:                faster modmult algorithm.  References global unitptr "modulus".
                    674:                Computes:  prod = (multiplicand*multiplier) mod modulus
                    675:                WARNING: All the arguments must be less than the modulus!
                    676:        */
                    677:        int bits;
                    678:        register unit bitmask;
                    679:        short mprec;
                    680:        mp_init(prod,0);
                    681: /*     if (testeq(multiplicand,0))
                    682:                return(0); */   /* zero multiplicand means zero product */
                    683:        /* normalize and compute number of bits in multiplier first */
                    684:        init_bitsniffer(multiplier,bitmask,mprec,bits);
                    685: 
                    686:        while (bits--)
                    687:        {       mp_shift_left(prod);
                    688:                msub(prod,pmodulus);    /* turns mult into modmult */
                    689:                if (sniff_bit(multiplier,bitmask))
                    690:                {       mp_add(prod,multiplicand);
                    691:                        msub(prod,pmodulus);    /* turns mult into modmult */
                    692:                }
                    693:                bump_bitsniffer(multiplier,bitmask);
                    694:        }
                    695:        return(0);
                    696: }      /* peasant_modmult */
                    697: 
                    698: 
                    699: /*     If we are using a version of mp_modmult that uses internal tables
                    700:        in memory, we have to call modmult_burn() at the end of mp_modexp.
                    701:        This is so that no sensitive data is left in memory after the program
                    702:        exits.  The Russian peasant method doesn't use any such tables.
                    703: */
                    704: void peasant_burn(void)
                    705: /*     Alias for modmult_burn, called only from mp_modexp().  Destroys
                    706:        internal modmult tables.  This version does nothing because no
                    707:        tables are used by the Russian peasant modmult. */
                    708: { }    /* peasant_burn */
                    709: 
                    710: #endif /* PEASANT */
                    711: 
                    712: 
                    713: #ifdef MERRITT
                    714: /*=========================================================================*/
                    715: /*
                    716:        This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
                    717:        Also refined by Zhahai Stewart to reduce the number of subtracts.
                    718:     Modified by Raymond Brand to reduce the number of SLOP_BITS by 1.
                    719:        It performs a multiply combined with a modulo operation, without
                    720:        going into "double precision".  It is faster than the Russian peasant
                    721:        method, and still works well on machines that lack a fast hardware
                    722:        multiply instruction.
                    723: */
                    724: 
                    725: /*     The following support functions, macros, and data structures
                    726:        are used only by Merritt's modmult algorithm... */
                    727: 
                    728: static void mp_lshift_unit(register unitptr r1)
                    729: /*     Shift r1 1 whole unit to the left.  Used only by modmult function. */
                    730: {      register short precision;
                    731:        register unitptr r2;
                    732:        precision = global_precision;
                    733:        make_msbptr(r1,precision);
                    734:        r2 = r1;
                    735:        while (--precision)
                    736:                *post_lowerunit(r1) = *pre_lowerunit(r2);
                    737:        *r1 = 0;
                    738: }      /* mp_lshift_unit */
                    739: 
                    740: 
                    741: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
                    742: static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
                    743: static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */
                    744: {      0
                    745:        ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0],
                    746:         &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0]
                    747: #ifndef UNIT8
                    748:        ,&moduli_buf[ 7][0]
                    749:        ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0], 
                    750:         &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0]
                    751: #ifndef UNIT16 /* and not UNIT8 */
                    752:        ,&moduli_buf[15][0]
                    753:        ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0], 
                    754:         &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0], 
                    755:         &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0], 
                    756:         &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0]
                    757: #endif /* UNIT16 and UNIT8 not defined */
                    758: #endif /* UNIT8 not defined */
                    759: };
                    760: 
                    761: /*     To optimize msubs, need following 2 unit arrays, each filled
                    762:        with the most significant unit and next-to-most significant unit
                    763:        of the preshifted images of the modulus. */
                    764: static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */
                    765: static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */
                    766: 
                    767: /*     mpdbuf contains preshifted images of the multiplicand, mod n.
                    768:        It is used only by mp_modmult.  It could be staticly declared
                    769:        inside of mp_modmult, but we put it outside mp_modmult so that
                    770:        it can be wiped clean by modmult_burn(), which is called at the
                    771:        end of mp_modexp.  This is so that no sensitive data is left in
                    772:        memory after the program exits.
                    773: */
                    774: static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
                    775: 
                    776: 
                    777: static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
                    778: /*     Computes UNITSIZE images of r, each shifted left 1 more bit.
                    779:        Used only by modmult function.
                    780: */
                    781: {      short int i;
                    782:        images[0] = r;  /* no need to move the first image, just copy ptr */
                    783:        for (i=1; i<UNITSIZE; i++)
                    784:        {       mp_move(images[i],images[i-1]);
                    785:                mp_shift_left(images[i]);
                    786:                msub(images[i],moduli[0]);
                    787:        }
                    788: }      /* stage_mp_images */
                    789: 
                    790: 
                    791: int stage_merritt_modulus(unitptr n)
                    792: /*     Computes UNITSIZE images of modulus, each shifted left 1 more bit.
                    793:        Before calling mp_modmult, you must first stage the moduli images by
                    794:        calling stage_modulus.  n is the pointer to the modulus.
                    795:        Assumes that global_precision has already been adjusted to the
                    796:        size of the modulus, plus SLOP_BITS.
                    797: */
                    798: {      short int i;
                    799:        unitptr msu;    /* ptr to most significant unit, for faster msubs */
                    800:        moduli[0] = n;  /* no need to move the first image, just copy ptr */
                    801: 
                    802:        /* used by optimized msubs macro... */
                    803:        msu = msbptr(n,global_precision);       /* needed by msubs */
                    804:        msu_moduli[0] = *post_lowerunit(msu);
                    805:        nmsu_moduli[0] = *msu;
                    806: 
                    807:        for (i=1; i<UNITSIZE; i++)
                    808:        {       mp_move(moduli[i],moduli[i-1]);
                    809:                mp_shift_left(moduli[i]);
                    810: 
                    811:                /* used by optimized msubs macro... */
                    812:                msu = msbptr(moduli[i],global_precision);       /* needed by msubs */
                    813:                msu_moduli[i] = *post_lowerunit(msu);   /* for faster msubs */
                    814:                nmsu_moduli[i] = *msu;
                    815:        }
                    816:        return(0);      /* normal return */
                    817: }      /* stage_merritt_modulus */
                    818: 
                    819: 
                    820: /* The following macros, sniffadd and msubs, are used by modmult... */
                    821: #define sniffadd(i) if (*multiplier & power_of_2(i))  mp_add(prod,mpd[i])
                    822: 
                    823: /* Unoptimized msubs macro (msubs0) follows... */
                    824: /* #define msubs0(i) msub(prod,moduli[i])
                    825: */
                    826: 
                    827: /*     To optimize msubs, compare the most significant units of the
                    828:        product and the shifted modulus before deciding to call the full
                    829:        mp_compare routine.  Better still, compare the upper two units
                    830:        before deciding to call mp_compare.
                    831:        Optimization of msubs relies heavily on standard C left-to-right
                    832:        parsimonious evaluation of logical expressions.
                    833: */
                    834: 
                    835: /* Partially-optimized msubs macro (msubs1) follows... */
                    836: /* #define msubs1(i) if ( \
                    837:   ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
                    838:   (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
                    839:   ) mp_sub(prod,moduli[i])
                    840: */
                    841: 
                    842: /* Fully-optimized msubs macro (msubs2) follows... */
                    843: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
                    844:  (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
                    845:  (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
                    846:  mp_sub(prod,moduli[i])
                    847: 
                    848: 
                    849: int merritt_modmult(register unitptr prod,
                    850:        unitptr multiplicand,register unitptr multiplier)
                    851:        /*      Performs combined multiply/modulo operation.
                    852:                Computes:  prod = (multiplicand*multiplier) mod modulus
                    853:                WARNING: All the arguments must be less than the modulus!
                    854:                Assumes the modulus has been predefined by first calling
                    855:                stage_modulus.
                    856:        */
                    857: {
                    858:        /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
                    859:        register signedunit p_m;
                    860:        register unitptr msu_prod;      /* ptr to most significant unit of product */
                    861:        register unitptr nmsu_prod;     /* next-most signif. unit of product */
                    862:        short mprec;            /* precision of multiplier, in units */
                    863:        /*      Array mpd contains a list of pointers to preshifted images of
                    864:                the multiplicand: */
                    865:        static unitptr mpd[UNITSIZE] =
                    866:        {        0,              &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
                    867:                 &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
                    868: #ifndef UNIT8
                    869:                ,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
                    870:                 &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
                    871: #ifndef UNIT16 /* and not UNIT8 */
                    872:                ,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
                    873:                 &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
                    874:                 &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
                    875:                 &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
                    876: #endif /* UNIT16 and UNIT8 not defined */
                    877: #endif /* UNIT8 not defined */
                    878:        };
                    879: 
                    880:        /* Compute preshifted images of multiplicand, mod n: */
                    881:        stage_mp_images(mpd,multiplicand);
                    882: 
                    883:        /* To optimize msubs, set up msu_prod and nmsu_prod: */
                    884:        msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
                    885:        nmsu_prod = msu_prod;
                    886:        post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
                    887: 
                    888:        /*      To understand this algorithm, it would be helpful to first
                    889:                study the conventional Russian peasant modmult algorithm.
                    890:                This one does about the same thing as Russian peasant, but
                    891:                is organized differently to save some steps.  It loops
                    892:                through the multiplier a word (unit) at a time, instead of
                    893:                a bit at a time.  It word-shifts the product instead of
                    894:                bit-shifting it, so it should be faster.  It also does about
                    895:                half as many subtracts as Russian peasant.
                    896:        */
                    897: 
                    898:        mp_init(prod,0);        /* Initialize product to 0. */
                    899: 
                    900:        /*      The way mp_modmult is actually used in cryptographic
                    901:                applications, there will NEVER be a zero multiplier or
                    902:                multiplicand.  So there is no need to optimize for that
                    903:                condition.
                    904:        */
                    905: /*     if (testeq(multiplicand,0))
                    906:                return(0); */   /* zero multiplicand means zero product */
                    907:        /* Normalize and compute number of units in multiplier first: */
                    908:        normalize(multiplier,mprec);
                    909:        if (mprec==0)   /* if precision of multiplier is 0 */
                    910:                return(0);      /* zero multiplier means zero product */
                    911:        make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
                    912: 
                    913:        while (mprec--) /* Loop for the number of units in the multiplier */
                    914:        {
                    915:                /*      Shift the product one whole unit to the left.
                    916:                        This is part of the multiply phase of modmult.
                    917:                */
                    918: 
                    919:                mp_lshift_unit(prod);
                    920: 
                    921:                /*      The product may have grown by as many as UNITSIZE
                    922:                        bits.  That's why we have global_precision set to the
                    923:                        size of the modulus plus UNITSIZE slop bits.
                    924:                        Now reduce the product back down by conditionally
                    925:                        subtracting the preshifted images of the modulus.
                    926:                        This is part of the modulo reduction phase of modmult. 
                    927:                        The following loop is unrolled for speed, using macros...
                    928: 
                    929:                for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--)
                    930:                        if (mp_compare(prod,moduli[i]) >= 0)
                    931:                                mp_sub(prod,moduli[i]);
                    932:                */
                    933: 
                    934: #ifndef UNIT8
                    935: #ifndef UNIT16  /* and not UNIT8 */
                    936:                msubs(31);
                    937:                msubs(30);
                    938:                msubs(29);
                    939:                msubs(28);
                    940:                msubs(27);
                    941:                msubs(26);
                    942:                msubs(25);
                    943:                msubs(24);
                    944:                msubs(23);
                    945:                msubs(22);
                    946:                msubs(21);
                    947:                msubs(20);
                    948:                msubs(19);
                    949:                msubs(18);
                    950:                msubs(17);
                    951:                msubs(16);
                    952: #endif  /* not UNIT16 and not UNIT8 */
                    953:                msubs(15);
                    954:                msubs(14);
                    955:                msubs(13);
                    956:                msubs(12);
                    957:                msubs(11);
                    958:                msubs(10);
                    959:                msubs(9);
                    960:                msubs(8);
                    961: #endif  /* not UNIT8 */
                    962:                msubs(7);
                    963:                msubs(6);
                    964:                msubs(5);
                    965: #ifndef UNIT32
                    966:                msubs(4);
                    967: #ifndef UNIT16
                    968:                msubs(3);
                    969: #endif
                    970: #endif
                    971: 
                    972:                /*      Sniff each bit in the current unit of the multiplier, 
                    973:                        and conditionally add the corresponding preshifted 
                    974:                        image of the multiplicand to the product.
                    975:                        This is also part of the multiply phase of modmult.
                    976: 
                    977:                        The following loop is unrolled for speed, using macros...
                    978: 
                    979:                for (i=UNITSIZE-1; i>=0; i--)
                    980:                   if (*multiplier & power_of_2(i))
                    981:                                mp_add(prod,mpd[i]);
                    982:                */
                    983: #ifndef UNIT8
                    984: #ifndef UNIT16 /* and not UNIT8 */
                    985:                sniffadd(31);
                    986:                sniffadd(30);
                    987:                sniffadd(29);
                    988:                sniffadd(28);
                    989:                sniffadd(27);
                    990:                sniffadd(26);
                    991:                sniffadd(25);
                    992:                sniffadd(24);
                    993:                sniffadd(23);
                    994:                sniffadd(22);
                    995:                sniffadd(21);
                    996:                sniffadd(20);
                    997:                sniffadd(19);
                    998:                sniffadd(18);
                    999:                sniffadd(17);
                   1000:                sniffadd(16);
                   1001: #endif /* not UNIT16 and not UNIT8 */
                   1002:                sniffadd(15);
                   1003:                sniffadd(14);
                   1004:                sniffadd(13);
                   1005:                sniffadd(12);
                   1006:                sniffadd(11);
                   1007:                sniffadd(10);
                   1008:                sniffadd(9);
                   1009:                sniffadd(8);
                   1010: #endif /* not UNIT8 */
                   1011:                sniffadd(7);
                   1012:                sniffadd(6);
                   1013:                sniffadd(5);
                   1014:                sniffadd(4);
                   1015:                sniffadd(3);
                   1016:                sniffadd(2);
                   1017:                sniffadd(1);
                   1018:                sniffadd(0);
                   1019: 
                   1020:                /*      The product may have grown by as many as LOG_UNITSIZE+1
                   1021:                        bits.
                   1022:                        Now reduce the product back down by conditionally 
                   1023:                        subtracting LOG_UNITSIZE+1 preshifted images of the 
                   1024:                        modulus.  This is the modulo reduction phase of modmult.
                   1025: 
                   1026:                        The following loop is unrolled for speed, using macros...
                   1027: 
                   1028:                for (i=LOG_UNITSIZE; i>=0; i--)
                   1029:                        if (mp_compare(prod,moduli[i]) >= 0) 
                   1030:                                mp_sub(prod,moduli[i]); 
                   1031:                */
                   1032: 
                   1033: #ifndef UNIT8
                   1034: #ifndef UNIT16
                   1035:                msubs(5); 
                   1036: #endif
                   1037:                msubs(4);
                   1038: #endif
                   1039:                msubs(3); 
                   1040:                msubs(2); 
                   1041:                msubs(1); 
                   1042:                msubs(0);
                   1043: 
                   1044:                /* Bump pointer to next lower unit of multiplier: */
                   1045:                post_lowerunit(multiplier);
                   1046: 
                   1047:        }       /* Loop for the number of units in the multiplier */
                   1048: 
                   1049:        return(0);      /* normal return */
                   1050: 
                   1051: }      /* merritt_modmult */
                   1052: 
                   1053: 
                   1054: #undef msubs
                   1055: #undef sniffadd
                   1056: 
                   1057: 
                   1058: /*     Merritt's mp_modmult function leaves some internal tables in memory,
                   1059:        so we have to call modmult_burn() at the end of mp_modexp.
                   1060:        This is so that no cryptographically sensitive data is left in memory
                   1061:        after the program exits.
                   1062: */
                   1063: void merritt_burn(void)
                   1064: /*     Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
                   1065: {      unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
                   1066:        unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
                   1067:        unitfill0(msu_moduli,UNITSIZE);
                   1068:        unitfill0(nmsu_moduli,UNITSIZE);
                   1069: } /* merritt_burn() */
                   1070: 
                   1071: /******* end of Merritt's MODMULT stuff. *******/
                   1072: /*=========================================================================*/
                   1073: #endif /* MERRITT */
                   1074: 
                   1075: 
                   1076: #ifdef UPTON_OR_SMITH  /* Used by Upton's and Smith's modmult algorithms */
                   1077: 
                   1078: /*     Jimmy Upton's multiprecision modmult algorithm in C.
                   1079:        Performs a multiply combined with a modulo operation.
                   1080: 
                   1081:        The following support functions and data structures
                   1082:        are used only by Upton's modmult algorithm...
                   1083: */
                   1084: 
                   1085: short munit_prec;      /* global_precision expressed in MULTUNITs */
                   1086: 
                   1087: /*     Note that since the SPARC CPU has no hardware integer multiply
                   1088:        instruction, there is not that much advantage in having an
                   1089:        assembly version of mp_smul on that machine.  It might be faster
                   1090:        to use Merritt's modmult instead of Upton's modmult on the SPARC.
                   1091: */
                   1092: 
                   1093: /*
                   1094:        Multiply the single-word multiplier times the multiprecision integer
                   1095:        in multiplicand, accumulating result in prod.  The resulting
                   1096:        multiprecision prod will be 1 word longer than the multiplicand.
                   1097:        multiplicand is munit_prec words long.  We add into prod, so caller
                   1098:        should zero it out first.  For best results, this time-critical
                   1099:        function should be implemented in assembly.
                   1100:        NOTE:  Unlike other functions in the multiprecision arithmetic
                   1101:        library, both multiplicand and prod are pointing at the LSB,
                   1102:        regardless of byte order of the machine.  On an 80x86, this makes
                   1103:        no difference.  But if this assembly function is implemented
                   1104:        on a 680x0, it becomes important.
                   1105: */
                   1106: /*     Note that this has been modified from the previous version to allow
                   1107:        better support for Smith's modmult:
                   1108:                The final carry bit is added to the existing product
                   1109:                array, rather than simply stored.
                   1110: */
                   1111: 
                   1112: #ifndef mp_smul
                   1113: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
                   1114: {
                   1115:        short i;
                   1116:        unsigned long p, carry;
                   1117: 
                   1118:        carry = 0;
                   1119:        for (i=0; i<munit_prec; ++i)
                   1120:        {       p = (unsigned long)multiplier * *post_higherunit(multiplicand);
                   1121:                p += *prod + carry;
                   1122:                *post_higherunit(prod) = (MULTUNIT)p;
                   1123:                carry = p >> MULTUNITSIZE;
                   1124:        }
                   1125:        /* Add carry to the next higher word of product / dividend */
                   1126:        *prod += (MULTUNIT)carry;
                   1127: }      /* mp_smul */
                   1128: #endif
                   1129: 
                   1130: /*     mp_dmul is a double-precision multiply multiplicand times multiplier,
                   1131:        result into prod.  prod must be pointing at a "double precision"
                   1132:        buffer.  E.g. If global_precision is 10 words, prod must be
                   1133:        pointing at a 20-word buffer.
                   1134: */
                   1135: #ifndef mp_dmul
                   1136: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1137: {
                   1138:        register        int i;
                   1139:        register        MULTUNIT *p_multiplicand, *p_multiplier;
                   1140:        register        MULTUNIT *prodp;
                   1141: 
                   1142: 
                   1143:        unitfill0(prod,global_precision*2);     /* Pre-zero prod */
                   1144:        /* Calculate precision in units of MULTUNIT */
                   1145:        munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
                   1146:        p_multiplicand = (MULTUNIT *)multiplicand;
                   1147:        p_multiplier = (MULTUNIT *)multiplier;
                   1148:        prodp = (MULTUNIT *)prod;
                   1149:        make_lsbptr(p_multiplicand,munit_prec);
                   1150:        make_lsbptr(p_multiplier,munit_prec);
                   1151:        make_lsbptr(prodp,munit_prec*2);
                   1152:        /* Multiply multiplicand by each word in multiplier, accumulating prod: */
                   1153:        for (i=0; i<munit_prec; ++i)
                   1154:                mp_smul (post_higherunit(prodp),
                   1155:                        p_multiplicand, *post_higherunit(p_multiplier));
                   1156: }      /* mp_dmul */
                   1157: #endif  /* mp_dmul */
                   1158: 
                   1159: static unit ALIGN modulus[MAX_UNIT_PRECISION];
                   1160: static short nbits;                    /* number of modulus significant bits */
                   1161: #endif /* UPTON_OR_SMITH */
                   1162: 
                   1163: #ifdef UPTON
                   1164: 
                   1165: /*     These scratchpad arrays are used only by upton_modmult (mp_modmult).
                   1166:        Some of them could be staticly declared inside of mp_modmult, but we
                   1167:        put them outside mp_modmult so that they can be wiped clean by
                   1168:        modmult_burn(), which is called at the end of mp_modexp.  This is
                   1169:        so that no sensitive data is left in memory after the program exits.
                   1170: */
                   1171: 
                   1172: static unit ALIGN reciprocal[MAX_UNIT_PRECISION];
                   1173: static unit ALIGN dhi[MAX_UNIT_PRECISION];
                   1174: static unit ALIGN d_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
                   1175: static unit ALIGN e_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
                   1176: static unit ALIGN f_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
                   1177: 
                   1178: static short nbitsDivUNITSIZE;
                   1179: static short nbitsModUNITSIZE;
                   1180: 
                   1181: /*     stage_upton_modulus() is aliased to stage_modulus().
                   1182:        Prepare for an Upton modmult.  Calculate the reciprocal of modulus,
                   1183:        and save both.  Note that reciprocal will have one more bit than
                   1184:        modulus.
                   1185:        Assumes that global_precision has already been adjusted to the
                   1186:        size of the modulus, plus SLOP_BITS.
                   1187: */
                   1188: int stage_upton_modulus(unitptr n)
                   1189: {
                   1190:        mp_move(modulus, n);
                   1191:        mp_recip(reciprocal, modulus);
                   1192:        nbits = countbits(modulus);
                   1193:        nbitsDivUNITSIZE = nbits / UNITSIZE;
                   1194:        nbitsModUNITSIZE = nbits % UNITSIZE;
                   1195:        return(0);      /* normal return */
                   1196: }      /* stage_upton_modulus */
                   1197: 
                   1198: 
                   1199: /*     Upton's algorithm performs a multiply combined with a modulo operation.
                   1200:        Computes:  prod = (multiplicand*multiplier) mod modulus
                   1201:        WARNING: All the arguments must be less than the modulus!
                   1202:        References global unitptr modulus and reciprocal.
                   1203:        The reciprocal of modulus is 1 bit longer than the modulus.
                   1204:        upton_modmult() is aliased to mp_modmult().
                   1205: */
                   1206: int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1207: {
                   1208:        unitptr d = d_data;
                   1209:        unitptr d1 = d_data;
                   1210:        unitptr e = e_data;
                   1211:        unitptr f = f_data;
                   1212:        short orig_precision;
                   1213: 
                   1214:        orig_precision = global_precision;
                   1215:        mp_dmul (d,multiplicand,multiplier);
                   1216: 
                   1217:        /* Throw off low nbits of d */
                   1218: #ifndef HIGHFIRST
                   1219:        d1 = d + nbitsDivUNITSIZE;
                   1220: #else
                   1221:        d1 = d + orig_precision - nbitsDivUNITSIZE;
                   1222: #endif
                   1223:        mp_move(dhi, d1);       /* Don't screw up d, we need it later */
                   1224:        mp_shift_right_bits(dhi, nbitsModUNITSIZE);
                   1225: 
                   1226:        mp_dmul (e,dhi,reciprocal);     /* Note - reciprocal has nbits+1 bits */
                   1227: 
                   1228: #ifndef HIGHFIRST
                   1229:        e += nbitsDivUNITSIZE;
                   1230: #else
                   1231:        e += orig_precision - nbitsDivUNITSIZE;
                   1232: #endif
                   1233:        mp_shift_right_bits(e, nbitsModUNITSIZE);
                   1234: 
                   1235:        mp_dmul (f,e,modulus);
                   1236: 
                   1237:        /* Now for the only double-precision call to mpilib: */
                   1238:        set_precision (orig_precision * 2);
                   1239:        mp_sub (d,f);
                   1240: 
                   1241:        /* d's precision should be <= orig_precision */
                   1242:        rescale (d, orig_precision*2, orig_precision);
                   1243:        set_precision (orig_precision);
                   1244: 
                   1245:        /* Should never have to do this final subtract more than twice: */
                   1246:        while (mp_compare(d,modulus) > 0)
                   1247:                mp_sub (d,modulus);
                   1248: 
                   1249:        mp_move(prod,d);
                   1250:        return(0);  /* normal return */
                   1251: }      /* upton_modmult */
                   1252: 
                   1253: 
                   1254: /*     Upton's mp_modmult function leaves some internal arrays in memory,
                   1255:        so we have to call modmult_burn() at the end of mp_modexp.
                   1256:        This is so that no cryptographically sensitive data is left in memory
                   1257:        after the program exits.
                   1258:        upton_burn() is aliased to modmult_burn().
                   1259: */
                   1260: void upton_burn(void)
                   1261: {
                   1262:        unitfill0(modulus,MAX_UNIT_PRECISION);
                   1263:        unitfill0(reciprocal,MAX_UNIT_PRECISION);
                   1264:        unitfill0(dhi,MAX_UNIT_PRECISION);
                   1265:        unitfill0(d_data,MAX_UNIT_PRECISION*2);
                   1266:        unitfill0(e_data,MAX_UNIT_PRECISION*2);
                   1267:        unitfill0(f_data,MAX_UNIT_PRECISION*2);
                   1268:        nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
                   1269: }      /* upton_burn */
                   1270: 
                   1271: /******* end of Upton's MODMULT stuff. *******/
                   1272: /*=========================================================================*/
                   1273: #endif  /* UPTON */
                   1274: 
                   1275: #ifdef SMITH   /* using Thad Smith's modmult algorithm */
                   1276: 
                   1277: /*     Thad Smith's implementation of multiprecision modmult algorithm in C.
                   1278:        Performs a multiply combined with a modulo operation.
                   1279:        The multiplication is done with mp_dmul, the same as for Upton's
                   1280:        modmult.  The modulus reduction is done by long division, in
                   1281:        which a trial quotient "digit" is determined, then the product of
                   1282:        that digit and the divisor are subtracted from the dividend.
                   1283: 
                   1284:        In this case, the digit is MULTUNIT in size and the subtraction
                   1285:        is done by adding the product to the one's complement of the
                   1286:        dividend, which allows use of the existing mp_smul routine.
                   1287: 
                   1288:        The following support functions and data structures
                   1289:        are used only by Smith's modmult algorithm...
                   1290: */
                   1291: 
                   1292: /*     These scratchpad arrays are used only by smith_modmult (mp_modmult).
                   1293:        Some of them could be statically declared inside of mp_modmult, but we
                   1294:        put them outside mp_modmult so that they can be wiped clean by
                   1295:        modmult_burn(), which is called at the end of mp_modexp.  This is
                   1296:        so that no sensitive data is left in memory after the program exits.
                   1297: */
                   1298: 
                   1299: static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2];
                   1300: 
                   1301: static unit mod_quotient  [4];
                   1302: static unit mod_divisor   [4]; /* 2 most signif. MULTUNITs of modulus */
                   1303: 
                   1304: static MULTUNIT *modmpl;               /* ptr to modulus least significant
                   1305:                                        ** MULTUNIT                         */
                   1306: static int  mshift;                    /* number of bits for
                   1307:                                        ** recip scaling          */
                   1308: static MULTUNIT reciph;                /* MSunit of scaled recip */
                   1309: static MULTUNIT recipl;                /* LSunit of scaled recip */
                   1310: 
                   1311: static short modlenMULTUNITS;          /* length of modulus in MULTUNITs */
                   1312: static MULTUNIT mutemp;                /* temporary */
                   1313: 
                   1314: /*     The routines mp_smul and mp_dmul are the same as for UPTON and
                   1315:        should be coded in assembly.  Note, however, that the previous
                   1316:        Upton's mp_smul version has been modified to compatible with
                   1317:        Smith's modmult.  The new version also still works for Upton's
                   1318:        modmult.
                   1319: */
                   1320: 
                   1321: #ifndef mp_set_recip
                   1322: #define mp_set_recip(rh,rl,m)     /* null */
                   1323: #else
                   1324: /* setup routine for external mp_quo_digit */
                   1325: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m);
                   1326: #endif
                   1327: MULTUNIT mp_quo_digit (MULTUNIT *dividend);
                   1328: 
                   1329: #ifdef MULTUNIT_SIZE_SAME
                   1330: #define mp_musubb mp_subb      /* use existing routine */
                   1331: #else  /* ! MULTUNIT_SIZE_SAME */
                   1332: 
                   1333: /*     This performs the same function as mp_subb, but with MULTUNITs.
                   1334:        Note: Processors without alignment requirements may be able to use
                   1335:        mp_subb, even though MULTUNITs are smaller than units.  In that case,
                   1336:        use mp_subb, since it would be faster if coded in assembly.  Note that
                   1337:        this implementation won't work for MULTUNITs which are long -- use
                   1338:        mp_subb in that case.
                   1339: */
                   1340: #ifndef mp_musubb
                   1341: boolean mp_musubb
                   1342:        (register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow)
                   1343:        /* multiprecision subtract of MULTUNITs with borrow, r2 from r1,
                   1344:        ** result in r1 */
                   1345:        /* borrow is incoming borrow flag-- value should be 0 or 1 */
                   1346: {      register ulint x;       /* won't work if sizeof(MULTUNIT)==
                   1347:                                                 sizeof(long)       */
                   1348:        short precision;        /* number of MULTUNITs to subtract */
                   1349:        precision = global_precision * MULTUNITs_perunit;
                   1350:        make_lsbptr(r1,precision);
                   1351:        make_lsbptr(r2,precision);
                   1352:        while (precision--)
                   1353:        {       x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
                   1354:                *post_higherunit(r1) = x;
                   1355:                borrow = (((1L << MULTUNITSIZE) & x) != 0L);
                   1356:        }
                   1357:        return (borrow);
                   1358: }      /* mp_musubb */
                   1359: #endif /* mp_musubb */
                   1360: #endif /* MULTUNIT_SIZE_SAME */
                   1361: 
                   1362: /*     The function mp_quo_digit is the heart of Smith's modulo reduction,
                   1363:        which uses a form of long division.  It computes a trial quotient
                   1364:        "digit" (MULTUNIT-sized digit) by multiplying the three most
                   1365:        significant MULTUNITs of the dividend by the two most significant
                   1366:        MULTUNITs of the reciprocal of the modulus.  Note that this function
                   1367:        requires that MULTUNITSIZE * 2 <= sizeof(unsigned long).
                   1368: 
                   1369:        An important part of this technique is that the quotient never be
                   1370:        too small, although it may occasionally be too large.  This was
                   1371:        done to eliminate the need to check and correct for a remainder
                   1372:        exceeding the divisor.  It is easier to check for a negative
                   1373:        remainder.  The following technique rarely needs correction for
                   1374:        MULTUNITs of at least 16 bits.
                   1375: 
                   1376:        The following routine has two implementations:
                   1377: 
                   1378:        ASM_PROTOTYPE defined: written to be an executable prototype for
                   1379:        an efficient assembly language implementation.  Note that several
                   1380:        of the following masks and shifts can be done by directly
                   1381:        manipulating single precision registers on some architectures.
                   1382: 
                   1383:        ASM_PROTOTYPE undefined: a slightly more efficient implementation
                   1384:        in C.  Although this version returns a result larger than the
                   1385:        optimum (which is corrected in smith_modmult) more often than the
                   1386:        prototype version, the faster execution in C more than makes up
                   1387:        for the difference.
                   1388:  
                   1389:        Parameter: dividend - points to the most significant MULTUNIT
                   1390:                of the dividend.  Note that dividend actually contains the 
                   1391:                one's complement of the actual dividend value (see comments for 
                   1392:                smith_modmult).
                   1393:        
                   1394:         Return: the trial quotient digit resulting from dividing the first
                   1395:                three MULTUNITs at dividend by the upper two MULTUNITs of the 
                   1396:                modulus.
                   1397: */
                   1398: 
                   1399: /* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */
                   1400: 
                   1401: #ifndef mp_quo_digit
                   1402: MULTUNIT mp_quo_digit (MULTUNIT *dividend) {
                   1403:        unsigned long q, q0, q1, q2;
                   1404:        unsigned short lsb_factor;
                   1405: 
                   1406: /*     Compute the least significant product group.
                   1407:        The last terms of q1 and q2 perform upward rounding, which is
                   1408:        needed to guarantee that the result not be too small.
                   1409: */
                   1410:        q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph
                   1411:             + reciph;
                   1412:        q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl
                   1413:             + (1L << MULTUNITSIZE) ;
                   1414: #ifdef ASM_PROTOTYPE
                   1415:        lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE);
                   1416:        q = q1 + q2;
                   1417: 
                   1418:        /* The following statement is equivalent to shifting the sum right
                   1419:           one bit while shifting in the carry bit.
                   1420:        */
                   1421:        q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
                   1422: #else  /* optimized C version */
                   1423:        q0 = (q1>>1) + (q2>>1) + 1;
                   1424: #endif
                   1425: 
                   1426: /*     Compute the middle significant product group.   */
                   1427: 
                   1428:        q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph;
                   1429:        q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl;
                   1430: #ifdef ASM_PROTOTYPE
                   1431:        q = q1 + q2;
                   1432:        q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
                   1433: 
                   1434: /*     Add in the most significant word of the first group.
                   1435:        The last term takes care of the carry from adding the lsb's
                   1436:        that were shifted out prior to addition.
                   1437: */
                   1438:        q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2));
                   1439: #else  /* optimized C version */
                   1440:        q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1;
                   1441: #endif
                   1442: 
                   1443: /*     Compute the most significant term and add in the others */
                   1444: 
                   1445:        q = (q >> (MULTUNITSIZE-2)) +
                   1446:            (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1);
                   1447:        q >>= mshift;
                   1448: 
                   1449: /*     Prevent overflow and then wipe out the intermediate results. */
                   1450: 
                   1451:        mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1);
                   1452:        q= q0 = q1 = q2 = 0;   lsb_factor = 0; (void)lsb_factor;
                   1453:        return mutemp;
                   1454: }
                   1455: #endif /* mp_quo_digit */
                   1456: 
                   1457: /*     stage_smith_modulus() - Prepare for a Smith modmult.
                   1458: 
                   1459:        Calculate the reciprocal of modulus with a precision of two MULTUNITs.
                   1460:        Assumes that global_precision has already been adjusted to the
                   1461:        size of the modulus, plus SLOP_BITS.
                   1462: 
                   1463:        Note: This routine was designed to work with large values and
                   1464:        doesn't have the necessary testing or handling to work with a
                   1465:        modulus having less than three significant units.  For such cases,
                   1466:        the separate multiply and modulus routines can be used.
                   1467: 
                   1468:        stage_smith_modulus() is aliased to stage_modulus().
                   1469: */
                   1470: 
                   1471: int stage_smith_modulus(unitptr n_modulus)
                   1472: {
                   1473:        int   original_precision;
                   1474:        int   sigmod;           /* significant units in modulus */
                   1475:        unitptr  mp;            /* modulus most significant pointer */
                   1476:        MULTUNIT *mpm;          /* reciprocal pointer */
                   1477:        int   prec;             /* precision of reciprocal calc in units */
                   1478: 
                   1479:        mp_move(modulus, n_modulus);
                   1480:        modmpl = (MULTUNIT*) modulus;
                   1481:        modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit);
                   1482:        nbits = countbits(modulus);
                   1483:        modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE;
                   1484: 
                   1485:        original_precision = global_precision;
                   1486: 
                   1487:        /* The following code copies the three most significant units of
                   1488:         * modulus to mod_divisor.
                   1489:        */
                   1490:        mp = modulus;
                   1491:        sigmod = significance (modulus);
                   1492:        rescale (mp, original_precision, sigmod);
                   1493: /* prec is the unit precision required for 3 MULTUNITs */
                   1494:        prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit;
                   1495:        set_precision (prec);
                   1496:  
                   1497:        /* set mp = ptr to most significant units of modulus, then move
                   1498:         * the most significant units to mp_divisor 
                   1499:        */
                   1500:        mp = msbptr(mp,sigmod) -tohigher(prec-1);
                   1501:        unmake_lsbptr (mp, prec);
                   1502:        mp_move (mod_divisor, mp);
                   1503:  
                   1504:        /* Keep 2*MULTUNITSIZE bits in mod_divisor.
                   1505:         * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits.
                   1506:        */
                   1507:        mshift = countbits (mod_divisor) - 2*MULTUNITSIZE;
                   1508:        mp_shift_right_bits (mod_divisor, mshift);
                   1509:        mp_recip(mod_quotient,mod_divisor);
                   1510:        mp_shift_right_bits (mod_quotient,1);
                   1511: 
                   1512:        /* Reduce to:   0 < mshift <= MULTUNITSIZE */
                   1513:        mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1;
                   1514:        /* round up, rescaling if necessary */
                   1515:        mp_inc (mod_quotient);
                   1516:        if (countbits (mod_quotient) > 2*MULTUNITSIZE) {
                   1517:                mp_shift_right_bits (mod_quotient,1);
                   1518:                mshift--;       /* now  0 <= mshift <= MULTUNITSIZE */
                   1519:        }
                   1520:        mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit);
                   1521:        recipl = *post_higherunit (mpm);
                   1522:        reciph = *mpm;
                   1523:        mp_set_recip (reciph, recipl, mshift);
                   1524:        set_precision (original_precision);
                   1525:        return(0);      /* normal return */
                   1526: }      /* stage_smith_modulus */
                   1527: 
                   1528: /*     Smith's algorithm performs a multiply combined with a modulo operation.
                   1529:        Computes:  prod = (multiplicand*multiplier) mod modulus
                   1530:        The modulus must first be set by stage_smith_modulus().
                   1531:        smith_modmult() is aliased to mp_modmult().
                   1532: */
                   1533: 
                   1534: int
                   1535: smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1536: {
                   1537:        unitptr    d;   /* ptr to product */ 
                   1538:        MULTUNIT   *dmph, *dmpl, *dmp;  /* ptrs to dividend (high, low, first)
                   1539:                                         * aligned for subtraction           */
                   1540: /*     Note that dmph points one MULTUNIT higher than indicated by
                   1541:        global precision.  This allows us to zero out a word one higher than
                   1542:        the normal precision.
                   1543: */
                   1544:        short       orig_precision;
                   1545:        short       nqd;        /* number of quotient digits remaining to
                   1546:                                 * be generated                              */
                   1547:        short       dmi;        /* number of significant MULTUNITs in product */
                   1548: 
                   1549:        d = ds_data + 1;        /* room for leading MSB if HIGHFIRST */
                   1550:        orig_precision = global_precision;
                   1551:        mp_dmul(d, multiplicand, multiplier);
                   1552: 
                   1553:        rescale(d, orig_precision * 2, orig_precision * 2 + 1);
                   1554:        set_precision(orig_precision * 2 + 1);
                   1555:        *msbptr(d, global_precision) = 0;       /* leading 0 unit */
                   1556: 
                   1557: /*     We now start working with MULTUNITs.
                   1558:        Determine the most significant MULTUNIT of the product so we don't
                   1559:        have to process leading zeros in our divide loop.
                   1560: */
                   1561:        dmi = significance(d) * MULTUNITs_perunit;
                   1562:        if (dmi >= modlenMULTUNITS)
                   1563:        {       /* Make dividend negative.  This allows the use of mp_smul to
                   1564:                 * "subtract" the product of the modulus and the trial divisor
                   1565:                 * by actually adding to a negative dividend.
                   1566:                 * The one's complement of the dividend is used, since it causes
                   1567:                 * a zero value to be represented as all ones.  This facilitates
                   1568:                 * testing the result for possible overflow, since a sign bit
                   1569:                 * indicates that no adjustment is necessary, and we should not
                   1570:                 * attempt to adjust if the result of the addition is zero.
                   1571:                 */
                   1572:                mp_inc(d);
                   1573:                mp_neg(d);
                   1574:                set_precision(orig_precision);
                   1575:                munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
                   1576: 
                   1577:                /* Set msb, lsb, and normal ptrs of dividend */
                   1578:                dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) *
                   1579:                            MULTUNITs_perunit) + tohigher(dmi);
                   1580:                nqd = dmi + 1 - modlenMULTUNITS;
                   1581:                dmpl = dmph - tohigher(modlenMULTUNITS);
                   1582: 
                   1583: /*     Divide loop.
                   1584:        Each iteration computes the next quotient MULTUNIT digit, then
                   1585:        multiplies the divisor (modulus) by the quotient digit and adds
                   1586:        it to the one's complement of the dividend (equivalent to
                   1587:        subtracting).  If the product was greater than the remaining dividend,
                   1588:        we get a non-negative result, in which case we subtract off the
                   1589:        modulus to get the proper negative remainder.
                   1590: */
                   1591:                for (; nqd; nqd--)
                   1592:                {       MULTUNIT    q;      /* quotient trial digit */
                   1593: 
                   1594:                        q = mp_quo_digit(dmph);
                   1595:                        if (q > 0)
                   1596:                        {       mp_smul(dmpl, modmpl, q);
                   1597: 
                   1598:                                /* Perform correction if q too large.
                   1599:                                *  This rarely occurs.
                   1600:                                */
                   1601:                                if (!(*dmph & MULTUNIT_msb))
                   1602:                                {       dmp = dmpl;
                   1603:                                        unmake_lsbptr(dmp, orig_precision * 
                   1604:                                                   MULTUNITs_perunit);
                   1605:                                        if (mp_musubb(dmp,
                   1606:                                                   (MULTUNIT *) modulus, 0))
                   1607:                                                (*dmph)  --;
                   1608:                                }
                   1609:                        }
                   1610:                        pre_lowerunit(dmph);
                   1611:                        pre_lowerunit(dmpl);
                   1612:                }
                   1613:                /* d contains the one's complement of the remainder. */
                   1614:                rescale(d, orig_precision * 2 + 1, orig_precision);
                   1615:                set_precision(orig_precision);
                   1616:                mp_neg(d);
                   1617:                mp_dec(d);
                   1618:        } else
                   1619:        {       /* Product was less than modulus.  Return it. */
                   1620:                rescale(d, orig_precision * 2 + 1, orig_precision);
                   1621:                set_precision(orig_precision);
                   1622:        }
                   1623:        mp_move(prod, d);
                   1624:        return (0);             /* normal return */
                   1625: }                              /* smith_modmult */
                   1626: 
                   1627: 
                   1628: /*     Smith's mp_modmult function leaves some internal arrays in memory,
                   1629:        so we have to call modmult_burn() at the end of mp_modexp.
                   1630:        This is so that no cryptographically sensitive data is left in memory
                   1631:        after the program exits.
                   1632:        smith_burn() is aliased to modmult_burn().
                   1633: */
                   1634: void smith_burn(void)
                   1635: {
                   1636:        empty_array (modulus);
                   1637:        empty_array (ds_data);
                   1638:        empty_array (mod_quotient);
                   1639:        empty_array (mod_divisor);
                   1640:        modmpl = 0;
                   1641:        mshift =nbits = 0;
                   1642:        reciph = recipl = 0;
                   1643:        modlenMULTUNITS = mutemp = 0;
                   1644:        mp_set_recip (0,0,0);
                   1645: }      /* smith_burn */
                   1646: 
                   1647: /*     End of Thad Smith's implementation of modmult. */
                   1648: 
                   1649: #endif /* SMITH */
                   1650: 
                   1651: 
                   1652: int countbits(unitptr r)
                   1653:        /* Returns number of significant bits in r */
                   1654: {      int bits;
                   1655:        short prec;
                   1656:        register unit bitmask;
                   1657:        init_bitsniffer(r,bitmask,prec,bits);
                   1658:        return(bits);
                   1659: } /* countbits */
                   1660: 
                   1661: 
                   1662: char *copyright_notice(void)
                   1663: /* force linker to include copyright notice in the executable object image. */
                   1664: { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */
                   1665: 
                   1666: 
                   1667: int mp_modexp(register unitptr expout,register unitptr expin,
                   1668:        register unitptr exponent,register unitptr modulus)
                   1669: {      /*      Russian peasant combined exponentiation/modulo algorithm.
                   1670:                Calls modmult instead of mult.
                   1671:                Computes:  expout = (expin**exponent) mod modulus
                   1672:                WARNING: All the arguments must be less than the modulus!
                   1673:        */
                   1674:        int bits;
                   1675:        short oldprecision;
                   1676:        register unit bitmask;
                   1677:        unit product[MAX_UNIT_PRECISION];
                   1678:        short eprec;
                   1679: 
                   1680: #ifdef COUNTMULTS
                   1681:        tally_modmults = 0;     /* clear "number of modmults" counter */
                   1682:        tally_modsquares = 0;   /* clear "number of modsquares" counter */
                   1683: #endif /* COUNTMULTS */
                   1684:        mp_init(expout,1);
                   1685:        if (testeq(exponent,0))
                   1686:        {       if (testeq(expin,0))
                   1687:                        return(-1); /* 0 to the 0th power means return error */
                   1688:                return(0);      /* otherwise, zero exponent means expout is 1 */
                   1689:        }
                   1690:        if (testeq(modulus,0))
                   1691:                return(-2);             /* zero modulus means error */
                   1692: #if SLOP_BITS > 0      /* if there's room for sign bits */
                   1693:        if (mp_tstminus(modulus))
                   1694:                return(-2);             /* negative modulus means error */
                   1695: #endif /* SLOP_BITS > 0 */
                   1696:        if (mp_compare(expin,modulus) >= 0)
                   1697:                return(-3); /* if expin >= modulus, return error */
                   1698:        if (mp_compare(exponent,modulus) >= 0)
                   1699:                return(-4); /* if exponent >= modulus, return error */
                   1700: 
                   1701:        oldprecision = global_precision;        /* save global_precision */
                   1702:        /* set smallest optimum precision for this modulus */
                   1703:        set_precision(bits2units(countbits(modulus)+SLOP_BITS));
                   1704:        /* rescale all these registers to global_precision we just defined */
                   1705:        rescale(modulus,oldprecision,global_precision);
                   1706:        rescale(expin,oldprecision,global_precision);
                   1707:        rescale(exponent,oldprecision,global_precision);
                   1708:        rescale(expout,oldprecision,global_precision);
                   1709: 
                   1710:        if (stage_modulus(modulus))
                   1711:        {       set_precision(oldprecision);    /* restore original precision */
                   1712:                return(-5);             /* unstageable modulus (STEWART algorithm) */
                   1713:        }
                   1714: 
                   1715:        /* normalize and compute number of bits in exponent first */
                   1716:        init_bitsniffer(exponent,bitmask,eprec,bits);
                   1717: 
                   1718:        /* We can "optimize out" the first modsquare and modmult: */
                   1719:        bits--;         /* We know for sure at this point that bits>0 */
                   1720:        mp_move(expout,expin);          /*  expout = (1*1)*expin; */
                   1721:        bump_bitsniffer(exponent,bitmask);
                   1722: 
                   1723:        while (bits--)
                   1724:        {
                   1725:                poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
                   1726: #ifdef COUNTMULTS
                   1727:                tally_modsquares++;     /* bump "number of modsquares" counter */
                   1728: #endif /* COUNTMULTS */
                   1729:                mp_modsquare(product,expout);
                   1730:                if (sniff_bit(exponent,bitmask))
                   1731:                {       mp_modmult(expout,product,expin);
                   1732: #ifdef COUNTMULTS
                   1733:                        tally_modmults++;       /* bump "number of modmults" counter */
                   1734: #endif /* COUNTMULTS */
                   1735:                } else
                   1736:                {
                   1737:                        mp_move(expout,product);
                   1738:                }
                   1739:                bump_bitsniffer(exponent,bitmask);
                   1740:        }       /* while bits-- */
                   1741:        mp_burn(product);       /* burn the evidence on the stack */
                   1742:        modmult_burn(); /* ask mp_modmult to also burn its own evidence */
                   1743: 
                   1744: #ifdef COUNTMULTS      /* diagnostic analysis */
                   1745:        {       long atomic_mults;
                   1746:                unsigned int unitcount,totalmults;
                   1747:                unitcount = bits2units(countbits(modulus));
                   1748:                /* calculation assumes modsquare takes as long as a modmult: */
                   1749:                atomic_mults = (long) tally_modmults * (unitcount * unitcount);
                   1750:                atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
                   1751:                printf("%ld atomic mults for ",atomic_mults);
                   1752:                printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
                   1753:                        tally_modsquares,tally_modmults,
                   1754:                        tally_modsquares+tally_modmults,
                   1755:                        countbits(modulus), unitcount);
                   1756:        }
                   1757: #endif /* COUNTMULTS */
                   1758: 
                   1759:        set_precision(oldprecision);    /* restore original precision */
                   1760: 
                   1761:        /* Do an explicit reference to the copyright notice so that the linker
                   1762:           will be forced to include it in the executable object image... */
                   1763:        copyright_notice();     /* has no real effect at run time */
                   1764: 
                   1765:        return(0);              /* normal return */
                   1766: }      /* mp_modexp */
                   1767: 
                   1768: 
                   1769: 
                   1770: 
                   1771: /*********************************************************************
                   1772:        RSA-specific routines follow.  These are the only functions that
                   1773:        are specific to the RSA public key cryptosystem.  The other
                   1774:        multiprecision integer math functions may be used for non-RSA
                   1775:        applications.
                   1776: 
                   1777:        The RSA public key cryptosystem is patented by the Massachusetts
                   1778:        Institute of Technology (U.S. patent #4,405,829).  This patent does
                   1779:        not apply outside the USA.  Public Key Partners (PKP) holds the
                   1780:        exclusive commercial license to sell and sub-license the RSA public
                   1781:        key cryptosystem.  The author of this software implementation of the
                   1782:        RSA algorithm is providing this software for educational use only.
                   1783:        Licensing this algorithm from PKP is the responsibility of you, the
                   1784:        user, not Philip Zimmermann, the author of this software.  The author
                   1785:        assumes no liability for any breach of patent law resulting from the
                   1786:        unlicensed use of this software by the user.
                   1787: *********************************************************************/
                   1788: 
                   1789: 
                   1790: int rsa_decrypt(unitptr M, unitptr C,
                   1791:        unitptr d, unitptr p, unitptr q, unitptr u)
                   1792:        /*      Uses Quisquater's Chinese Remainder Theorem shortcut
                   1793:                for RSA decryption.
                   1794:                M is the output plaintext message.
                   1795:                C is the input ciphertext message.
                   1796:                d is the secret decryption exponent.
                   1797:                p and q are the secret prime factors of n.
                   1798:                u is the multiplicative inverse of p, mod q.
                   1799:                Note that u is precomputed on the assumption that p<q.
                   1800:                n, the common modulus, is not used here because of the
                   1801:                Chinese Remainder Theorem shortcut.
                   1802:        */
                   1803: {
                   1804:        unit p2[MAX_UNIT_PRECISION];
                   1805:        unit q2[MAX_UNIT_PRECISION];
                   1806:        unit temp1[MAX_UNIT_PRECISION];
                   1807:        unit temp2[MAX_UNIT_PRECISION];
                   1808:        int status;
                   1809: 
                   1810:        mp_init(M,1);   /* initialize result in case of error */
                   1811: 
                   1812:        if (mp_compare(p,q) >= 0)       /* ensure that p<q */
                   1813:        {       /* swap the pointers p and q */
                   1814:                unitptr t;
                   1815:                t = p;  p = q; q = t;
                   1816:        }
                   1817: 
                   1818: /*     Rather than decrypting by computing modexp with full mod n
                   1819:        precision, compute a shorter modexp with mod p precision... */
                   1820: 
                   1821: /*     p2 = [ (C mod p)**( d mod (p-1) ) ] mod p               */
                   1822:        mp_move(temp1,p);
                   1823:        mp_dec(temp1);                  /* temp1 = p-1 */
                   1824:        mp_mod(temp2,d,temp1);  /* temp2 = d mod (p-1) ) */
                   1825:        mp_mod(temp1,C,p);              /* temp1 = C mod p  */
                   1826:        status = mp_modexp(p2,temp1,temp2,p);
                   1827:        if (status < 0) /* mp_modexp returned an error. */
                   1828:                return(status); /* error return */
                   1829: 
                   1830: /*     Now compute a short modexp with mod q precision... */
                   1831: 
                   1832: /*     q2 = [ (C mod q)**( d mod (q-1) ) ] mod q               */
                   1833:        mp_move(temp1,q);
                   1834:        mp_dec(temp1);                  /* temp1 = q-1 */
                   1835:        mp_mod(temp2,d,temp1);  /* temp2 = d mod (q-1) */
                   1836:        mp_mod(temp1,C,q);              /* temp1 = C mod q  */
                   1837:        status = mp_modexp(q2,temp1,temp2,q);
                   1838:        if (status < 0) /* mp_modexp returned an error. */
                   1839:                return(status); /* error return */
                   1840: 
                   1841: /*     Now use the multiplicative inverse u to glue together the
                   1842:        two halves, saving a lot of time by avoiding a full mod n
                   1843:        exponentiation.  At key generation time, u was computed
                   1844:        with the secret key as the multiplicative inverse of
                   1845:        p, mod q such that:  (p*u) mod q = 1, assuming p<q.
                   1846: */
                   1847:        if (mp_compare(p2,q2) == 0)     /* only happens if C<p */
                   1848:                mp_move(M,p2);
                   1849:        else
                   1850:        {       /*      q2 = q2 - p2;  if q2<0 then q2 = q2 + q  */
                   1851:                if (mp_sub(q2,p2))      /* if the result went negative... */
                   1852:                        mp_add(q2,q);           /* add q to q2 */
                   1853: 
                   1854:                /*      M = p2 + ( p * [(q2*u) mod q] ) */
                   1855:                mp_mult(temp1,q2,u);            /* temp1 = q2*u  */
                   1856:                mp_mod(temp2,temp1,q);          /* temp2 = temp1 mod q */
                   1857:                mp_mult(temp1,p,temp2); /* temp1 = p * temp2 */
                   1858:                mp_add(temp1,p2);               /* temp1 = temp1 + p2 */
                   1859:                mp_move(M,temp1);               /* M = temp1 */
                   1860:        }
                   1861: 
                   1862:        mp_burn(p2);    /* burn the evidence on the stack...*/
                   1863:        mp_burn(q2);
                   1864:        mp_burn(temp1);
                   1865:        mp_burn(temp2);
                   1866:        return(0);      /* normal return */
                   1867: }      /* rsa_decrypt */
                   1868: 
                   1869: 
                   1870: /****************** end of MPI library ****************************/
                   1871: 

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.