pgp/src/mpilib.c - annotate

Return to mpilib.c CVS log
Up to [PGP] / pgp / src
Annotation of pgp/src/mpilib.c, revision 1.1.1.2

1.1.1.2 ! root        1: /*     C source code for multiprecision arithmetic library routines.
        !             2:        Implemented Nov 86 by Philip Zimmermann
        !             3:        Last revised 27 Nov 91 by PRZ
        !             4: 
        !             5:        Boulder Software Engineering
        !             6:        3021 Eleventh Street
        !             7:        Boulder, CO 80304
        !             8:        (303) 541-0140
        !             9: 
        !            10:        (c) Copyright 1986-92 by Philip Zimmermann.  All rights reserved.
        !            11:        The author assumes no liability for damages resulting from the use
        !            12:        of this software, even if the damage results from defects in this
        !            13:        software.  No warranty is expressed or implied.  The use of this
        !            14:        cryptographic software for developing weapon systems is expressly
        !            15:        forbidden.
        !            16: 
        !            17:        These routines implement all of the multiprecision arithmetic
        !            18:        necessary for number-theoretic cryptographic algorithms such as
        !            19:        ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
        !            20:        composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
        !            21:        key cryptography.
        !            22: 
        !            23:        Although originally developed in Microsoft C for the IBM PC, this code
        !            24:        contains few machine dependencies.  It assumes 2's complement
        !            25:        arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines,
        !            26:        lowbyte-highbyte order or highbyte-lowbyte order.  This version
        !            27:        has been converted to ANSI C.
        !            28: 
        !            29: 
        !            30:        The internal representation for these extended precision integer
        !            31:        "registers" is an array of "units".  A unit is a machine word, which
        !            32:        is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
        !            33:        unsigned integer, depending on the machine's word size.  For example,
        !            34:        an IBM PC or AT uses a unit size of 16 bits.  To perform arithmetic
        !            35:        on these huge precision integers, we pass pointers to these unit
        !            36:        arrays to various subroutines.  A pointer to an array of units is of
        !            37:        type unitptr.  This is a pointer to a huge integer "register".
        !            38: 
        !            39:        When calling a subroutine, we always pass a pointer to the BEGINNING
        !            40:        of the array of units, regardless of the byte order of the machine.
        !            41:        On a lowbyte-first machine, such as the Intel 80x86, this unitptr
        !            42:        points to the LEAST significant unit, and the array of units increases
        !            43:        significance to the right.  On a highbyte-first machine, such as the
        !            44:        Motorola 680x0, this unitptr points to the MOST significant unit, and
        !            45:        the array of units decreases significance to the right.
        !            46: 
        !            47:        Modified 8 Apr 92 - HAJK
        !            48:        Implement new VAX/VMS primitive support.
        !            49: 
        !            50:        Modified 30 Sep 92 -Castor Fu <[email protected]>
        !            51:        Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long)
        !            52: 
        !            53:        Modified 28 Nov 92 - Thad Smith
        !            54:        Added Smith modmult, generalized non-portable support.
        !            55: */
        !            56: 
        !            57: /* #define COUNTMULTS */ /* count modmults for performance studies */
        !            58: 
        !            59: #ifdef DEBUG
        !            60: #ifdef MSDOS
        !            61: #include <conio.h>
        !            62: #define poll_for_break() {while (kbhit()) getch();}
        !            63: #endif /* MSDOS */
        !            64: #endif /* DEBUG */
        !            65: 
        !            66: #ifndef poll_for_break
        !            67: #define poll_for_break()  /* stub */
        !            68: #endif
        !            69: 
        !            70: #include "mpilib.h"
        !            71: 
        !            72: #ifdef mp_smula
        !            73: #ifdef mp_smul
        !            74:        Error: Both mp_smula and mp_smul cannot be defined.
        !            75: #else
        !            76: #define mp_smul        mp_smula
        !            77: #endif
        !            78: #endif
        !            79: 
        !            80: /* set macros for MULTUNIT */
        !            81: #ifdef MUNIT8
        !            82: #define MULTUNITSIZE   8
        !            83: typedef unsigned char MULTUNIT;
        !            84: #ifdef UNIT8
        !            85: #define MULTUNIT_SIZE_SAME
        !            86: #endif
        !            87: #else  /* not MUNIT8 */
        !            88: #ifdef MUNIT32
        !            89: #define MULTUNITSIZE   32
        !            90: typedef unsigned long MULTUNIT;
        !            91: #ifdef UNIT32
        !            92: #define MULTUNIT_SIZE_SAME
        !            93: #else
        !            94: /* #error is not portable, this has the same effect */
        !            95: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
        !            96: #endif
        !            97: #else   /* assume MUNIT16 */
        !            98: #define MULTUNITSIZE   16
        !            99: typedef unsigned short MULTUNIT;
        !           100: #ifdef UNIT16
        !           101: #define MULTUNIT_SIZE_SAME
        !           102: #endif /* UNIT16 */
        !           103: #ifdef UNIT8
        !           104: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
        !           105: #endif /* UNIT8 */
        !           106: #endif /* MUNIT32 */
        !           107: #endif /* MUNIT8 */
        !           108: 
        !           109: #define MULTUNIT_msb    ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */
        !           110: #define DMULTUNIT_msb   (1L << (2*MULTUNITSIZE-1))
        !           111: #define MULTUNIT_mask   ((MULTUNIT)((1L << MULTUNITSIZE)-1))
        !           112: #define MULTUNITs_perunit   (UNITSIZE/MULTUNITSIZE)
        !           113: 
        !           114: 
        !           115: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier);
        !           116: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
        !           117: 
        !           118: short global_precision = 0; /* units of precision for all routines */
        !           119: /*     global_precision is the unit precision last set by set_precision.
        !           120:        Initially, set_precision() should be called to define global_precision
        !           121:        before using any of these other multiprecision library routines.
        !           122:        i.e.:   set_precision(MAX_UNIT_PRECISION);
        !           123: */
        !           124: 
        !           125: /*************** multiprecision library primitives ****************/
        !           126: /*     The following portable C primitives should be recoded in assembly.
        !           127:        The entry point name should be defined, in "mpilib.h" to the external
        !           128:        entry point name.  If undefined, the C version will be used.
        !           129: */
        !           130: 
        !           131: typedef unsigned long int ulint;
        !           132: 
        !           133: #ifndef mp_addc
        !           134: boolean mp_addc
        !           135:        (register unitptr r1,register unitptr r2,register boolean carry)
        !           136:        /* multiprecision add with carry r2 to r1, result in r1 */
        !           137:        /* carry is incoming carry flag-- value should be 0 or 1 */
        !           138: {      register unit x;
        !           139:        short precision;        /* number of units to add */
        !           140:        precision = global_precision;
        !           141:        make_lsbptr(r1,precision);
        !           142:        make_lsbptr(r2,precision);
        !           143:        while (precision--)
        !           144:        {
        !           145:                if (carry)
        !           146:                {       x = *r1 + *r2 + 1;
        !           147:                        carry = (*r2  >= (unit)(~ *r1));
        !           148:                } else
        !           149:                {       x = *r1 + *r2;
        !           150:                        carry = (x < *r1) ;
        !           151:                }
        !           152:                post_higherunit(r2);
        !           153:                *post_higherunit(r1) = x;
        !           154:        }
        !           155:        return(carry);          /* return the final carry flag bit */
        !           156: }      /* mp_addc */
        !           157: #endif  /* mp_addc */
        !           158: 
        !           159: #ifndef mp_subb
        !           160: boolean mp_subb
        !           161:        (register unitptr r1,register unitptr r2,register boolean borrow)
        !           162:        /* multiprecision subtract with borrow, r2 from r1, result in r1 */
        !           163:        /* borrow is incoming borrow flag-- value should be 0 or 1 */
        !           164: {      register unit x;
        !           165:        short precision;        /* number of units to subtract */
        !           166:        precision = global_precision;
        !           167:        make_lsbptr(r1,precision);
        !           168:        make_lsbptr(r2,precision);
        !           169:        while (precision--)
        !           170:        {       if (borrow)
        !           171:                {       x = *r1 - *r2 - 1;
        !           172:                        borrow = (*r1 <= *r2);
        !           173:                } else
        !           174:                {       x = *r1 - *r2;
        !           175:                        borrow = (*r1 < *r2);
        !           176:                }
        !           177:                post_higherunit(r2);
        !           178:                *post_higherunit(r1) = x;
        !           179:        }
        !           180:        return(borrow); /* return the final carry/borrow flag bit */
        !           181: }      /* mp_subb */
        !           182: #endif  /* mp_subb */
        !           183: 
        !           184: #ifndef mp_rotate_left
        !           185: boolean mp_rotate_left(register unitptr r1,register boolean carry)
        !           186:        /* multiprecision rotate left 1 bit with carry, result in r1. */
        !           187:        /* carry is incoming carry flag-- value should be 0 or 1 */
        !           188: {      register int precision; /* number of units to rotate */
        !           189:        unsigned int mcarry = carry, nextcarry; /* int is supposed to be
        !           190:                                                 * the efficient size for ops*/
        !           191:        precision = global_precision;
        !           192:        make_lsbptr(r1,precision);
        !           193:        while (precision--)
        !           194:        {
        !           195:                nextcarry = (((signedunit) *r1) < 0);
        !           196:                *r1 = (*r1 << 1) | mcarry;
        !           197:                mcarry = nextcarry;
        !           198:                pre_higherunit(r1);
        !           199:        }
        !           200:        return(nextcarry);      /* return the final carry flag bit */
        !           201: }      /* mp_rotate_left */
        !           202: #endif  /* mp_rotate_left */
        !           203: 
        !           204: /************** end of primitives that should be in assembly *************/
        !           205: 
        !           206: 
        !           207: /*     The mp_shift_right_bits function is not called in any time-critical
        !           208:        situations in public-key cryptographic functions, so it doesn't
        !           209:        need to be coded in assembly language.
        !           210: */
        !           211: void mp_shift_right_bits(register unitptr r1,register short bits)
        !           212:        /*      multiprecision shift right bits, result in r1.
        !           213:                bits is how many bits to shift, must be <= UNITSIZE.
        !           214:        */
        !           215: {      register short precision;       /* number of units to shift */
        !           216:        register unit carry,nextcarry,bitmask;
        !           217:        register short unbits;
        !           218:        if (bits==0) return;    /* shift zero bits is a no-op */
        !           219:        carry = 0;
        !           220:        bitmask = power_of_2(bits)-1;
        !           221:        unbits = UNITSIZE-bits;         /* shift bits must be <= UNITSIZE */
        !           222:        precision = global_precision;
        !           223:        make_msbptr(r1,precision);
        !           224:        if (bits == UNITSIZE) {
        !           225:                while (precision--)  {
        !           226:                        nextcarry = *r1;
        !           227:                        *r1 = carry;
        !           228:                        carry = nextcarry;
        !           229:                        pre_lowerunit(r1);
        !           230:                }
        !           231:        } else {
        !           232:                while (precision--)
        !           233:                {       nextcarry = *r1 & bitmask;
        !           234:                        *r1 >>= bits;
        !           235:                        *r1 |= carry << unbits;
        !           236:                        carry = nextcarry;
        !           237:                        pre_lowerunit(r1);
        !           238:                }
        !           239:        }
        !           240: }      /* mp_shift_right_bits */
        !           241: 
        !           242: 
        !           243: #ifndef mp_compare
        !           244: short mp_compare(register unitptr r1,register unitptr r2)
        !           245: /*     Compares multiprecision integers *r1, *r2, and returns:
        !           246:        -1 iff *r1 < *r2
        !           247:         0 iff *r1 == *r2
        !           248:        +1 iff *r1 > *r2
        !           249: */
        !           250: {      register short precision;       /* number of units to compare */
        !           251: 
        !           252:        precision = global_precision;
        !           253:        make_msbptr(r1,precision);
        !           254:        make_msbptr(r2,precision);
        !           255:        do
        !           256:        {       if (*r1 < *r2)
        !           257:                        return(-1);
        !           258:                if (*post_lowerunit(r1) > *post_lowerunit(r2))
        !           259:                        return(1);
        !           260:        } while (--precision);
        !           261:        return(0);      /*  *r1 == *r2  */
        !           262: }      /* mp_compare */
        !           263: #endif /* mp_compare */
        !           264: 
        !           265: 
        !           266: boolean mp_inc(register unitptr r)
        !           267:        /* Increment multiprecision integer r. */
        !           268: {      register short precision;
        !           269:        precision = global_precision;
        !           270:        make_lsbptr(r,precision);
        !           271:        do
        !           272:        {       if ( ++(*r) ) return(0);        /* no carry */
        !           273:                post_higherunit(r);
        !           274:        } while (--precision);
        !           275:        return(1);              /* return carry set */
        !           276: }      /* mp_inc */
        !           277: 
        !           278: 
        !           279: boolean mp_dec(register unitptr r)
        !           280:        /* Decrement multiprecision integer r. */
        !           281: {      register short precision;
        !           282:        precision = global_precision;
        !           283:        make_lsbptr(r,precision);
        !           284:        do
        !           285:        {       if ( (signedunit) (--(*r)) != (signedunit) -1 )
        !           286:                        return(0);      /* no borrow */
        !           287:                post_higherunit(r);
        !           288:        } while (--precision);
        !           289:        return(1);              /* return borrow set */
        !           290: }      /* mp_dec */
        !           291: 
        !           292: 
        !           293: void mp_neg(register unitptr r)
        !           294:        /* Compute 2's complement, the arithmetic negative, of r */
        !           295: {      register short precision;       /* number of units to negate */
        !           296:        precision = global_precision;
        !           297:        mp_dec(r);      /* 2's complement is 1's complement plus 1 */
        !           298:        do      /* now do 1's complement */
        !           299:        {       *r = ~(*r);
        !           300:                r++;
        !           301:        } while (--precision);
        !           302: }      /* mp_neg */
        !           303: 
        !           304: #ifndef mp_move
        !           305: void mp_move(register unitptr dst,register unitptr src)
        !           306: {      register short precision;       /* number of units to move */
        !           307:        precision = global_precision;
        !           308:        do { *dst++ = *src++; } while (--precision);
        !           309: }      /* mp_move */
        !           310: #endif /* mp_move */
        !           311: 
        !           312: void mp_init(register unitptr r, word16 value)
        !           313:        /* Init multiprecision register r with short value. */
        !           314: {      /* Note that mp_init doesn't extend sign bit for >32767 */
        !           315: 
        !           316:        unitfill0( r, global_precision);
        !           317:        make_lsbptr(r,global_precision);
        !           318:        *post_higherunit(r) = value;
        !           319: #ifdef UNIT8
        !           320:        *post_higherunit(r) = value >> UNITSIZE;
        !           321: #endif
        !           322: }      /* mp_init */
        !           323: 
        !           324: 
        !           325: short significance(register unitptr r)
        !           326:        /* Returns number of significant units in r */
        !           327: {      register short precision;
        !           328:        precision = global_precision;
        !           329:        make_msbptr(r,precision);
        !           330:        do
        !           331:        {       if (*post_lowerunit(r))
        !           332:                        return(precision);
        !           333:        } while (--precision);
        !           334:        return(precision);
        !           335: }      /* significance */
        !           336: 
        !           337: 
        !           338: #ifndef unitfill0
        !           339: void unitfill0(unitptr r,word16 unitcount)
        !           340:        /* Zero-fill the unit buffer r. */
        !           341: {      while (unitcount--) *r++ = 0;
        !           342: }      /* unitfill0 */
        !           343: #endif  /* unitfill0 */
        !           344: 
        !           345: 
        !           346: int mp_udiv(register unitptr remainder,register unitptr quotient,
        !           347:        register unitptr dividend,register unitptr divisor)
        !           348:        /* Unsigned divide, treats both operands as positive. */
        !           349: {      int bits;
        !           350:        short dprec;
        !           351:        register unit bitmask;
        !           352:        if (testeq(divisor,0))
        !           353:                return(-1);     /* zero divisor means divide error */
        !           354:        mp_init0(remainder);
        !           355:        mp_init0(quotient);
        !           356:        /* normalize and compute number of bits in dividend first */
        !           357:        init_bitsniffer(dividend,bitmask,dprec,bits);
        !           358:        /* rescale quotient to same precision (dprec) as dividend */
        !           359:        rescale(quotient,global_precision,dprec);
        !           360:        make_msbptr(quotient,dprec);
        !           361: 
        !           362:        while (bits--)
        !           363:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
        !           364:                if (mp_compare(remainder,divisor) >= 0)
        !           365:                {       mp_sub(remainder,divisor);
        !           366:                        stuff_bit(quotient,bitmask);
        !           367:                }
        !           368:                bump_2bitsniffers(dividend,quotient,bitmask);
        !           369:        }
        !           370:        return(0);
        !           371: } /* mp_udiv */
        !           372: 
        !           373: 
        !           374: #ifdef UPTON_OR_SMITH
        !           375: #define RECIPMARGIN 0  /* extra margin bits used by mp_recip() */
        !           376: 
        !           377: int mp_recip(register unitptr quotient,register unitptr divisor)
        !           378:        /* Compute reciprocal (quotient) as 1/divisor.  Used by faster modmult. */
        !           379: {      int bits;
        !           380:        short qprec;
        !           381:        register unit bitmask;
        !           382:        unit remainder[MAX_UNIT_PRECISION];
        !           383:        if (testeq(divisor,0))
        !           384:                return(-1);     /* zero divisor means divide error */
        !           385:        mp_init0(remainder);
        !           386:        mp_init0(quotient);
        !           387: 
        !           388:        /* normalize and compute number of bits in quotient first */
        !           389:        bits = countbits(divisor) + RECIPMARGIN;
        !           390:        bitmask = bitmsk(bits);         /* bitmask within a single unit */
        !           391:        qprec = bits2units(bits+1);
        !           392:        mp_setbit(remainder,(bits-RECIPMARGIN)-1);
        !           393:        /* rescale quotient to precision of divisor + RECIPMARGIN bits */
        !           394:        rescale(quotient,global_precision,qprec);
        !           395:        make_msbptr(quotient,qprec);
        !           396: 
        !           397:        while (bits--)
        !           398:        {       mp_shift_left(remainder);
        !           399:                if (mp_compare(remainder,divisor) >= 0)
        !           400:                {       mp_sub(remainder,divisor);
        !           401:                        stuff_bit(quotient,bitmask);
        !           402:                }
        !           403:                bump_bitsniffer(quotient,bitmask);
        !           404:        }
        !           405:        mp_init0(remainder);    /* burn sensitive data left on stack */
        !           406:        return(0);
        !           407: } /* mp_recip */
        !           408: #endif /* UPTON_OR_SMITH */
        !           409: 
        !           410: 
        !           411: int mp_div(register unitptr remainder,register unitptr quotient,
        !           412:        register unitptr dividend,register unitptr divisor)
        !           413:        /* Signed divide, either or both operands may be negative. */
        !           414: {      boolean dvdsign,dsign;
        !           415:        int status;
        !           416:        dvdsign = (boolean)(mp_tstminus(dividend)!=0);
        !           417:        dsign = (boolean)(mp_tstminus(divisor)!=0);
        !           418:        if (dvdsign) mp_neg(dividend);
        !           419:        if (dsign) mp_neg(divisor);
        !           420:        status = mp_udiv(remainder,quotient,dividend,divisor);
        !           421:        if (dvdsign) mp_neg(dividend);          /* repair caller's dividend */
        !           422:        if (dsign) mp_neg(divisor);             /* repair caller's divisor */
        !           423:        if (status<0) return(status);           /* divide error? */
        !           424:        if (dvdsign) mp_neg(remainder);
        !           425:        if (dvdsign ^ dsign) mp_neg(quotient);
        !           426:        return(status);
        !           427: } /* mp_div */
        !           428: 
        !           429: 
        !           430: word16 mp_shortdiv(register unitptr quotient,
        !           431:        register unitptr dividend,register word16 divisor)
        !           432: /*     This function does a fast divide and mod on a multiprecision dividend
        !           433:        using a short integer divisor returning a short integer remainder.
        !           434:        This is an unsigned divide.  It treats both operands as positive.
        !           435:        It is used mainly for faster printing of large numbers in base 10.
        !           436: */
        !           437: {      int bits;
        !           438:        short dprec;
        !           439:        register unit bitmask;
        !           440:        register word16 remainder;
        !           441:        if (!divisor)   /* if divisor == 0 */
        !           442:                return(-1);     /* zero divisor means divide error */
        !           443:        remainder=0;
        !           444:        mp_init0(quotient);
        !           445:        /* normalize and compute number of bits in dividend first */
        !           446:        init_bitsniffer(dividend,bitmask,dprec,bits);
        !           447:        /* rescale quotient to same precision (dprec) as dividend */
        !           448:        rescale(quotient,global_precision,dprec);
        !           449:        make_msbptr(quotient,dprec);
        !           450: 
        !           451:        while (bits--)
        !           452:        {       remainder <<= 1;
        !           453:                if (sniff_bit(dividend,bitmask))
        !           454:                        remainder++;
        !           455:                if (remainder >= divisor)
        !           456:                {       remainder -= divisor;
        !           457:                        stuff_bit(quotient,bitmask);
        !           458:                }
        !           459:                bump_2bitsniffers(dividend,quotient,bitmask);
        !           460:        }
        !           461:        return(remainder);
        !           462: } /* mp_shortdiv */
        !           463: 
        !           464: 
        !           465: int mp_mod(register unitptr remainder,
        !           466:        register unitptr dividend,register unitptr divisor)
        !           467:        /* Unsigned divide, treats both operands as positive. */
        !           468: {      int bits;
        !           469:        short dprec;
        !           470:        register unit bitmask;
        !           471:        if (testeq(divisor,0))
        !           472:                return(-1);     /* zero divisor means divide error */
        !           473:        mp_init0(remainder);
        !           474:        /* normalize and compute number of bits in dividend first */
        !           475:        init_bitsniffer(dividend,bitmask,dprec,bits);
        !           476: 
        !           477:        while (bits--)
        !           478:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
        !           479:                msub(remainder,divisor);
        !           480:                bump_bitsniffer(dividend,bitmask);
        !           481:        }
        !           482:        return(0);
        !           483: } /* mp_mod */
        !           484: 
        !           485: 
        !           486: word16 mp_shortmod(register unitptr dividend,register word16 divisor)
        !           487: /*     This function does a fast mod operation on a multiprecision dividend
        !           488:        using a short integer modulus returning a short integer remainder.
        !           489:        This is an unsigned divide.  It treats both operands as positive.
        !           490:        It is used mainly for fast sieve searches for large primes.
        !           491: */
        !           492: {      int bits;
        !           493:        short dprec;
        !           494:        register unit bitmask;
        !           495:        register word16 remainder;
        !           496:        if (!divisor)   /* if divisor == 0 */
        !           497:                return(-1);     /* zero divisor means divide error */
        !           498:        remainder=0;
        !           499:        /* normalize and compute number of bits in dividend first */
        !           500:        init_bitsniffer(dividend,bitmask,dprec,bits);
        !           501: 
        !           502:        while (bits--)
        !           503:        {       remainder <<= 1;
        !           504:                if (sniff_bit(dividend,bitmask))
        !           505:                        remainder++;
        !           506:                if (remainder >= divisor) remainder -= divisor;
        !           507:                bump_bitsniffer(dividend,bitmask);
        !           508:        }
        !           509:        return(remainder);
        !           510: } /* mp_shortmod */
        !           511: 
        !           512: 
        !           513: 
        !           514: #ifdef COMB_MULT       /* use faster "comb" multiply algorithm */
        !           515:        /* We are skipping this code because it has a bug... */
        !           516: 
        !           517: int mp_mult(register unitptr prod,
        !           518:        register unitptr multiplicand, register unitptr multiplier)
        !           519:        /*      Computes multiprecision prod = multiplicand * multiplier */
        !           520: {      /*      Uses interleaved comb multiply algorithm.
        !           521:                This improved multiply more than twice as fast as a Russian
        !           522:                peasant multiply, because it does a lot fewer shifts.
        !           523:                Must have global_precision set to the size of the multiplicand
        !           524:                plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum
        !           525:                of the lengths of the multiplier and multiplicand.
        !           526: 
        !           527:                BUG ALERT:  Unfortunately, this code has a bug.  It fails for
        !           528:                some numbers.  One such example:
        !           529:                x=   59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
        !           530:                x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
        !           531:                     52C8 CDC7 AFB3 61C8 243C 741B
        !           532:                --which is obviously wrong.  The answer should be:
        !           533:                x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
        !           534:                     C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
        !           535:                We'll have to fix this some day.  In the meantime, we'll
        !           536:                just have the compiler skip over this code.
        !           537:        */
        !           538:        int bits;
        !           539:        register unit bitmask;
        !           540:        unitptr product, mplier, temp;
        !           541:        short mprec,mprec2;
        !           542:        unit mplicand[MAX_UNIT_PRECISION];
        !           543:        mp_init(prod,0);                /* better clear full width--double precision */
        !           544:        if (testeq(multiplicand,0))
        !           545:                return(0);      /* zero multiplicand means zero product */
        !           546: 
        !           547:        mp_move(mplicand,multiplicand); /* save it from damage */
        !           548: 
        !           549:        normalize(multiplier,mprec);
        !           550:        if (!mprec)
        !           551:                return(0);
        !           552: 
        !           553:        make_lsbptr(multiplier,mprec);
        !           554:        bitmask = 1;    /* start scan at LSB of multiplier */
        !           555: 
        !           556:        do      /* UNITSIZE times */
        !           557:        {       /* do for bits 0-15 */
        !           558:                product = prod;
        !           559:                mplier = multiplier;
        !           560:                mprec2 = mprec;
        !           561:                while (mprec2--)        /* do for each word in multiplier */
        !           562:                {
        !           563:                        if (sniff_bit(mplier,bitmask))
        !           564:                        {       if (mp_addc(product,multiplicand,0))    /* ripple carry */
        !           565:                                {       /* After 1st time thru, this is rarely encountered. */
        !           566:                                        temp = msbptr(product,global_precision);
        !           567:                                        pre_higherunit(temp);
        !           568:                                        /* temp now points to LSU of carry region. */
        !           569:                                        unmake_lsbptr(temp,global_precision);
        !           570:                                        mp_inc(temp);
        !           571:                                }       /* ripple carry */
        !           572:                        }
        !           573:                        pre_higherunit(mplier);
        !           574:                        pre_higherunit(product);
        !           575:                }
        !           576:                if (!(bitmask <<= 1))
        !           577:                        break;
        !           578:                mp_shift_left(multiplicand);
        !           579: 
        !           580:        } while (TRUE);
        !           581: 
        !           582:        mp_move(multiplicand,mplicand); /* recover */
        !           583: 
        !           584:        return(0);      /* normal return */
        !           585: }      /* mp_mult */
        !           586: 
        !           587: #endif /* COMB_MULT */
        !           588: 
        !           589: 
        !           590: /*     Because the "comb" multiply has a bug, we will use the slower
        !           591:        Russian peasant multiply instead.  Fortunately, the mp_mult
        !           592:        function is not called from any time-critical code.
        !           593: */
        !           594: 
        !           595: int mp_mult(register unitptr prod,
        !           596:        register unitptr multiplicand,register unitptr multiplier)
        !           597:        /* Computes multiprecision prod = multiplicand * multiplier */
        !           598: {      /* Uses "Russian peasant" multiply algorithm. */
        !           599:        int bits;
        !           600:        register unit bitmask;
        !           601:        short mprec;
        !           602:        mp_init(prod,0);
        !           603:        if (testeq(multiplicand,0))
        !           604:                return(0);      /* zero multiplicand means zero product */
        !           605:        /* normalize and compute number of bits in multiplier first */
        !           606:        init_bitsniffer(multiplier,bitmask,mprec,bits);
        !           607: 
        !           608:        while (bits--)
        !           609:        {       mp_shift_left(prod);
        !           610:                if (sniff_bit(multiplier,bitmask))
        !           611:                        mp_add(prod,multiplicand);
        !           612:                bump_bitsniffer(multiplier,bitmask);
        !           613:        }
        !           614:        return(0);
        !           615: }      /* mp_mult */
        !           616: 
        !           617: 
        !           618: 
        !           619: /*     mp_modmult computes a multiprecision multiply combined with a
        !           620:        modulo operation.  This is the most time-critical function in
        !           621:        this multiprecision arithmetic library for performing modulo
        !           622:        exponentiation.  We experimented with different versions of modmult,
        !           623:        depending on the machine architecture and performance requirements.
        !           624:        We will either use a Russian Peasant modmult (peasant_modmult), 
        !           625:        Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
        !           626:        modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
        !           627:        On machines with a hardware atomic multiply instruction,
        !           628:        Smith's modmult is fastest.  It can utilize assembly subroutines to
        !           629:        speed up the hardware multiply logic and trial quotient calculation.
        !           630:        If the machine lacks a fast hardware multiply, Merritt's modmult
        !           631:        is preferred, which doesn't call any assembly multiply routine.
        !           632:        We use the alias names mp_modmult, stage_modulus, and modmult_burn
        !           633:        for the corresponding true names, which depend on what flavor of
        !           634:        modmult we are using.
        !           635: 
        !           636:        Before making the first call to mp_modmult, you must set up the
        !           637:        modulus-dependant precomputated tables by calling stage_modulus.
        !           638:        After making all the calls to mp_modmult, you call modmult_burn to
        !           639:        erase the tables created by stage_modulus that were left in memory.
        !           640: */
        !           641: 
        !           642: #ifdef COUNTMULTS
        !           643: /* "number of modmults" counters, used for performance studies. */
        !           644: static unsigned int tally_modmults = 0;
        !           645: static unsigned int tally_modsquares = 0;
        !           646: #endif /* COUNTMULTS */
        !           647: 
        !           648: 
        !           649: #ifdef PEASANT
        !           650: /* Conventional Russian peasant multiply with modulo algorithm. */
        !           651: 
        !           652: static unitptr pmodulus = 0;   /* used only by mp_modmult */
        !           653: 
        !           654: int stage_peasant_modulus(unitptr n)
        !           655: /*     Must pass modulus to stage_modulus before calling modmult.
        !           656:        Assumes that global_precision has already been adjusted to the
        !           657:        size of the modulus, plus SLOP_BITS.
        !           658: */
        !           659: {      /* For this simple version of modmult, just copy unit pointer. */
        !           660:        pmodulus = n;
        !           661:        return(0);      /* normal return */
        !           662: }      /* stage_peasant_modulus */
        !           663: 
        !           664: 
        !           665: int peasant_modmult(register unitptr prod,
        !           666:        unitptr multiplicand,register unitptr multiplier)
        !           667: {      /*      "Russian peasant" multiply algorithm, combined with a modulo
        !           668:                operation.  This is a simple naive replacement for Merritt's
        !           669:                faster modmult algorithm.  References global unitptr "modulus".
        !           670:                Computes:  prod = (multiplicand*multiplier) mod modulus
        !           671:                WARNING: All the arguments must be less than the modulus!
        !           672:        */
        !           673:        int bits;
        !           674:        register unit bitmask;
        !           675:        short mprec;
        !           676:        mp_init(prod,0);
        !           677: /*     if (testeq(multiplicand,0))
        !           678:                return(0); */   /* zero multiplicand means zero product */
        !           679:        /* normalize and compute number of bits in multiplier first */
        !           680:        init_bitsniffer(multiplier,bitmask,mprec,bits);
        !           681: 
        !           682:        while (bits--)
        !           683:        {       mp_shift_left(prod);
        !           684:                msub(prod,pmodulus);    /* turns mult into modmult */
        !           685:                if (sniff_bit(multiplier,bitmask))
        !           686:                {       mp_add(prod,multiplicand);
        !           687:                        msub(prod,pmodulus);    /* turns mult into modmult */
        !           688:                }
        !           689:                bump_bitsniffer(multiplier,bitmask);
        !           690:        }
        !           691:        return(0);
        !           692: }      /* peasant_modmult */
        !           693: 
        !           694: 
        !           695: /*     If we are using a version of mp_modmult that uses internal tables
        !           696:        in memory, we have to call modmult_burn() at the end of mp_modexp.
        !           697:        This is so that no sensitive data is left in memory after the program
        !           698:        exits.  The Russian peasant method doesn't use any such tables.
        !           699: */
        !           700: void peasant_burn(void)
        !           701: /*     Alias for modmult_burn, called only from mp_modexp().  Destroys
        !           702:        internal modmult tables.  This version does nothing because no
        !           703:        tables are used by the Russian peasant modmult. */
        !           704: { }    /* peasant_burn */
        !           705: 
        !           706: #endif /* PEASANT */
        !           707: 
        !           708: 
        !           709: #ifdef MERRITT
        !           710: /*=========================================================================*/
        !           711: /*
        !           712:        This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
        !           713:        Also refined by Zhahai Stewart to reduce the number of subtracts.
        !           714:     Modified by Raymond Brand to reduce the number of SLOP_BITS by 1.
        !           715:        It performs a multiply combined with a modulo operation, without
        !           716:        going into "double precision".  It is faster than the Russian peasant
        !           717:        method, and still works well on machines that lack a fast hardware
        !           718:        multiply instruction.
        !           719: */
        !           720: 
        !           721: /*     The following support functions, macros, and data structures
        !           722:        are used only by Merritt's modmult algorithm... */
        !           723: 
        !           724: static void mp_lshift_unit(register unitptr r1)
        !           725: /*     Shift r1 1 whole unit to the left.  Used only by modmult function. */
        !           726: {      register short precision;
        !           727:        register unitptr r2;
        !           728:        precision = global_precision;
        !           729:        make_msbptr(r1,precision);
        !           730:        r2 = r1;
        !           731:        while (--precision)
        !           732:                *post_lowerunit(r1) = *pre_lowerunit(r2);
        !           733:        *r1 = 0;
        !           734: }      /* mp_lshift_unit */
        !           735: 
        !           736: 
        !           737: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
        !           738: static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
        !           739: static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */
        !           740: {      0
        !           741:        ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0],
        !           742:         &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0]
        !           743: #ifndef UNIT8
        !           744:        ,&moduli_buf[ 7][0]
        !           745:        ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0], 
        !           746:         &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0]
        !           747: #ifndef UNIT16 /* and not UNIT8 */
        !           748:        ,&moduli_buf[15][0]
        !           749:        ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0], 
        !           750:         &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0], 
        !           751:         &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0], 
        !           752:         &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0]
        !           753: #endif /* UNIT16 and UNIT8 not defined */
        !           754: #endif /* UNIT8 not defined */
        !           755: };
        !           756: 
        !           757: /*     To optimize msubs, need following 2 unit arrays, each filled
        !           758:        with the most significant unit and next-to-most significant unit
        !           759:        of the preshifted images of the modulus. */
        !           760: static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */
        !           761: static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */
        !           762: 
        !           763: /*     mpdbuf contains preshifted images of the multiplicand, mod n.
        !           764:        It is used only by mp_modmult.  It could be staticly declared
        !           765:        inside of mp_modmult, but we put it outside mp_modmult so that
        !           766:        it can be wiped clean by modmult_burn(), which is called at the
        !           767:        end of mp_modexp.  This is so that no sensitive data is left in
        !           768:        memory after the program exits.
        !           769: */
        !           770: static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
        !           771: 
        !           772: 
        !           773: static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
        !           774: /*     Computes UNITSIZE images of r, each shifted left 1 more bit.
        !           775:        Used only by modmult function.
        !           776: */
        !           777: {      short int i;
        !           778:        images[0] = r;  /* no need to move the first image, just copy ptr */
        !           779:        for (i=1; i<UNITSIZE; i++)
        !           780:        {       mp_move(images[i],images[i-1]);
        !           781:                mp_shift_left(images[i]);
        !           782:                msub(images[i],moduli[0]);
        !           783:        }
        !           784: }      /* stage_mp_images */
        !           785: 
        !           786: 
        !           787: int stage_merritt_modulus(unitptr n)
        !           788: /*     Computes UNITSIZE images of modulus, each shifted left 1 more bit.
        !           789:        Before calling mp_modmult, you must first stage the moduli images by
        !           790:        calling stage_modulus.  n is the pointer to the modulus.
        !           791:        Assumes that global_precision has already been adjusted to the
        !           792:        size of the modulus, plus SLOP_BITS.
        !           793: */
        !           794: {      short int i;
        !           795:        unitptr msu;    /* ptr to most significant unit, for faster msubs */
        !           796:        moduli[0] = n;  /* no need to move the first image, just copy ptr */
        !           797: 
        !           798:        /* used by optimized msubs macro... */
        !           799:        msu = msbptr(n,global_precision);       /* needed by msubs */
        !           800:        msu_moduli[0] = *post_lowerunit(msu);
        !           801:        nmsu_moduli[0] = *msu;
        !           802: 
        !           803:        for (i=1; i<UNITSIZE; i++)
        !           804:        {       mp_move(moduli[i],moduli[i-1]);
        !           805:                mp_shift_left(moduli[i]);
        !           806: 
        !           807:                /* used by optimized msubs macro... */
        !           808:                msu = msbptr(moduli[i],global_precision);       /* needed by msubs */
        !           809:                msu_moduli[i] = *post_lowerunit(msu);   /* for faster msubs */
        !           810:                nmsu_moduli[i] = *msu;
        !           811:        }
        !           812:        return(0);      /* normal return */
        !           813: }      /* stage_merritt_modulus */
        !           814: 
        !           815: 
        !           816: /* The following macros, sniffadd and msubs, are used by modmult... */
        !           817: #define sniffadd(i) if (*multiplier & power_of_2(i))  mp_add(prod,mpd[i])
        !           818: 
        !           819: /* Unoptimized msubs macro (msubs0) follows... */
        !           820: /* #define msubs0(i) msub(prod,moduli[i])
        !           821: */
        !           822: 
        !           823: /*     To optimize msubs, compare the most significant units of the
        !           824:        product and the shifted modulus before deciding to call the full
        !           825:        mp_compare routine.  Better still, compare the upper two units
        !           826:        before deciding to call mp_compare.
        !           827:        Optimization of msubs relies heavily on standard C left-to-right
        !           828:        parsimonious evaluation of logical expressions.
        !           829: */
        !           830: 
        !           831: /* Partially-optimized msubs macro (msubs1) follows... */
        !           832: /* #define msubs1(i) if ( \
        !           833:   ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
        !           834:   (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
        !           835:   ) mp_sub(prod,moduli[i])
        !           836: */
        !           837: 
        !           838: /* Fully-optimized msubs macro (msubs2) follows... */
        !           839: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
        !           840:  (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
        !           841:  (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
        !           842:  mp_sub(prod,moduli[i])
        !           843: 
        !           844: 
        !           845: int merritt_modmult(register unitptr prod,
        !           846:        unitptr multiplicand,register unitptr multiplier)
        !           847:        /*      Performs combined multiply/modulo operation.
        !           848:                Computes:  prod = (multiplicand*multiplier) mod modulus
        !           849:                WARNING: All the arguments must be less than the modulus!
        !           850:                Assumes the modulus has been predefined by first calling
        !           851:                stage_modulus.
        !           852:        */
        !           853: {
        !           854:        /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
        !           855:        register signedunit p_m;
        !           856:        register unitptr msu_prod;      /* ptr to most significant unit of product */
        !           857:        register unitptr nmsu_prod;     /* next-most signif. unit of product */
        !           858:        short mprec;            /* precision of multiplier, in units */
        !           859:        /*      Array mpd contains a list of pointers to preshifted images of
        !           860:                the multiplicand: */
        !           861:        static unitptr mpd[UNITSIZE] =
        !           862:        {        0,              &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
        !           863:                 &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
        !           864: #ifndef UNIT8
        !           865:                ,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
        !           866:                 &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
        !           867: #ifndef UNIT16 /* and not UNIT8 */
        !           868:                ,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
        !           869:                 &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
        !           870:                 &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
        !           871:                 &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
        !           872: #endif /* UNIT16 and UNIT8 not defined */
        !           873: #endif /* UNIT8 not defined */
        !           874:        };
        !           875: 
        !           876:        /* Compute preshifted images of multiplicand, mod n: */
        !           877:        stage_mp_images(mpd,multiplicand);
        !           878: 
        !           879:        /* To optimize msubs, set up msu_prod and nmsu_prod: */
        !           880:        msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
        !           881:        nmsu_prod = msu_prod;
        !           882:        post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
        !           883: 
        !           884:        /*      To understand this algorithm, it would be helpful to first
        !           885:                study the conventional Russian peasant modmult algorithm.
        !           886:                This one does about the same thing as Russian peasant, but
        !           887:                is organized differently to save some steps.  It loops
        !           888:                through the multiplier a word (unit) at a time, instead of
        !           889:                a bit at a time.  It word-shifts the product instead of
        !           890:                bit-shifting it, so it should be faster.  It also does about
        !           891:                half as many subtracts as Russian peasant.
        !           892:        */
        !           893: 
        !           894:        mp_init(prod,0);        /* Initialize product to 0. */
        !           895: 
        !           896:        /*      The way mp_modmult is actually used in cryptographic
        !           897:                applications, there will NEVER be a zero multiplier or
        !           898:                multiplicand.  So there is no need to optimize for that
        !           899:                condition.
        !           900:        */
        !           901: /*     if (testeq(multiplicand,0))
        !           902:                return(0); */   /* zero multiplicand means zero product */
        !           903:        /* Normalize and compute number of units in multiplier first: */
        !           904:        normalize(multiplier,mprec);
        !           905:        if (mprec==0)   /* if precision of multiplier is 0 */
        !           906:                return(0);      /* zero multiplier means zero product */
        !           907:        make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
        !           908: 
        !           909:        while (mprec--) /* Loop for the number of units in the multiplier */
        !           910:        {
        !           911:                /*      Shift the product one whole unit to the left.
        !           912:                        This is part of the multiply phase of modmult.
        !           913:                */
        !           914: 
        !           915:                mp_lshift_unit(prod);
        !           916: 
        !           917:                /*      The product may have grown by as many as UNITSIZE
        !           918:                        bits.  That's why we have global_precision set to the
        !           919:                        size of the modulus plus UNITSIZE slop bits.
        !           920:                        Now reduce the product back down by conditionally
        !           921:                        subtracting the preshifted images of the modulus.
        !           922:                        This is part of the modulo reduction phase of modmult. 
        !           923:                        The following loop is unrolled for speed, using macros...
        !           924: 
        !           925:                for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--)
        !           926:                        if (mp_compare(prod,moduli[i]) >= 0)
        !           927:                                mp_sub(prod,moduli[i]);
        !           928:                */
        !           929: 
        !           930: #ifndef UNIT8
        !           931: #ifndef UNIT16  /* and not UNIT8 */
        !           932:                msubs(31);
        !           933:                msubs(30);
        !           934:                msubs(29);
        !           935:                msubs(28);
        !           936:                msubs(27);
        !           937:                msubs(26);
        !           938:                msubs(25);
        !           939:                msubs(24);
        !           940:                msubs(23);
        !           941:                msubs(22);
        !           942:                msubs(21);
        !           943:                msubs(20);
        !           944:                msubs(19);
        !           945:                msubs(18);
        !           946:                msubs(17);
        !           947:                msubs(16);
        !           948: #endif  /* not UNIT16 and not UNIT8 */
        !           949:                msubs(15);
        !           950:                msubs(14);
        !           951:                msubs(13);
        !           952:                msubs(12);
        !           953:                msubs(11);
        !           954:                msubs(10);
        !           955:                msubs(9);
        !           956:                msubs(8);
        !           957: #endif  /* not UNIT8 */
        !           958:                msubs(7);
        !           959:                msubs(6);
        !           960:                msubs(5);
        !           961: #ifndef UNIT32
        !           962:                msubs(4);
        !           963: #ifndef UNIT16
        !           964:                msubs(3);
        !           965: #endif
        !           966: #endif
        !           967: 
        !           968:                /*      Sniff each bit in the current unit of the multiplier, 
        !           969:                        and conditionally add the corresponding preshifted 
        !           970:                        image of the multiplicand to the product.
        !           971:                        This is also part of the multiply phase of modmult.
        !           972: 
        !           973:                        The following loop is unrolled for speed, using macros...
        !           974: 
        !           975:                for (i=UNITSIZE-1; i>=0; i--)
        !           976:                   if (*multiplier & power_of_2(i))
        !           977:                                mp_add(prod,mpd[i]);
        !           978:                */
        !           979: #ifndef UNIT8
        !           980: #ifndef UNIT16 /* and not UNIT8 */
        !           981:                sniffadd(31);
        !           982:                sniffadd(30);
        !           983:                sniffadd(29);
        !           984:                sniffadd(28);
        !           985:                sniffadd(27);
        !           986:                sniffadd(26);
        !           987:                sniffadd(25);
        !           988:                sniffadd(24);
        !           989:                sniffadd(23);
        !           990:                sniffadd(22);
        !           991:                sniffadd(21);
        !           992:                sniffadd(20);
        !           993:                sniffadd(19);
        !           994:                sniffadd(18);
        !           995:                sniffadd(17);
        !           996:                sniffadd(16);
        !           997: #endif /* not UNIT16 and not UNIT8 */
        !           998:                sniffadd(15);
        !           999:                sniffadd(14);
        !          1000:                sniffadd(13);
        !          1001:                sniffadd(12);
        !          1002:                sniffadd(11);
        !          1003:                sniffadd(10);
        !          1004:                sniffadd(9);
        !          1005:                sniffadd(8);
        !          1006: #endif /* not UNIT8 */
        !          1007:                sniffadd(7);
        !          1008:                sniffadd(6);
        !          1009:                sniffadd(5);
        !          1010:                sniffadd(4);
        !          1011:                sniffadd(3);
        !          1012:                sniffadd(2);
        !          1013:                sniffadd(1);
        !          1014:                sniffadd(0);
        !          1015: 
        !          1016:                /*      The product may have grown by as many as LOG_UNITSIZE+1
        !          1017:                        bits.
        !          1018:                        Now reduce the product back down by conditionally 
        !          1019:                        subtracting LOG_UNITSIZE+1 preshifted images of the 
        !          1020:                        modulus.  This is the modulo reduction phase of modmult.
        !          1021: 
        !          1022:                        The following loop is unrolled for speed, using macros...
        !          1023: 
        !          1024:                for (i=LOG_UNITSIZE; i>=0; i--)
        !          1025:                        if (mp_compare(prod,moduli[i]) >= 0) 
        !          1026:                                mp_sub(prod,moduli[i]); 
        !          1027:                */
        !          1028: 
        !          1029: #ifndef UNIT8
        !          1030: #ifndef UNIT16
        !          1031:                msubs(5); 
        !          1032: #endif
        !          1033:                msubs(4);
        !          1034: #endif
        !          1035:                msubs(3); 
        !          1036:                msubs(2); 
        !          1037:                msubs(1); 
        !          1038:                msubs(0);
        !          1039: 
        !          1040:                /* Bump pointer to next lower unit of multiplier: */
        !          1041:                post_lowerunit(multiplier);
        !          1042: 
        !          1043:        }       /* Loop for the number of units in the multiplier */
        !          1044: 
        !          1045:        return(0);      /* normal return */
        !          1046: 
        !          1047: }      /* merritt_modmult */
        !          1048: 
        !          1049: 
        !          1050: #undef msubs
        !          1051: #undef sniffadd
        !          1052: 
        !          1053: 
        !          1054: /*     Merritt's mp_modmult function leaves some internal tables in memory,
        !          1055:        so we have to call modmult_burn() at the end of mp_modexp.
        !          1056:        This is so that no cryptographically sensitive data is left in memory
        !          1057:        after the program exits.
        !          1058: */
        !          1059: void merritt_burn(void)
        !          1060: /*     Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
        !          1061: {      unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
        !          1062:        unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
        !          1063:        unitfill0(msu_moduli,UNITSIZE);
        !          1064:        unitfill0(nmsu_moduli,UNITSIZE);
        !          1065: } /* merritt_burn() */
        !          1066: 
        !          1067: /******* end of Merritt's MODMULT stuff. *******/
        !          1068: /*=========================================================================*/
        !          1069: #endif /* MERRITT */
        !          1070: 
        !          1071: 
        !          1072: #ifdef UPTON_OR_SMITH  /* Used by Upton's and Smith's modmult algorithms */
        !          1073: 
        !          1074: /*     Jimmy Upton's multiprecision modmult algorithm in C.
        !          1075:        Performs a multiply combined with a modulo operation.
        !          1076: 
        !          1077:        The following support functions and data structures
        !          1078:        are used only by Upton's modmult algorithm...
        !          1079: */
        !          1080: 
        !          1081: short munit_prec;      /* global_precision expressed in MULTUNITs */
        !          1082: 
        !          1083: /*     Note that since the SPARC CPU has no hardware integer multiply
        !          1084:        instruction, there is not that much advantage in having an
        !          1085:        assembly version of mp_smul on that machine.  It might be faster
        !          1086:        to use Merritt's modmult instead of Upton's modmult on the SPARC.
        !          1087: */
        !          1088: 
        !          1089: /*
        !          1090:        Multiply the single-word multiplier times the multiprecision integer
        !          1091:        in multiplicand, accumulating result in prod.  The resulting
        !          1092:        multiprecision prod will be 1 word longer than the multiplicand.
        !          1093:        multiplicand is munit_prec words long.  We add into prod, so caller
        !          1094:        should zero it out first.  For best results, this time-critical
        !          1095:        function should be implemented in assembly.
        !          1096:        NOTE:  Unlike other functions in the multiprecision arithmetic
        !          1097:        library, both multiplicand and prod are pointing at the LSB,
        !          1098:        regardless of byte order of the machine.  On an 80x86, this makes
        !          1099:        no difference.  But if this assembly function is implemented
        !          1100:        on a 680x0, it becomes important.
        !          1101: */
        !          1102: /*     Note that this has been modified from the previous version to allow
        !          1103:        better support for Smith's modmult:
        !          1104:                The final carry bit is added to the existing product
        !          1105:                array, rather than simply stored.
        !          1106: */
        !          1107: 
        !          1108: #ifndef mp_smul
        !          1109: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
        !          1110: {
        !          1111:        short i;
        !          1112:        unsigned long p, carry;
        !          1113: 
        !          1114:        carry = 0;
        !          1115:        for (i=0; i<munit_prec; ++i)
        !          1116:        {       p = (unsigned long)multiplier * *post_higherunit(multiplicand);
        !          1117:                p += *prod + carry;
        !          1118:                *post_higherunit(prod) = (MULTUNIT)p;
        !          1119:                carry = p >> MULTUNITSIZE;
        !          1120:        }
        !          1121:        /* Add carry to the next higher word of product / dividend */
        !          1122:        *prod += (MULTUNIT)carry;
        !          1123: }      /* mp_smul */
        !          1124: #endif
        !          1125: 
        !          1126: /*     mp_dmul is a double-precision multiply multiplicand times multiplier,
        !          1127:        result into prod.  prod must be pointing at a "double precision"
        !          1128:        buffer.  E.g. If global_precision is 10 words, prod must be
        !          1129:        pointing at a 20-word buffer.
        !          1130: */
        !          1131: #ifndef mp_dmul
        !          1132: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
        !          1133: {
        !          1134:        register        int i;
        !          1135:        register        MULTUNIT *p_multiplicand, *p_multiplier;
        !          1136:        register        MULTUNIT *prodp;
        !          1137: 
        !          1138: 
        !          1139:        unitfill0(prod,global_precision*2);     /* Pre-zero prod */
        !          1140:        /* Calculate precision in units of MULTUNIT */
        !          1141:        munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
        !          1142:        p_multiplicand = (MULTUNIT *)multiplicand;
        !          1143:        p_multiplier = (MULTUNIT *)multiplier;
        !          1144:        prodp = (MULTUNIT *)prod;
        !          1145:        make_lsbptr(p_multiplicand,munit_prec);
        !          1146:        make_lsbptr(p_multiplier,munit_prec);
        !          1147:        make_lsbptr(prodp,munit_prec*2);
        !          1148:        /* Multiply multiplicand by each word in multiplier, accumulating prod: */
        !          1149:        for (i=0; i<munit_prec; ++i)
        !          1150:                mp_smul (post_higherunit(prodp),
        !          1151:                        p_multiplicand, *post_higherunit(p_multiplier));
        !          1152: }      /* mp_dmul */
        !          1153: #endif  /* mp_dmul */
        !          1154: 
        !          1155: static unit ALIGN modulus[MAX_UNIT_PRECISION];
        !          1156: static short nbits;                    /* number of modulus significant bits */
        !          1157: #endif /* UPTON_OR_SMITH */
        !          1158: 
        !          1159: #ifdef UPTON
        !          1160: 
        !          1161: /*     These scratchpad arrays are used only by upton_modmult (mp_modmult).
        !          1162:        Some of them could be staticly declared inside of mp_modmult, but we
        !          1163:        put them outside mp_modmult so that they can be wiped clean by
        !          1164:        modmult_burn(), which is called at the end of mp_modexp.  This is
        !          1165:        so that no sensitive data is left in memory after the program exits.
        !          1166: */
        !          1167: 
        !          1168: static unit ALIGN reciprocal[MAX_UNIT_PRECISION];
        !          1169: static unit ALIGN dhi[MAX_UNIT_PRECISION];
        !          1170: static unit ALIGN d_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
        !          1171: static unit ALIGN e_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
        !          1172: static unit ALIGN f_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
        !          1173: 
        !          1174: static short nbitsDivUNITSIZE;
        !          1175: static short nbitsModUNITSIZE;
        !          1176: 
        !          1177: /*     stage_upton_modulus() is aliased to stage_modulus().
        !          1178:        Prepare for an Upton modmult.  Calculate the reciprocal of modulus,
        !          1179:        and save both.  Note that reciprocal will have one more bit than
        !          1180:        modulus.
        !          1181:        Assumes that global_precision has already been adjusted to the
        !          1182:        size of the modulus, plus SLOP_BITS.
        !          1183: */
        !          1184: int stage_upton_modulus(unitptr n)
        !          1185: {
        !          1186:        mp_move(modulus, n);
        !          1187:        mp_recip(reciprocal, modulus);
        !          1188:        nbits = countbits(modulus);
        !          1189:        nbitsDivUNITSIZE = nbits / UNITSIZE;
        !          1190:        nbitsModUNITSIZE = nbits % UNITSIZE;
        !          1191:        return(0);      /* normal return */
        !          1192: }      /* stage_upton_modulus */
        !          1193: 
        !          1194: 
        !          1195: /*     Upton's algorithm performs a multiply combined with a modulo operation.
        !          1196:        Computes:  prod = (multiplicand*multiplier) mod modulus
        !          1197:        WARNING: All the arguments must be less than the modulus!
        !          1198:        References global unitptr modulus and reciprocal.
        !          1199:        The reciprocal of modulus is 1 bit longer than the modulus.
        !          1200:        upton_modmult() is aliased to mp_modmult().
        !          1201: */
        !          1202: int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
        !          1203: {
        !          1204:        unitptr d = d_data;
        !          1205:        unitptr d1 = d_data;
        !          1206:        unitptr e = e_data;
        !          1207:        unitptr f = f_data;
        !          1208:        short orig_precision;
        !          1209: 
        !          1210:        orig_precision = global_precision;
        !          1211:        mp_dmul (d,multiplicand,multiplier);
        !          1212: 
        !          1213:        /* Throw off low nbits of d */
        !          1214: #ifndef HIGHFIRST
        !          1215:        d1 = d + nbitsDivUNITSIZE;
        !          1216: #else
        !          1217:        d1 = d + orig_precision - nbitsDivUNITSIZE;
        !          1218: #endif
        !          1219:        mp_move(dhi, d1);       /* Don't screw up d, we need it later */
        !          1220:        mp_shift_right_bits(dhi, nbitsModUNITSIZE);
        !          1221: 
        !          1222:        mp_dmul (e,dhi,reciprocal);     /* Note - reciprocal has nbits+1 bits */
        !          1223: 
        !          1224: #ifndef HIGHFIRST
        !          1225:        e += nbitsDivUNITSIZE;
        !          1226: #else
        !          1227:        e += orig_precision - nbitsDivUNITSIZE;
        !          1228: #endif
        !          1229:        mp_shift_right_bits(e, nbitsModUNITSIZE);
        !          1230: 
        !          1231:        mp_dmul (f,e,modulus);
        !          1232: 
        !          1233:        /* Now for the only double-precision call to mpilib: */
        !          1234:        set_precision (orig_precision * 2);
        !          1235:        mp_sub (d,f);
        !          1236: 
        !          1237:        /* d's precision should be <= orig_precision */
        !          1238:        rescale (d, orig_precision*2, orig_precision);
        !          1239:        set_precision (orig_precision);
        !          1240: 
        !          1241:        /* Should never have to do this final subtract more than twice: */
        !          1242:        while (mp_compare(d,modulus) > 0)
        !          1243:                mp_sub (d,modulus);
        !          1244: 
        !          1245:        mp_move(prod,d);
        !          1246:        return(0);  /* normal return */
        !          1247: }      /* upton_modmult */
        !          1248: 
        !          1249: 
        !          1250: /*     Upton's mp_modmult function leaves some internal arrays in memory,
        !          1251:        so we have to call modmult_burn() at the end of mp_modexp.
        !          1252:        This is so that no cryptographically sensitive data is left in memory
        !          1253:        after the program exits.
        !          1254:        upton_burn() is aliased to modmult_burn().
        !          1255: */
        !          1256: void upton_burn(void)
        !          1257: {
        !          1258:        unitfill0(modulus,MAX_UNIT_PRECISION);
        !          1259:        unitfill0(reciprocal,MAX_UNIT_PRECISION);
        !          1260:        unitfill0(dhi,MAX_UNIT_PRECISION);
        !          1261:        unitfill0(d_data,MAX_UNIT_PRECISION*2);
        !          1262:        unitfill0(e_data,MAX_UNIT_PRECISION*2);
        !          1263:        unitfill0(f_data,MAX_UNIT_PRECISION*2);
        !          1264:        nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
        !          1265: }      /* upton_burn */
        !          1266: 
        !          1267: /******* end of Upton's MODMULT stuff. *******/
        !          1268: /*=========================================================================*/
        !          1269: #endif  /* UPTON */
        !          1270: 
        !          1271: #ifdef SMITH   /* using Thad Smith's modmult algorithm */
        !          1272: 
        !          1273: /*     Thad Smith's implementation of multiprecision modmult algorithm in C.
        !          1274:        Performs a multiply combined with a modulo operation.
        !          1275:        The multiplication is done with mp_dmul, the same as for Upton's
        !          1276:        modmult.  The modulus reduction is done by long division, in
        !          1277:        which a trial quotient "digit" is determined, then the product of
        !          1278:        that digit and the divisor are subtracted from the dividend.
        !          1279: 
        !          1280:        In this case, the digit is MULTUNIT in size and the subtraction
        !          1281:        is done by adding the product to the one's complement of the
        !          1282:        dividend, which allows use of the existing mp_smul routine.
        !          1283: 
        !          1284:        The following support functions and data structures
        !          1285:        are used only by Smith's modmult algorithm...
        !          1286: */
        !          1287: 
        !          1288: /*     These scratchpad arrays are used only by smith_modmult (mp_modmult).
        !          1289:        Some of them could be statically declared inside of mp_modmult, but we
        !          1290:        put them outside mp_modmult so that they can be wiped clean by
        !          1291:        modmult_burn(), which is called at the end of mp_modexp.  This is
        !          1292:        so that no sensitive data is left in memory after the program exits.
        !          1293: */
        !          1294: 
        !          1295: static unit ALIGN modulus[MAX_UNIT_PRECISION];
        !          1296: static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2];
        !          1297: 
        !          1298: static unit mod_quotient  [4];
        !          1299: static unit mod_divisor   [4]; /* 2 most signif. MULTUNITs of modulus */
        !          1300: 
        !          1301: static MULTUNIT *modmpl;               /* ptr to modulus least significant
        !          1302:                                        ** MULTUNIT                         */
        !          1303: static int  mshift;                    /* number of bits for
        !          1304:                                        ** recip scaling          */
        !          1305: static MULTUNIT reciph;                /* MSunit of scaled recip */
        !          1306: static MULTUNIT recipl;                /* LSunit of scaled recip */
        !          1307: 
        !          1308: static short modlenMULTUNITS;          /* length of modulus in MULTUNITs */
        !          1309: static MULTUNIT mutemp;                /* temporary */
        !          1310: 
        !          1311: /*     The routines mp_smul and mp_dmul are the same as for UPTON and
        !          1312:        should be coded in assembly.  Note, however, that the previous
        !          1313:        Upton's mp_smul version has been modified to compatible with
        !          1314:        Smith's modmult.  The new version also still works for Upton's
        !          1315:        modmult.
        !          1316: */
        !          1317: 
        !          1318: #ifndef mp_set_recip
        !          1319: #define mp_set_recip(rh,rl,m)     /* null */
        !          1320: #else
        !          1321: /* setup routine for external mp_quo_digit */
        !          1322: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m);
        !          1323: #endif
        !          1324: MULTUNIT mp_quo_digit (MULTUNIT *dividend);
        !          1325: 
        !          1326: #ifdef MULTUNIT_SIZE_SAME
        !          1327: #define mp_musubb mp_subb      /* use existing routine */
        !          1328: #else  /* ! MULTUNIT_SIZE_SAME */
        !          1329: 
        !          1330: /*     This performs the same function as mp_subb, but with MULTUNITs.
        !          1331:        Note: Processors without alignment requirements may be able to use
        !          1332:        mp_subb, even though MULTUNITs are smaller than units.  In that case,
        !          1333:        use mp_subb, since it would be faster if coded in assembly.  Note that
        !          1334:        this implementation won't work for MULTUNITs which are long -- use
        !          1335:        mp_subb in that case.
        !          1336: */
        !          1337: #ifndef mp_musubb
        !          1338: boolean mp_musubb
        !          1339:        (register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow)
        !          1340:        /* multiprecision subtract of MULTUNITs with borrow, r2 from r1,
        !          1341:        ** result in r1 */
        !          1342:        /* borrow is incoming borrow flag-- value should be 0 or 1 */
        !          1343: {      register ulint x;       /* won't work if sizeof(MULTUNIT)==
        !          1344:                                                 sizeof(long)       */
        !          1345:        short precision;        /* number of MULTUNITs to subtract */
        !          1346:        precision = global_precision * MULTUNITs_perunit;
        !          1347:        make_lsbptr(r1,precision);
        !          1348:        make_lsbptr(r2,precision);
        !          1349:        while (precision--)
        !          1350:        {       x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
        !          1351:                *post_higherunit(r1) = x;
        !          1352:                borrow = (((1L << MULTUNITSIZE) & x) != 0L);
        !          1353:        }
        !          1354:        return (borrow);
        !          1355: }      /* mp_musubb */
        !          1356: #endif /* mp_musubb */
        !          1357: #endif /* MULTUNIT_SIZE_SAME */
        !          1358: 
        !          1359: /*     The function mp_quo_digit is the heart of Smith's modulo reduction,
        !          1360:        which uses a form of long division.  It computes a trial quotient
        !          1361:        "digit" (MULTUNIT-sized digit) by multiplying the three most
        !          1362:        significant MULTUNITs of the dividend by the two most significant
        !          1363:        MULTUNITs of the reciprocal of the modulus.  Note that this function
        !          1364:        requires that MULTUNITSIZE * 2 <= sizeof(unsigned long).
        !          1365: 
        !          1366:        An important part of this technique is that the quotient never be
        !          1367:        too small, although it may occasionally be too large.  This was
        !          1368:        done to eliminate the need to check and correct for a remainder
        !          1369:        exceeding the divisor.  It is easier to check for a negative
        !          1370:        remainder.  The following technique rarely needs correction for
        !          1371:        MULTUNITs of at least 16 bits.
        !          1372: 
        !          1373:        The following routine has two implementations:
        !          1374: 
        !          1375:        ASM_PROTOTYPE defined: written to be an executable prototype for
        !          1376:        an efficient assembly language implementation.  Note that several
        !          1377:        of the following masks and shifts can be done by directly
        !          1378:        manipulating single precision registers on some architectures.
        !          1379: 
        !          1380:        ASM_PROTOTYPE undefined: a slightly more efficient implementation
        !          1381:        in C.  Although this version returns a result larger than the
        !          1382:        optimum (which is corrected in smith_modmult) more often than the
        !          1383:        prototype version, the faster execution in C more than makes up
        !          1384:        for the difference.
        !          1385:  
        !          1386:        Parameter: dividend - points to the most significant MULTUNIT
        !          1387:                of the dividend.  Note that dividend actually contains the 
        !          1388:                one's complement of the actual dividend value (see comments for 
        !          1389:                smith_modmult).
        !          1390:        
        !          1391:         Return: the trial quotient digit resulting from dividing the first
        !          1392:                three MULTUNITs at dividend by the upper two MULTUNITs of the 
        !          1393:                modulus.
        !          1394: */
        !          1395: 
        !          1396: /* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */
        !          1397: 
        !          1398: #ifndef mp_quo_digit
        !          1399: MULTUNIT mp_quo_digit (MULTUNIT *dividend) {
        !          1400:        unsigned long q, q0, q1, q2;
        !          1401:        unsigned short lsb_factor;
        !          1402: 
        !          1403: /*     Compute the least significant product group.
        !          1404:        The last terms of q1 and q2 perform upward rounding, which is
        !          1405:        needed to guarantee that the result not be too small.
        !          1406: */
        !          1407:        q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph
        !          1408:             + reciph;
        !          1409:        q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl
        !          1410:             + (1L << MULTUNITSIZE) ;
        !          1411: #ifdef ASM_PROTOTYPE
        !          1412:        lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE);
        !          1413:        q = q1 + q2;
        !          1414: 
        !          1415:        /* The following statement is equivalent to shifting the sum right
        !          1416:           one bit while shifting in the carry bit.
        !          1417:        */
        !          1418:        q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
        !          1419: #else  /* optimized C version */
        !          1420:        q0 = (q1>>1) + (q2>>1) + 1;
        !          1421: #endif
        !          1422: 
        !          1423: /*     Compute the middle significant product group.   */
        !          1424: 
        !          1425:        q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph;
        !          1426:        q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl;
        !          1427: #ifdef ASM_PROTOTYPE
        !          1428:        q = q1 + q2;
        !          1429:        q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
        !          1430: 
        !          1431: /*     Add in the most significant word of the first group.
        !          1432:        The last term takes care of the carry from adding the lsb's
        !          1433:        that were shifted out prior to addition.
        !          1434: */
        !          1435:        q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2));
        !          1436: #else  /* optimized C version */
        !          1437:        q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1;
        !          1438: #endif
        !          1439: 
        !          1440: /*     Compute the most significant term and add in the others */
        !          1441: 
        !          1442:        q = (q >> (MULTUNITSIZE-2)) +
        !          1443:            (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1);
        !          1444:        q >>= mshift;
        !          1445: 
        !          1446: /*     Prevent overflow and then wipe out the intermediate results. */
        !          1447: 
        !          1448:        mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1);
        !          1449:        q= q0 = q1 = q2 = 0;   lsb_factor = 0; (void)lsb_factor;
        !          1450:        return mutemp;
        !          1451: }
        !          1452: #endif /* mp_quo_digit */
        !          1453: 
        !          1454: /*     stage_smith_modulus() - Prepare for a Smith modmult.
        !          1455: 
        !          1456:        Calculate the reciprocal of modulus with a precision of two MULTUNITs.
        !          1457:        Assumes that global_precision has already been adjusted to the
        !          1458:        size of the modulus, plus SLOP_BITS.
        !          1459: 
        !          1460:        Note: This routine was designed to work with large values and
        !          1461:        doesn't have the necessary testing or handling to work with a
        !          1462:        modulus having less than three significant units.  For such cases,
        !          1463:        the separate multiply and modulus routines can be used.
        !          1464: 
        !          1465:        stage_smith_modulus() is aliased to stage_modulus().
        !          1466: */
        !          1467: 
        !          1468: int stage_smith_modulus(unitptr n_modulus)
        !          1469: {
        !          1470:        int   original_precision;
        !          1471:        int   sigmod;           /* significant units in modulus */
        !          1472:        unitptr  mp;            /* modulus most significant pointer */
        !          1473:        MULTUNIT *mpm;          /* reciprocal pointer */
        !          1474:        int   prec;             /* precision of reciprocal calc in units */
        !          1475: 
        !          1476:        mp_move(modulus, n_modulus);
        !          1477:        modmpl = (MULTUNIT*) modulus;
        !          1478:        modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit);
        !          1479:        nbits = countbits(modulus);
        !          1480:        modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE;
        !          1481: 
        !          1482:        original_precision = global_precision;
        !          1483: 
        !          1484:        /* The following code copies the three most significant units of
        !          1485:         * modulus to mod_divisor.
        !          1486:        */
        !          1487:        mp = modulus;
        !          1488:        sigmod = significance (modulus);
        !          1489:        rescale (mp, original_precision, sigmod);
        !          1490: /* prec is the unit precision required for 3 MULTUNITs */
        !          1491:        prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit;
        !          1492:        set_precision (prec);
        !          1493:  
        !          1494:        /* set mp = ptr to most significant units of modulus, then move
        !          1495:         * the most significant units to mp_divisor 
        !          1496:        */
        !          1497:        mp = msbptr(mp,sigmod) -tohigher(prec-1);
        !          1498:        unmake_lsbptr (mp, prec);
        !          1499:        mp_move (mod_divisor, mp);
        !          1500:  
        !          1501:        /* Keep 2*MULTUNITSIZE bits in mod_divisor.
        !          1502:         * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits.
        !          1503:        */
        !          1504:        mshift = countbits (mod_divisor) - 2*MULTUNITSIZE;
        !          1505:        mp_shift_right_bits (mod_divisor, mshift);
        !          1506:        mp_recip(mod_quotient,mod_divisor);
        !          1507:        mp_shift_right_bits (mod_quotient,1);
        !          1508: 
        !          1509:        /* Reduce to:   0 < mshift <= MULTUNITSIZE */
        !          1510:        mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1;
        !          1511:        /* round up, rescaling if necessary */
        !          1512:        mp_inc (mod_quotient);
        !          1513:        if (countbits (mod_quotient) > 2*MULTUNITSIZE) {
        !          1514:                mp_shift_right_bits (mod_quotient,1);
        !          1515:                mshift--;       /* now  0 <= mshift <= MULTUNITSIZE */
        !          1516:        }
        !          1517:        mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit);
        !          1518:        recipl = *post_higherunit (mpm);
        !          1519:        reciph = *mpm;
        !          1520:        mp_set_recip (reciph, recipl, mshift);
        !          1521:        set_precision (original_precision);
        !          1522:        return(0);      /* normal return */
        !          1523: }      /* stage_smith_modulus */
        !          1524: 
        !          1525: /*     Smith's algorithm performs a multiply combined with a modulo operation.
        !          1526:        Computes:  prod = (multiplicand*multiplier) mod modulus
        !          1527:        The modulus must first be set by stage_smith_modulus().
        !          1528:        smith_modmult() is aliased to mp_modmult().
        !          1529: */
        !          1530: 
        !          1531: int
        !          1532: smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
        !          1533: {
        !          1534:        unitptr    d;   /* ptr to product */ 
        !          1535:        MULTUNIT   *dmph, *dmpl, *dmp;  /* ptrs to dividend (high, low, first)
        !          1536:                                         * aligned for subtraction           */
        !          1537: /*     Note that dmph points one MULTUNIT higher than indicated by
        !          1538:        global precision.  This allows us to zero out a word one higher than
        !          1539:        the normal precision.
        !          1540: */
        !          1541:        short       orig_precision;
        !          1542:        short       nqd;        /* number of quotient digits remaining to
        !          1543:                                 * be generated                              */
        !          1544:        short       dmi;        /* number of significant MULTUNITs in product */
        !          1545: 
        !          1546:        d = ds_data + 1;        /* room for leading MSB if HIGHFIRST */
        !          1547:        orig_precision = global_precision;
        !          1548:        mp_dmul(d, multiplicand, multiplier);
        !          1549: 
        !          1550:        rescale(d, orig_precision * 2, orig_precision * 2 + 1);
        !          1551:        set_precision(orig_precision * 2 + 1);
        !          1552:        *msbptr(d, global_precision) = 0;       /* leading 0 unit */
        !          1553: 
        !          1554: /*     We now start working with MULTUNITs.
        !          1555:        Determine the most significant MULTUNIT of the product so we don't
        !          1556:        have to process leading zeros in our divide loop.
        !          1557: */
        !          1558:        dmi = significance(d) * MULTUNITs_perunit;
        !          1559:        if (dmi >= modlenMULTUNITS)
        !          1560:        {       /* Make dividend negative.  This allows the use of mp_smul to
        !          1561:                 * "subtract" the product of the modulus and the trial divisor
        !          1562:                 * by actually adding to a negative dividend.
        !          1563:                 * The one's complement of the dividend is used, since it causes
        !          1564:                 * a zero value to be represented as all ones.  This facilitates
        !          1565:                 * testing the result for possible overflow, since a sign bit
        !          1566:                 * indicates that no adjustment is necessary, and we should not
        !          1567:                 * attempt to adjust if the result of the addition is zero.
        !          1568:                 */
        !          1569:                mp_inc(d);
        !          1570:                mp_neg(d);
        !          1571:                set_precision(orig_precision);
        !          1572:                munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
        !          1573: 
        !          1574:                /* Set msb, lsb, and normal ptrs of dividend */
        !          1575:                dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) *
        !          1576:                            MULTUNITs_perunit) + tohigher(dmi);
        !          1577:                nqd = dmi + 1 - modlenMULTUNITS;
        !          1578:                dmpl = dmph - tohigher(modlenMULTUNITS);
        !          1579: 
        !          1580: /*     Divide loop.
        !          1581:        Each iteration computes the next quotient MULTUNIT digit, then
        !          1582:        multiplies the divisor (modulus) by the quotient digit and adds
        !          1583:        it to the one's complement of the dividend (equivalent to
        !          1584:        subtracting).  If the product was greater than the remaining dividend,
        !          1585:        we get a non-negative result, in which case we subtract off the
        !          1586:        modulus to get the proper negative remainder.
        !          1587: */
        !          1588:                for (; nqd; nqd--)
        !          1589:                {       MULTUNIT    q;      /* quotient trial digit */
        !          1590: 
        !          1591:                        q = mp_quo_digit(dmph);
        !          1592:                        if (q > 0)
        !          1593:                        {       mp_smul(dmpl, modmpl, q);
        !          1594: 
        !          1595:                                /* Perform correction if q too large.
        !          1596:                                *  This rarely occurs.
        !          1597:                                */
        !          1598:                                if (!(*dmph & MULTUNIT_msb))
        !          1599:                                {       dmp = dmpl;
        !          1600:                                        unmake_lsbptr(dmp, orig_precision * 
        !          1601:                                                   MULTUNITs_perunit);
        !          1602:                                        if (mp_musubb(dmp,
        !          1603:                                                   (MULTUNIT *) modulus, 0))
        !          1604:                                                (*dmph)  --;
        !          1605:                                }
        !          1606:                        }
        !          1607:                        pre_lowerunit(dmph);
        !          1608:                        pre_lowerunit(dmpl);
        !          1609:                }
        !          1610:                /* d contains the one's complement of the remainder. */
        !          1611:                rescale(d, orig_precision * 2 + 1, orig_precision);
        !          1612:                set_precision(orig_precision);
        !          1613:                mp_neg(d);
        !          1614:                mp_dec(d);
        !          1615:        } else
        !          1616:        {       /* Product was less than modulus.  Return it. */
        !          1617:                rescale(d, orig_precision * 2 + 1, orig_precision);
        !          1618:                set_precision(orig_precision);
        !          1619:        }
        !          1620:        mp_move(prod, d);
        !          1621:        return (0);             /* normal return */
        !          1622: }                              /* smith_modmult */
        !          1623: 
        !          1624: 
        !          1625: /*     Smith's mp_modmult function leaves some internal arrays in memory,
        !          1626:        so we have to call modmult_burn() at the end of mp_modexp.
        !          1627:        This is so that no cryptographically sensitive data is left in memory
        !          1628:        after the program exits.
        !          1629:        smith_burn() is aliased to modmult_burn().
        !          1630: */
        !          1631: void smith_burn(void)
        !          1632: {
        !          1633:        empty_array (modulus);
        !          1634:        empty_array (ds_data);
        !          1635:        empty_array (mod_quotient);
        !          1636:        empty_array (mod_divisor);
        !          1637:        modmpl = 0;
        !          1638:        mshift =nbits = 0;
        !          1639:        reciph = recipl = 0;
        !          1640:        modlenMULTUNITS = mutemp = 0;
        !          1641:        mp_set_recip (0,0,0);
        !          1642: }      /* smith_burn */
        !          1643: 
        !          1644: /*     End of Thad Smith's implementation of modmult. */
        !          1645: 
        !          1646: #endif /* SMITH */
        !          1647: 
        !          1648: 
        !          1649: int countbits(unitptr r)
        !          1650:        /* Returns number of significant bits in r */
        !          1651: {      int bits;
        !          1652:        short prec;
        !          1653:        register unit bitmask;
        !          1654:        init_bitsniffer(r,bitmask,prec,bits);
        !          1655:        return(bits);
        !          1656: } /* countbits */
        !          1657: 
        !          1658: 
        !          1659: char *copyright_notice(void)
        !          1660: /* force linker to include copyright notice in the executable object image. */
        !          1661: { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */
        !          1662: 
        !          1663: 
        !          1664: int mp_modexp(register unitptr expout,register unitptr expin,
        !          1665:        register unitptr exponent,register unitptr modulus)
        !          1666: {      /*      Russian peasant combined exponentiation/modulo algorithm.
        !          1667:                Calls modmult instead of mult.
        !          1668:                Computes:  expout = (expin**exponent) mod modulus
        !          1669:                WARNING: All the arguments must be less than the modulus!
        !          1670:        */
        !          1671:        int bits;
        !          1672:        short oldprecision;
        !          1673:        register unit bitmask;
        !          1674:        unit product[MAX_UNIT_PRECISION];
        !          1675:        short eprec;
        !          1676: 
        !          1677: #ifdef COUNTMULTS
        !          1678:        tally_modmults = 0;     /* clear "number of modmults" counter */
        !          1679:        tally_modsquares = 0;   /* clear "number of modsquares" counter */
        !          1680: #endif /* COUNTMULTS */
        !          1681:        mp_init(expout,1);
        !          1682:        if (testeq(exponent,0))
        !          1683:        {       if (testeq(expin,0))
        !          1684:                        return(-1); /* 0 to the 0th power means return error */
        !          1685:                return(0);      /* otherwise, zero exponent means expout is 1 */
        !          1686:        }
        !          1687:        if (testeq(modulus,0))
        !          1688:                return(-2);             /* zero modulus means error */
        !          1689: #if SLOP_BITS > 0      /* if there's room for sign bits */
        !          1690:        if (mp_tstminus(modulus))
        !          1691:                return(-2);             /* negative modulus means error */
        !          1692: #endif /* SLOP_BITS > 0 */
        !          1693:        if (mp_compare(expin,modulus) >= 0)
        !          1694:                return(-3); /* if expin >= modulus, return error */
        !          1695:        if (mp_compare(exponent,modulus) >= 0)
        !          1696:                return(-4); /* if exponent >= modulus, return error */
        !          1697: 
        !          1698:        oldprecision = global_precision;        /* save global_precision */
        !          1699:        /* set smallest optimum precision for this modulus */
        !          1700:        set_precision(bits2units(countbits(modulus)+SLOP_BITS));
        !          1701:        /* rescale all these registers to global_precision we just defined */
        !          1702:        rescale(modulus,oldprecision,global_precision);
        !          1703:        rescale(expin,oldprecision,global_precision);
        !          1704:        rescale(exponent,oldprecision,global_precision);
        !          1705:        rescale(expout,oldprecision,global_precision);
        !          1706: 
        !          1707:        if (stage_modulus(modulus))
        !          1708:        {       set_precision(oldprecision);    /* restore original precision */
        !          1709:                return(-5);             /* unstageable modulus (STEWART algorithm) */
        !          1710:        }
        !          1711: 
        !          1712:        /* normalize and compute number of bits in exponent first */
        !          1713:        init_bitsniffer(exponent,bitmask,eprec,bits);
        !          1714: 
        !          1715:        /* We can "optimize out" the first modsquare and modmult: */
        !          1716:        bits--;         /* We know for sure at this point that bits>0 */
        !          1717:        mp_move(expout,expin);          /*  expout = (1*1)*expin; */
        !          1718:        bump_bitsniffer(exponent,bitmask);
        !          1719: 
        !          1720:        while (bits--)
        !          1721:        {
        !          1722:                poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
        !          1723: #ifdef COUNTMULTS
        !          1724:                tally_modsquares++;     /* bump "number of modsquares" counter */
        !          1725: #endif /* COUNTMULTS */
        !          1726:                mp_modsquare(product,expout);
        !          1727:                if (sniff_bit(exponent,bitmask))
        !          1728:                {       mp_modmult(expout,product,expin);
        !          1729: #ifdef COUNTMULTS
        !          1730:                        tally_modmults++;       /* bump "number of modmults" counter */
        !          1731: #endif /* COUNTMULTS */
        !          1732:                } else
        !          1733:                {
        !          1734:                        mp_move(expout,product);
        !          1735:                }
        !          1736:                bump_bitsniffer(exponent,bitmask);
        !          1737:        }       /* while bits-- */
        !          1738:        mp_burn(product);       /* burn the evidence on the stack */
        !          1739:        modmult_burn(); /* ask mp_modmult to also burn its own evidence */
        !          1740: 
        !          1741: #ifdef COUNTMULTS      /* diagnostic analysis */
        !          1742:        {       long atomic_mults;
        !          1743:                unsigned int unitcount,totalmults;
        !          1744:                unitcount = bits2units(countbits(modulus));
        !          1745:                /* calculation assumes modsquare takes as long as a modmult: */
        !          1746:                atomic_mults = (long) tally_modmults * (unitcount * unitcount);
        !          1747:                atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
        !          1748:                printf("%ld atomic mults for ",atomic_mults);
        !          1749:                printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
        !          1750:                        tally_modsquares,tally_modmults,
        !          1751:                        tally_modsquares+tally_modmults,
        !          1752:                        countbits(modulus), unitcount);
        !          1753:        }
        !          1754: #endif /* COUNTMULTS */
        !          1755: 
        !          1756:        set_precision(oldprecision);    /* restore original precision */
        !          1757: 
        !          1758:        /* Do an explicit reference to the copyright notice so that the linker
        !          1759:           will be forced to include it in the executable object image... */
        !          1760:        copyright_notice();     /* has no real effect at run time */
        !          1761: 
        !          1762:        return(0);              /* normal return */
        !          1763: }      /* mp_modexp */
        !          1764: 
        !          1765: 
        !          1766: 
        !          1767: 
        !          1768: /*********************************************************************
        !          1769:        RSA-specific routines follow.  These are the only functions that
        !          1770:        are specific to the RSA public key cryptosystem.  The other
        !          1771:        multiprecision integer math functions may be used for non-RSA
        !          1772:        applications.
        !          1773: 
        !          1774:        The RSA public key cryptosystem is patented by the Massachusetts
        !          1775:        Institute of Technology (U.S. patent #4,405,829).  This patent does
        !          1776:        not apply outside the USA.  Public Key Partners (PKP) holds the
        !          1777:        exclusive commercial license to sell and sub-license the RSA public
        !          1778:        key cryptosystem.  The author of this software implementation of the
        !          1779:        RSA algorithm is providing this software for educational use only.
        !          1780:        Licensing this algorithm from PKP is the responsibility of you, the
        !          1781:        user, not Philip Zimmermann, the author of this software.  The author
        !          1782:        assumes no liability for any breach of patent law resulting from the
        !          1783:        unlicensed use of this software by the user.
        !          1784: *********************************************************************/
        !          1785: 
        !          1786: 
        !          1787: int rsa_decrypt(unitptr M, unitptr C,
        !          1788:        unitptr d, unitptr p, unitptr q, unitptr u)
        !          1789:        /*      Uses Quisquater's Chinese Remainder Theorem shortcut
        !          1790:                for RSA decryption.
        !          1791:                M is the output plaintext message.
        !          1792:                C is the input ciphertext message.
        !          1793:                d is the secret decryption exponent.
        !          1794:                p and q are the secret prime factors of n.
        !          1795:                u is the multiplicative inverse of p, mod q.
        !          1796:                Note that u is precomputed on the assumption that p<q.
        !          1797:                n, the common modulus, is not used here because of the
        !          1798:                Chinese Remainder Theorem shortcut.
        !          1799:        */
        !          1800: {
        !          1801:        unit p2[MAX_UNIT_PRECISION];
        !          1802:        unit q2[MAX_UNIT_PRECISION];
        !          1803:        unit temp1[MAX_UNIT_PRECISION];
        !          1804:        unit temp2[MAX_UNIT_PRECISION];
        !          1805:        int status;
        !          1806: 
        !          1807:        mp_init(M,1);   /* initialize result in case of error */
        !          1808: 
        !          1809:        if (mp_compare(p,q) >= 0)       /* ensure that p<q */
        !          1810:        {       /* swap the pointers p and q */
        !          1811:                unitptr t;
        !          1812:                t = p;  p = q; q = t;
        !          1813:        }
        !          1814: 
        !          1815: /*     Rather than decrypting by computing modexp with full mod n
        !          1816:        precision, compute a shorter modexp with mod p precision... */
        !          1817: 
        !          1818: /*     p2 = [ (C mod p)**( d mod (p-1) ) ] mod p               */
        !          1819:        mp_move(temp1,p);
        !          1820:        mp_dec(temp1);                  /* temp1 = p-1 */
        !          1821:        mp_mod(temp2,d,temp1);  /* temp2 = d mod (p-1) ) */
        !          1822:        mp_mod(temp1,C,p);              /* temp1 = C mod p  */
        !          1823:        status = mp_modexp(p2,temp1,temp2,p);
        !          1824:        if (status < 0) /* mp_modexp returned an error. */
        !          1825:                return(status); /* error return */
        !          1826: 
        !          1827: /*     Now compute a short modexp with mod q precision... */
        !          1828: 
        !          1829: /*     q2 = [ (C mod q)**( d mod (q-1) ) ] mod q               */
        !          1830:        mp_move(temp1,q);
        !          1831:        mp_dec(temp1);                  /* temp1 = q-1 */
        !          1832:        mp_mod(temp2,d,temp1);  /* temp2 = d mod (q-1) */
        !          1833:        mp_mod(temp1,C,q);              /* temp1 = C mod q  */
        !          1834:        status = mp_modexp(q2,temp1,temp2,q);
        !          1835:        if (status < 0) /* mp_modexp returned an error. */
        !          1836:                return(status); /* error return */
        !          1837: 
        !          1838: /*     Now use the multiplicative inverse u to glue together the
        !          1839:        two halves, saving a lot of time by avoiding a full mod n
        !          1840:        exponentiation.  At key generation time, u was computed
        !          1841:        with the secret key as the multiplicative inverse of
        !          1842:        p, mod q such that:  (p*u) mod q = 1, assuming p<q.
        !          1843: */
        !          1844:        if (mp_compare(p2,q2) == 0)     /* only happens if C<p */
        !          1845:                mp_move(M,p2);
        !          1846:        else
        !          1847:        {       /*      q2 = q2 - p2;  if q2<0 then q2 = q2 + q  */
        !          1848:                if (mp_sub(q2,p2))      /* if the result went negative... */
        !          1849:                        mp_add(q2,q);           /* add q to q2 */
        !          1850: 
        !          1851:                /*      M = p2 + ( p * [(q2*u) mod q] ) */
        !          1852:                mp_mult(temp1,q2,u);            /* temp1 = q2*u  */
        !          1853:                mp_mod(temp2,temp1,q);          /* temp2 = temp1 mod q */
        !          1854:                mp_mult(temp1,p,temp2); /* temp1 = p * temp2 */
        !          1855:                mp_add(temp1,p2);               /* temp1 = temp1 + p2 */
        !          1856:                mp_move(M,temp1);               /* M = temp1 */
        !          1857:        }
        !          1858: 
        !          1859:        mp_burn(p2);    /* burn the evidence on the stack...*/
        !          1860:        mp_burn(q2);
        !          1861:        mp_burn(temp1);
        !          1862:        mp_burn(temp2);
        !          1863:        return(0);      /* normal return */
        !          1864: }      /* rsa_decrypt */
        !          1865: 
        !          1866: 
        !          1867: /****************** end of MPI library ****************************/
        !          1868:
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.