Annotation of pgp/src/mpilib.c, revision 1.1.1.4

1.1.1.2   root        1: /*     C source code for multiprecision arithmetic library routines.
                      2:        Implemented Nov 86 by Philip Zimmermann
                      3:        Last revised 27 Nov 91 by PRZ
                      4: 
                      5:        Boulder Software Engineering
                      6:        3021 Eleventh Street
                      7:        Boulder, CO 80304
                      8:        (303) 541-0140
                      9: 
                     10:        (c) Copyright 1986-92 by Philip Zimmermann.  All rights reserved.
                     11:        The author assumes no liability for damages resulting from the use
                     12:        of this software, even if the damage results from defects in this
                     13:        software.  No warranty is expressed or implied.  The use of this
                     14:        cryptographic software for developing weapon systems is expressly
                     15:        forbidden.
                     16: 
                     17:        These routines implement all of the multiprecision arithmetic
                     18:        necessary for number-theoretic cryptographic algorithms such as
                     19:        ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
                     20:        composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
                     21:        key cryptography.
                     22: 
                     23:        Although originally developed in Microsoft C for the IBM PC, this code
                     24:        contains few machine dependencies.  It assumes 2's complement
                     25:        arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines,
                     26:        lowbyte-highbyte order or highbyte-lowbyte order.  This version
                     27:        has been converted to ANSI C.
                     28: 
                     29: 
                     30:        The internal representation for these extended precision integer
                     31:        "registers" is an array of "units".  A unit is a machine word, which
                     32:        is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
                     33:        unsigned integer, depending on the machine's word size.  For example,
                     34:        an IBM PC or AT uses a unit size of 16 bits.  To perform arithmetic
                     35:        on these huge precision integers, we pass pointers to these unit
                     36:        arrays to various subroutines.  A pointer to an array of units is of
                     37:        type unitptr.  This is a pointer to a huge integer "register".
                     38: 
                     39:        When calling a subroutine, we always pass a pointer to the BEGINNING
                     40:        of the array of units, regardless of the byte order of the machine.
                     41:        On a lowbyte-first machine, such as the Intel 80x86, this unitptr
                     42:        points to the LEAST significant unit, and the array of units increases
                     43:        significance to the right.  On a highbyte-first machine, such as the
                     44:        Motorola 680x0, this unitptr points to the MOST significant unit, and
                     45:        the array of units decreases significance to the right.
                     46: 
                     47:        Modified 8 Apr 92 - HAJK
                     48:        Implement new VAX/VMS primitive support.
                     49: 
                     50:        Modified 30 Sep 92 -Castor Fu <[email protected]>
                     51:        Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long)
                     52: 
                     53:        Modified 28 Nov 92 - Thad Smith
                     54:        Added Smith modmult, generalized non-portable support.
                     55: */
                     56: 
                     57: /* #define COUNTMULTS */ /* count modmults for performance studies */
                     58: 
                     59: #ifdef DEBUG
                     60: #ifdef MSDOS
1.1.1.3   root       61: #ifdef __GO32__ /* DJGPP */
                     62: #include <pc.h>
                     63: #else
1.1.1.2   root       64: #include <conio.h>
1.1.1.3   root       65: #endif  /* __GO32__ */
1.1.1.2   root       66: #define poll_for_break() {while (kbhit()) getch();}
                     67: #endif /* MSDOS */
                     68: #endif /* DEBUG */
                     69: 
                     70: #ifndef poll_for_break
                     71: #define poll_for_break()  /* stub */
                     72: #endif
                     73: 
                     74: #include "mpilib.h"
                     75: 
                     76: #ifdef mp_smula
                     77: #ifdef mp_smul
                     78:        Error: Both mp_smula and mp_smul cannot be defined.
                     79: #else
                     80: #define mp_smul        mp_smula
                     81: #endif
                     82: #endif
                     83: 
                     84: /* set macros for MULTUNIT */
                     85: #ifdef MUNIT8
                     86: #define MULTUNITSIZE   8
                     87: typedef unsigned char MULTUNIT;
                     88: #ifdef UNIT8
                     89: #define MULTUNIT_SIZE_SAME
                     90: #endif
                     91: #else  /* not MUNIT8 */
                     92: #ifdef MUNIT32
                     93: #define MULTUNITSIZE   32
                     94: typedef unsigned long MULTUNIT;
                     95: #ifdef UNIT32
                     96: #define MULTUNIT_SIZE_SAME
                     97: #else
                     98: /* #error is not portable, this has the same effect */
                     99: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
                    100: #endif
                    101: #else   /* assume MUNIT16 */
                    102: #define MULTUNITSIZE   16
                    103: typedef unsigned short MULTUNIT;
                    104: #ifdef UNIT16
                    105: #define MULTUNIT_SIZE_SAME
                    106: #endif /* UNIT16 */
                    107: #ifdef UNIT8
                    108: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
                    109: #endif /* UNIT8 */
                    110: #endif /* MUNIT32 */
                    111: #endif /* MUNIT8 */
                    112: 
                    113: #define MULTUNIT_msb    ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */
                    114: #define DMULTUNIT_msb   (1L << (2*MULTUNITSIZE-1))
                    115: #define MULTUNIT_mask   ((MULTUNIT)((1L << MULTUNITSIZE)-1))
                    116: #define MULTUNITs_perunit   (UNITSIZE/MULTUNITSIZE)
                    117: 
                    118: 
                    119: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier);
                    120: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
                    121: 
                    122: short global_precision = 0; /* units of precision for all routines */
                    123: /*     global_precision is the unit precision last set by set_precision.
                    124:        Initially, set_precision() should be called to define global_precision
                    125:        before using any of these other multiprecision library routines.
                    126:        i.e.:   set_precision(MAX_UNIT_PRECISION);
                    127: */
                    128: 
                    129: /*************** multiprecision library primitives ****************/
                    130: /*     The following portable C primitives should be recoded in assembly.
                    131:        The entry point name should be defined, in "mpilib.h" to the external
                    132:        entry point name.  If undefined, the C version will be used.
                    133: */
                    134: 
                    135: typedef unsigned long int ulint;
                    136: 
                    137: #ifndef mp_addc
                    138: boolean mp_addc
                    139:        (register unitptr r1,register unitptr r2,register boolean carry)
                    140:        /* multiprecision add with carry r2 to r1, result in r1 */
                    141:        /* carry is incoming carry flag-- value should be 0 or 1 */
                    142: {      register unit x;
                    143:        short precision;        /* number of units to add */
                    144:        precision = global_precision;
                    145:        make_lsbptr(r1,precision);
                    146:        make_lsbptr(r2,precision);
                    147:        while (precision--)
                    148:        {
                    149:                if (carry)
                    150:                {       x = *r1 + *r2 + 1;
                    151:                        carry = (*r2  >= (unit)(~ *r1));
                    152:                } else
                    153:                {       x = *r1 + *r2;
                    154:                        carry = (x < *r1) ;
                    155:                }
                    156:                post_higherunit(r2);
                    157:                *post_higherunit(r1) = x;
                    158:        }
                    159:        return(carry);          /* return the final carry flag bit */
                    160: }      /* mp_addc */
                    161: #endif  /* mp_addc */
                    162: 
                    163: #ifndef mp_subb
                    164: boolean mp_subb
                    165:        (register unitptr r1,register unitptr r2,register boolean borrow)
                    166:        /* multiprecision subtract with borrow, r2 from r1, result in r1 */
                    167:        /* borrow is incoming borrow flag-- value should be 0 or 1 */
                    168: {      register unit x;
                    169:        short precision;        /* number of units to subtract */
                    170:        precision = global_precision;
                    171:        make_lsbptr(r1,precision);
                    172:        make_lsbptr(r2,precision);
                    173:        while (precision--)
                    174:        {       if (borrow)
                    175:                {       x = *r1 - *r2 - 1;
                    176:                        borrow = (*r1 <= *r2);
                    177:                } else
                    178:                {       x = *r1 - *r2;
                    179:                        borrow = (*r1 < *r2);
                    180:                }
                    181:                post_higherunit(r2);
                    182:                *post_higherunit(r1) = x;
                    183:        }
                    184:        return(borrow); /* return the final carry/borrow flag bit */
                    185: }      /* mp_subb */
                    186: #endif  /* mp_subb */
                    187: 
                    188: #ifndef mp_rotate_left
                    189: boolean mp_rotate_left(register unitptr r1,register boolean carry)
                    190:        /* multiprecision rotate left 1 bit with carry, result in r1. */
                    191:        /* carry is incoming carry flag-- value should be 0 or 1 */
                    192: {      register int precision; /* number of units to rotate */
                    193:        unsigned int mcarry = carry, nextcarry; /* int is supposed to be
                    194:                                                 * the efficient size for ops*/
                    195:        precision = global_precision;
                    196:        make_lsbptr(r1,precision);
                    197:        while (precision--)
                    198:        {
                    199:                nextcarry = (((signedunit) *r1) < 0);
                    200:                *r1 = (*r1 << 1) | mcarry;
                    201:                mcarry = nextcarry;
                    202:                pre_higherunit(r1);
                    203:        }
                    204:        return(nextcarry);      /* return the final carry flag bit */
                    205: }      /* mp_rotate_left */
                    206: #endif  /* mp_rotate_left */
                    207: 
                    208: /************** end of primitives that should be in assembly *************/
                    209: 
                    210: 
                    211: /*     The mp_shift_right_bits function is not called in any time-critical
                    212:        situations in public-key cryptographic functions, so it doesn't
                    213:        need to be coded in assembly language.
                    214: */
                    215: void mp_shift_right_bits(register unitptr r1,register short bits)
                    216:        /*      multiprecision shift right bits, result in r1.
                    217:                bits is how many bits to shift, must be <= UNITSIZE.
                    218:        */
                    219: {      register short precision;       /* number of units to shift */
                    220:        register unit carry,nextcarry,bitmask;
                    221:        register short unbits;
                    222:        if (bits==0) return;    /* shift zero bits is a no-op */
                    223:        carry = 0;
                    224:        bitmask = power_of_2(bits)-1;
                    225:        unbits = UNITSIZE-bits;         /* shift bits must be <= UNITSIZE */
                    226:        precision = global_precision;
                    227:        make_msbptr(r1,precision);
                    228:        if (bits == UNITSIZE) {
                    229:                while (precision--)  {
                    230:                        nextcarry = *r1;
                    231:                        *r1 = carry;
                    232:                        carry = nextcarry;
                    233:                        pre_lowerunit(r1);
                    234:                }
                    235:        } else {
                    236:                while (precision--)
                    237:                {       nextcarry = *r1 & bitmask;
                    238:                        *r1 >>= bits;
                    239:                        *r1 |= carry << unbits;
                    240:                        carry = nextcarry;
                    241:                        pre_lowerunit(r1);
                    242:                }
                    243:        }
                    244: }      /* mp_shift_right_bits */
                    245: 
                    246: 
                    247: #ifndef mp_compare
                    248: short mp_compare(register unitptr r1,register unitptr r2)
                    249: /*     Compares multiprecision integers *r1, *r2, and returns:
                    250:        -1 iff *r1 < *r2
                    251:         0 iff *r1 == *r2
                    252:        +1 iff *r1 > *r2
                    253: */
                    254: {      register short precision;       /* number of units to compare */
                    255: 
                    256:        precision = global_precision;
                    257:        make_msbptr(r1,precision);
                    258:        make_msbptr(r2,precision);
                    259:        do
                    260:        {       if (*r1 < *r2)
                    261:                        return(-1);
                    262:                if (*post_lowerunit(r1) > *post_lowerunit(r2))
                    263:                        return(1);
                    264:        } while (--precision);
                    265:        return(0);      /*  *r1 == *r2  */
                    266: }      /* mp_compare */
                    267: #endif /* mp_compare */
                    268: 
                    269: 
                    270: boolean mp_inc(register unitptr r)
                    271:        /* Increment multiprecision integer r. */
                    272: {      register short precision;
                    273:        precision = global_precision;
                    274:        make_lsbptr(r,precision);
                    275:        do
                    276:        {       if ( ++(*r) ) return(0);        /* no carry */
                    277:                post_higherunit(r);
                    278:        } while (--precision);
                    279:        return(1);              /* return carry set */
                    280: }      /* mp_inc */
                    281: 
                    282: 
                    283: boolean mp_dec(register unitptr r)
                    284:        /* Decrement multiprecision integer r. */
                    285: {      register short precision;
                    286:        precision = global_precision;
                    287:        make_lsbptr(r,precision);
                    288:        do
                    289:        {       if ( (signedunit) (--(*r)) != (signedunit) -1 )
                    290:                        return(0);      /* no borrow */
                    291:                post_higherunit(r);
                    292:        } while (--precision);
                    293:        return(1);              /* return borrow set */
                    294: }      /* mp_dec */
                    295: 
                    296: 
                    297: void mp_neg(register unitptr r)
                    298:        /* Compute 2's complement, the arithmetic negative, of r */
                    299: {      register short precision;       /* number of units to negate */
                    300:        precision = global_precision;
                    301:        mp_dec(r);      /* 2's complement is 1's complement plus 1 */
                    302:        do      /* now do 1's complement */
                    303:        {       *r = ~(*r);
                    304:                r++;
                    305:        } while (--precision);
                    306: }      /* mp_neg */
                    307: 
                    308: #ifndef mp_move
                    309: void mp_move(register unitptr dst,register unitptr src)
                    310: {      register short precision;       /* number of units to move */
                    311:        precision = global_precision;
                    312:        do { *dst++ = *src++; } while (--precision);
                    313: }      /* mp_move */
                    314: #endif /* mp_move */
                    315: 
                    316: void mp_init(register unitptr r, word16 value)
                    317:        /* Init multiprecision register r with short value. */
                    318: {      /* Note that mp_init doesn't extend sign bit for >32767 */
                    319: 
                    320:        unitfill0( r, global_precision);
                    321:        make_lsbptr(r,global_precision);
                    322:        *post_higherunit(r) = value;
                    323: #ifdef UNIT8
                    324:        *post_higherunit(r) = value >> UNITSIZE;
                    325: #endif
                    326: }      /* mp_init */
                    327: 
                    328: 
                    329: short significance(register unitptr r)
                    330:        /* Returns number of significant units in r */
                    331: {      register short precision;
                    332:        precision = global_precision;
                    333:        make_msbptr(r,precision);
                    334:        do
                    335:        {       if (*post_lowerunit(r))
                    336:                        return(precision);
                    337:        } while (--precision);
                    338:        return(precision);
                    339: }      /* significance */
                    340: 
                    341: 
                    342: #ifndef unitfill0
                    343: void unitfill0(unitptr r,word16 unitcount)
                    344:        /* Zero-fill the unit buffer r. */
                    345: {      while (unitcount--) *r++ = 0;
                    346: }      /* unitfill0 */
                    347: #endif  /* unitfill0 */
                    348: 
                    349: 
                    350: int mp_udiv(register unitptr remainder,register unitptr quotient,
                    351:        register unitptr dividend,register unitptr divisor)
                    352:        /* Unsigned divide, treats both operands as positive. */
                    353: {      int bits;
                    354:        short dprec;
                    355:        register unit bitmask;
                    356:        if (testeq(divisor,0))
                    357:                return(-1);     /* zero divisor means divide error */
                    358:        mp_init0(remainder);
                    359:        mp_init0(quotient);
                    360:        /* normalize and compute number of bits in dividend first */
                    361:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    362:        /* rescale quotient to same precision (dprec) as dividend */
                    363:        rescale(quotient,global_precision,dprec);
                    364:        make_msbptr(quotient,dprec);
                    365: 
                    366:        while (bits--)
                    367:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
                    368:                if (mp_compare(remainder,divisor) >= 0)
                    369:                {       mp_sub(remainder,divisor);
                    370:                        stuff_bit(quotient,bitmask);
                    371:                }
                    372:                bump_2bitsniffers(dividend,quotient,bitmask);
                    373:        }
                    374:        return(0);
                    375: } /* mp_udiv */
                    376: 
                    377: 
                    378: #ifdef UPTON_OR_SMITH
                    379: #define RECIPMARGIN 0  /* extra margin bits used by mp_recip() */
                    380: 
                    381: int mp_recip(register unitptr quotient,register unitptr divisor)
                    382:        /* Compute reciprocal (quotient) as 1/divisor.  Used by faster modmult. */
                    383: {      int bits;
                    384:        short qprec;
                    385:        register unit bitmask;
                    386:        unit remainder[MAX_UNIT_PRECISION];
                    387:        if (testeq(divisor,0))
                    388:                return(-1);     /* zero divisor means divide error */
                    389:        mp_init0(remainder);
                    390:        mp_init0(quotient);
                    391: 
                    392:        /* normalize and compute number of bits in quotient first */
                    393:        bits = countbits(divisor) + RECIPMARGIN;
                    394:        bitmask = bitmsk(bits);         /* bitmask within a single unit */
                    395:        qprec = bits2units(bits+1);
                    396:        mp_setbit(remainder,(bits-RECIPMARGIN)-1);
                    397:        /* rescale quotient to precision of divisor + RECIPMARGIN bits */
                    398:        rescale(quotient,global_precision,qprec);
                    399:        make_msbptr(quotient,qprec);
                    400: 
                    401:        while (bits--)
                    402:        {       mp_shift_left(remainder);
                    403:                if (mp_compare(remainder,divisor) >= 0)
                    404:                {       mp_sub(remainder,divisor);
                    405:                        stuff_bit(quotient,bitmask);
                    406:                }
                    407:                bump_bitsniffer(quotient,bitmask);
                    408:        }
                    409:        mp_init0(remainder);    /* burn sensitive data left on stack */
                    410:        return(0);
                    411: } /* mp_recip */
                    412: #endif /* UPTON_OR_SMITH */
                    413: 
                    414: 
                    415: int mp_div(register unitptr remainder,register unitptr quotient,
                    416:        register unitptr dividend,register unitptr divisor)
                    417:        /* Signed divide, either or both operands may be negative. */
                    418: {      boolean dvdsign,dsign;
                    419:        int status;
                    420:        dvdsign = (boolean)(mp_tstminus(dividend)!=0);
                    421:        dsign = (boolean)(mp_tstminus(divisor)!=0);
                    422:        if (dvdsign) mp_neg(dividend);
                    423:        if (dsign) mp_neg(divisor);
                    424:        status = mp_udiv(remainder,quotient,dividend,divisor);
                    425:        if (dvdsign) mp_neg(dividend);          /* repair caller's dividend */
                    426:        if (dsign) mp_neg(divisor);             /* repair caller's divisor */
                    427:        if (status<0) return(status);           /* divide error? */
                    428:        if (dvdsign) mp_neg(remainder);
                    429:        if (dvdsign ^ dsign) mp_neg(quotient);
                    430:        return(status);
                    431: } /* mp_div */
                    432: 
                    433: 
                    434: word16 mp_shortdiv(register unitptr quotient,
                    435:        register unitptr dividend,register word16 divisor)
                    436: /*     This function does a fast divide and mod on a multiprecision dividend
                    437:        using a short integer divisor returning a short integer remainder.
                    438:        This is an unsigned divide.  It treats both operands as positive.
                    439:        It is used mainly for faster printing of large numbers in base 10.
                    440: */
                    441: {      int bits;
                    442:        short dprec;
                    443:        register unit bitmask;
                    444:        register word16 remainder;
                    445:        if (!divisor)   /* if divisor == 0 */
                    446:                return(-1);     /* zero divisor means divide error */
                    447:        remainder=0;
                    448:        mp_init0(quotient);
                    449:        /* normalize and compute number of bits in dividend first */
                    450:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    451:        /* rescale quotient to same precision (dprec) as dividend */
                    452:        rescale(quotient,global_precision,dprec);
                    453:        make_msbptr(quotient,dprec);
                    454: 
                    455:        while (bits--)
                    456:        {       remainder <<= 1;
                    457:                if (sniff_bit(dividend,bitmask))
                    458:                        remainder++;
                    459:                if (remainder >= divisor)
                    460:                {       remainder -= divisor;
                    461:                        stuff_bit(quotient,bitmask);
                    462:                }
                    463:                bump_2bitsniffers(dividend,quotient,bitmask);
                    464:        }
                    465:        return(remainder);
                    466: } /* mp_shortdiv */
                    467: 
                    468: 
                    469: int mp_mod(register unitptr remainder,
                    470:        register unitptr dividend,register unitptr divisor)
                    471:        /* Unsigned divide, treats both operands as positive. */
                    472: {      int bits;
                    473:        short dprec;
                    474:        register unit bitmask;
                    475:        if (testeq(divisor,0))
                    476:                return(-1);     /* zero divisor means divide error */
                    477:        mp_init0(remainder);
                    478:        /* normalize and compute number of bits in dividend first */
                    479:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    480: 
                    481:        while (bits--)
                    482:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
                    483:                msub(remainder,divisor);
                    484:                bump_bitsniffer(dividend,bitmask);
                    485:        }
                    486:        return(0);
                    487: } /* mp_mod */
                    488: 
                    489: 
                    490: word16 mp_shortmod(register unitptr dividend,register word16 divisor)
                    491: /*     This function does a fast mod operation on a multiprecision dividend
                    492:        using a short integer modulus returning a short integer remainder.
                    493:        This is an unsigned divide.  It treats both operands as positive.
                    494:        It is used mainly for fast sieve searches for large primes.
                    495: */
                    496: {      int bits;
                    497:        short dprec;
                    498:        register unit bitmask;
                    499:        register word16 remainder;
                    500:        if (!divisor)   /* if divisor == 0 */
                    501:                return(-1);     /* zero divisor means divide error */
                    502:        remainder=0;
                    503:        /* normalize and compute number of bits in dividend first */
                    504:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    505: 
                    506:        while (bits--)
                    507:        {       remainder <<= 1;
                    508:                if (sniff_bit(dividend,bitmask))
                    509:                        remainder++;
                    510:                if (remainder >= divisor) remainder -= divisor;
                    511:                bump_bitsniffer(dividend,bitmask);
                    512:        }
                    513:        return(remainder);
                    514: } /* mp_shortmod */
                    515: 
                    516: 
                    517: 
                    518: #ifdef COMB_MULT       /* use faster "comb" multiply algorithm */
                    519:        /* We are skipping this code because it has a bug... */
                    520: 
                    521: int mp_mult(register unitptr prod,
                    522:        register unitptr multiplicand, register unitptr multiplier)
                    523:        /*      Computes multiprecision prod = multiplicand * multiplier */
                    524: {      /*      Uses interleaved comb multiply algorithm.
                    525:                This improved multiply more than twice as fast as a Russian
                    526:                peasant multiply, because it does a lot fewer shifts.
                    527:                Must have global_precision set to the size of the multiplicand
                    528:                plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum
                    529:                of the lengths of the multiplier and multiplicand.
                    530: 
                    531:                BUG ALERT:  Unfortunately, this code has a bug.  It fails for
                    532:                some numbers.  One such example:
                    533:                x=   59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
                    534:                x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
                    535:                     52C8 CDC7 AFB3 61C8 243C 741B
                    536:                --which is obviously wrong.  The answer should be:
                    537:                x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
                    538:                     C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
                    539:                We'll have to fix this some day.  In the meantime, we'll
                    540:                just have the compiler skip over this code.
1.1.1.4 ! root      541: 
        !           542:                BUG NOTE: Possibly fixed.  Needs testing.
1.1.1.2   root      543:        */
                    544:        int bits;
                    545:        register unit bitmask;
                    546:        unitptr product, mplier, temp;
                    547:        short mprec,mprec2;
                    548:        unit mplicand[MAX_UNIT_PRECISION];
1.1.1.4 ! root      549: 
        !           550:        /* better clear full width--double precision */
        !           551:        mp_init(prod+tohigher(global_precision),0);
        !           552: 
1.1.1.2   root      553:        if (testeq(multiplicand,0))
                    554:                return(0);      /* zero multiplicand means zero product */
                    555: 
                    556:        mp_move(mplicand,multiplicand); /* save it from damage */
                    557: 
                    558:        normalize(multiplier,mprec);
                    559:        if (!mprec)
                    560:                return(0);
                    561: 
                    562:        make_lsbptr(multiplier,mprec);
                    563:        bitmask = 1;    /* start scan at LSB of multiplier */
                    564: 
                    565:        do      /* UNITSIZE times */
                    566:        {       /* do for bits 0-15 */
                    567:                product = prod;
                    568:                mplier = multiplier;
                    569:                mprec2 = mprec;
                    570:                while (mprec2--)        /* do for each word in multiplier */
                    571:                {
                    572:                        if (sniff_bit(mplier,bitmask))
                    573:                        {       if (mp_addc(product,multiplicand,0))    /* ripple carry */
                    574:                                {       /* After 1st time thru, this is rarely encountered. */
                    575:                                        temp = msbptr(product,global_precision);
                    576:                                        pre_higherunit(temp);
                    577:                                        /* temp now points to LSU of carry region. */
                    578:                                        unmake_lsbptr(temp,global_precision);
                    579:                                        mp_inc(temp);
                    580:                                }       /* ripple carry */
                    581:                        }
                    582:                        pre_higherunit(mplier);
                    583:                        pre_higherunit(product);
                    584:                }
                    585:                if (!(bitmask <<= 1))
                    586:                        break;
                    587:                mp_shift_left(multiplicand);
                    588: 
                    589:        } while (TRUE);
                    590: 
                    591:        mp_move(multiplicand,mplicand); /* recover */
                    592: 
                    593:        return(0);      /* normal return */
                    594: }      /* mp_mult */
                    595: 
                    596: #endif /* COMB_MULT */
                    597: 
                    598: 
                    599: /*     Because the "comb" multiply has a bug, we will use the slower
                    600:        Russian peasant multiply instead.  Fortunately, the mp_mult
                    601:        function is not called from any time-critical code.
                    602: */
                    603: 
                    604: int mp_mult(register unitptr prod,
                    605:        register unitptr multiplicand,register unitptr multiplier)
                    606:        /* Computes multiprecision prod = multiplicand * multiplier */
                    607: {      /* Uses "Russian peasant" multiply algorithm. */
                    608:        int bits;
                    609:        register unit bitmask;
                    610:        short mprec;
                    611:        mp_init(prod,0);
                    612:        if (testeq(multiplicand,0))
                    613:                return(0);      /* zero multiplicand means zero product */
                    614:        /* normalize and compute number of bits in multiplier first */
                    615:        init_bitsniffer(multiplier,bitmask,mprec,bits);
                    616: 
                    617:        while (bits--)
                    618:        {       mp_shift_left(prod);
                    619:                if (sniff_bit(multiplier,bitmask))
                    620:                        mp_add(prod,multiplicand);
                    621:                bump_bitsniffer(multiplier,bitmask);
                    622:        }
                    623:        return(0);
                    624: }      /* mp_mult */
                    625: 
                    626: 
                    627: 
                    628: /*     mp_modmult computes a multiprecision multiply combined with a
                    629:        modulo operation.  This is the most time-critical function in
                    630:        this multiprecision arithmetic library for performing modulo
                    631:        exponentiation.  We experimented with different versions of modmult,
                    632:        depending on the machine architecture and performance requirements.
                    633:        We will either use a Russian Peasant modmult (peasant_modmult), 
                    634:        Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
                    635:        modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
                    636:        On machines with a hardware atomic multiply instruction,
                    637:        Smith's modmult is fastest.  It can utilize assembly subroutines to
                    638:        speed up the hardware multiply logic and trial quotient calculation.
                    639:        If the machine lacks a fast hardware multiply, Merritt's modmult
                    640:        is preferred, which doesn't call any assembly multiply routine.
                    641:        We use the alias names mp_modmult, stage_modulus, and modmult_burn
                    642:        for the corresponding true names, which depend on what flavor of
                    643:        modmult we are using.
                    644: 
                    645:        Before making the first call to mp_modmult, you must set up the
                    646:        modulus-dependant precomputated tables by calling stage_modulus.
                    647:        After making all the calls to mp_modmult, you call modmult_burn to
                    648:        erase the tables created by stage_modulus that were left in memory.
                    649: */
                    650: 
                    651: #ifdef COUNTMULTS
                    652: /* "number of modmults" counters, used for performance studies. */
                    653: static unsigned int tally_modmults = 0;
                    654: static unsigned int tally_modsquares = 0;
                    655: #endif /* COUNTMULTS */
                    656: 
                    657: 
                    658: #ifdef PEASANT
                    659: /* Conventional Russian peasant multiply with modulo algorithm. */
                    660: 
                    661: static unitptr pmodulus = 0;   /* used only by mp_modmult */
                    662: 
                    663: int stage_peasant_modulus(unitptr n)
                    664: /*     Must pass modulus to stage_modulus before calling modmult.
                    665:        Assumes that global_precision has already been adjusted to the
                    666:        size of the modulus, plus SLOP_BITS.
                    667: */
                    668: {      /* For this simple version of modmult, just copy unit pointer. */
                    669:        pmodulus = n;
                    670:        return(0);      /* normal return */
                    671: }      /* stage_peasant_modulus */
                    672: 
                    673: 
                    674: int peasant_modmult(register unitptr prod,
                    675:        unitptr multiplicand,register unitptr multiplier)
                    676: {      /*      "Russian peasant" multiply algorithm, combined with a modulo
                    677:                operation.  This is a simple naive replacement for Merritt's
                    678:                faster modmult algorithm.  References global unitptr "modulus".
                    679:                Computes:  prod = (multiplicand*multiplier) mod modulus
                    680:                WARNING: All the arguments must be less than the modulus!
                    681:        */
                    682:        int bits;
                    683:        register unit bitmask;
                    684:        short mprec;
                    685:        mp_init(prod,0);
                    686: /*     if (testeq(multiplicand,0))
                    687:                return(0); */   /* zero multiplicand means zero product */
                    688:        /* normalize and compute number of bits in multiplier first */
                    689:        init_bitsniffer(multiplier,bitmask,mprec,bits);
                    690: 
                    691:        while (bits--)
                    692:        {       mp_shift_left(prod);
                    693:                msub(prod,pmodulus);    /* turns mult into modmult */
                    694:                if (sniff_bit(multiplier,bitmask))
                    695:                {       mp_add(prod,multiplicand);
                    696:                        msub(prod,pmodulus);    /* turns mult into modmult */
                    697:                }
                    698:                bump_bitsniffer(multiplier,bitmask);
                    699:        }
                    700:        return(0);
                    701: }      /* peasant_modmult */
                    702: 
                    703: 
                    704: /*     If we are using a version of mp_modmult that uses internal tables
                    705:        in memory, we have to call modmult_burn() at the end of mp_modexp.
                    706:        This is so that no sensitive data is left in memory after the program
                    707:        exits.  The Russian peasant method doesn't use any such tables.
                    708: */
                    709: void peasant_burn(void)
                    710: /*     Alias for modmult_burn, called only from mp_modexp().  Destroys
                    711:        internal modmult tables.  This version does nothing because no
                    712:        tables are used by the Russian peasant modmult. */
                    713: { }    /* peasant_burn */
                    714: 
                    715: #endif /* PEASANT */
                    716: 
                    717: 
                    718: #ifdef MERRITT
                    719: /*=========================================================================*/
                    720: /*
                    721:        This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
                    722:        Also refined by Zhahai Stewart to reduce the number of subtracts.
                    723:     Modified by Raymond Brand to reduce the number of SLOP_BITS by 1.
                    724:        It performs a multiply combined with a modulo operation, without
                    725:        going into "double precision".  It is faster than the Russian peasant
                    726:        method, and still works well on machines that lack a fast hardware
                    727:        multiply instruction.
                    728: */
                    729: 
                    730: /*     The following support functions, macros, and data structures
                    731:        are used only by Merritt's modmult algorithm... */
                    732: 
                    733: static void mp_lshift_unit(register unitptr r1)
                    734: /*     Shift r1 1 whole unit to the left.  Used only by modmult function. */
                    735: {      register short precision;
                    736:        register unitptr r2;
                    737:        precision = global_precision;
                    738:        make_msbptr(r1,precision);
                    739:        r2 = r1;
                    740:        while (--precision)
                    741:                *post_lowerunit(r1) = *pre_lowerunit(r2);
                    742:        *r1 = 0;
                    743: }      /* mp_lshift_unit */
                    744: 
                    745: 
                    746: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
                    747: static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
                    748: static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */
                    749: {      0
                    750:        ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0],
                    751:         &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0]
                    752: #ifndef UNIT8
                    753:        ,&moduli_buf[ 7][0]
                    754:        ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0], 
                    755:         &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0]
                    756: #ifndef UNIT16 /* and not UNIT8 */
                    757:        ,&moduli_buf[15][0]
                    758:        ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0], 
                    759:         &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0], 
                    760:         &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0], 
                    761:         &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0]
                    762: #endif /* UNIT16 and UNIT8 not defined */
                    763: #endif /* UNIT8 not defined */
                    764: };
                    765: 
                    766: /*     To optimize msubs, need following 2 unit arrays, each filled
                    767:        with the most significant unit and next-to-most significant unit
                    768:        of the preshifted images of the modulus. */
                    769: static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */
                    770: static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */
                    771: 
                    772: /*     mpdbuf contains preshifted images of the multiplicand, mod n.
                    773:        It is used only by mp_modmult.  It could be staticly declared
                    774:        inside of mp_modmult, but we put it outside mp_modmult so that
                    775:        it can be wiped clean by modmult_burn(), which is called at the
                    776:        end of mp_modexp.  This is so that no sensitive data is left in
                    777:        memory after the program exits.
                    778: */
                    779: static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
                    780: 
                    781: 
                    782: static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
                    783: /*     Computes UNITSIZE images of r, each shifted left 1 more bit.
                    784:        Used only by modmult function.
                    785: */
                    786: {      short int i;
                    787:        images[0] = r;  /* no need to move the first image, just copy ptr */
                    788:        for (i=1; i<UNITSIZE; i++)
                    789:        {       mp_move(images[i],images[i-1]);
                    790:                mp_shift_left(images[i]);
                    791:                msub(images[i],moduli[0]);
                    792:        }
                    793: }      /* stage_mp_images */
                    794: 
                    795: 
                    796: int stage_merritt_modulus(unitptr n)
                    797: /*     Computes UNITSIZE images of modulus, each shifted left 1 more bit.
                    798:        Before calling mp_modmult, you must first stage the moduli images by
                    799:        calling stage_modulus.  n is the pointer to the modulus.
                    800:        Assumes that global_precision has already been adjusted to the
                    801:        size of the modulus, plus SLOP_BITS.
                    802: */
                    803: {      short int i;
                    804:        unitptr msu;    /* ptr to most significant unit, for faster msubs */
                    805:        moduli[0] = n;  /* no need to move the first image, just copy ptr */
                    806: 
                    807:        /* used by optimized msubs macro... */
                    808:        msu = msbptr(n,global_precision);       /* needed by msubs */
                    809:        msu_moduli[0] = *post_lowerunit(msu);
                    810:        nmsu_moduli[0] = *msu;
                    811: 
                    812:        for (i=1; i<UNITSIZE; i++)
                    813:        {       mp_move(moduli[i],moduli[i-1]);
                    814:                mp_shift_left(moduli[i]);
                    815: 
                    816:                /* used by optimized msubs macro... */
                    817:                msu = msbptr(moduli[i],global_precision);       /* needed by msubs */
                    818:                msu_moduli[i] = *post_lowerunit(msu);   /* for faster msubs */
                    819:                nmsu_moduli[i] = *msu;
                    820:        }
                    821:        return(0);      /* normal return */
                    822: }      /* stage_merritt_modulus */
                    823: 
                    824: 
                    825: /* The following macros, sniffadd and msubs, are used by modmult... */
                    826: #define sniffadd(i) if (*multiplier & power_of_2(i))  mp_add(prod,mpd[i])
                    827: 
                    828: /* Unoptimized msubs macro (msubs0) follows... */
                    829: /* #define msubs0(i) msub(prod,moduli[i])
                    830: */
                    831: 
                    832: /*     To optimize msubs, compare the most significant units of the
                    833:        product and the shifted modulus before deciding to call the full
                    834:        mp_compare routine.  Better still, compare the upper two units
                    835:        before deciding to call mp_compare.
                    836:        Optimization of msubs relies heavily on standard C left-to-right
                    837:        parsimonious evaluation of logical expressions.
                    838: */
                    839: 
                    840: /* Partially-optimized msubs macro (msubs1) follows... */
                    841: /* #define msubs1(i) if ( \
                    842:   ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
                    843:   (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
                    844:   ) mp_sub(prod,moduli[i])
                    845: */
                    846: 
                    847: /* Fully-optimized msubs macro (msubs2) follows... */
                    848: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
                    849:  (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
                    850:  (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
                    851:  mp_sub(prod,moduli[i])
                    852: 
                    853: 
                    854: int merritt_modmult(register unitptr prod,
                    855:        unitptr multiplicand,register unitptr multiplier)
                    856:        /*      Performs combined multiply/modulo operation.
                    857:                Computes:  prod = (multiplicand*multiplier) mod modulus
                    858:                WARNING: All the arguments must be less than the modulus!
                    859:                Assumes the modulus has been predefined by first calling
                    860:                stage_modulus.
                    861:        */
                    862: {
                    863:        /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
                    864:        register signedunit p_m;
                    865:        register unitptr msu_prod;      /* ptr to most significant unit of product */
                    866:        register unitptr nmsu_prod;     /* next-most signif. unit of product */
                    867:        short mprec;            /* precision of multiplier, in units */
                    868:        /*      Array mpd contains a list of pointers to preshifted images of
                    869:                the multiplicand: */
                    870:        static unitptr mpd[UNITSIZE] =
                    871:        {        0,              &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
                    872:                 &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
                    873: #ifndef UNIT8
                    874:                ,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
                    875:                 &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
                    876: #ifndef UNIT16 /* and not UNIT8 */
                    877:                ,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
                    878:                 &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
                    879:                 &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
                    880:                 &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
                    881: #endif /* UNIT16 and UNIT8 not defined */
                    882: #endif /* UNIT8 not defined */
                    883:        };
                    884: 
                    885:        /* Compute preshifted images of multiplicand, mod n: */
                    886:        stage_mp_images(mpd,multiplicand);
                    887: 
                    888:        /* To optimize msubs, set up msu_prod and nmsu_prod: */
                    889:        msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
                    890:        nmsu_prod = msu_prod;
                    891:        post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
                    892: 
                    893:        /*      To understand this algorithm, it would be helpful to first
                    894:                study the conventional Russian peasant modmult algorithm.
                    895:                This one does about the same thing as Russian peasant, but
                    896:                is organized differently to save some steps.  It loops
                    897:                through the multiplier a word (unit) at a time, instead of
                    898:                a bit at a time.  It word-shifts the product instead of
                    899:                bit-shifting it, so it should be faster.  It also does about
                    900:                half as many subtracts as Russian peasant.
                    901:        */
                    902: 
                    903:        mp_init(prod,0);        /* Initialize product to 0. */
                    904: 
                    905:        /*      The way mp_modmult is actually used in cryptographic
                    906:                applications, there will NEVER be a zero multiplier or
                    907:                multiplicand.  So there is no need to optimize for that
                    908:                condition.
                    909:        */
                    910: /*     if (testeq(multiplicand,0))
                    911:                return(0); */   /* zero multiplicand means zero product */
                    912:        /* Normalize and compute number of units in multiplier first: */
                    913:        normalize(multiplier,mprec);
                    914:        if (mprec==0)   /* if precision of multiplier is 0 */
                    915:                return(0);      /* zero multiplier means zero product */
                    916:        make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
                    917: 
                    918:        while (mprec--) /* Loop for the number of units in the multiplier */
                    919:        {
                    920:                /*      Shift the product one whole unit to the left.
                    921:                        This is part of the multiply phase of modmult.
                    922:                */
                    923: 
                    924:                mp_lshift_unit(prod);
                    925: 
                    926:                /*      The product may have grown by as many as UNITSIZE
                    927:                        bits.  That's why we have global_precision set to the
                    928:                        size of the modulus plus UNITSIZE slop bits.
                    929:                        Now reduce the product back down by conditionally
                    930:                        subtracting the preshifted images of the modulus.
                    931:                        This is part of the modulo reduction phase of modmult. 
                    932:                        The following loop is unrolled for speed, using macros...
                    933: 
                    934:                for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--)
                    935:                        if (mp_compare(prod,moduli[i]) >= 0)
                    936:                                mp_sub(prod,moduli[i]);
                    937:                */
                    938: 
                    939: #ifndef UNIT8
                    940: #ifndef UNIT16  /* and not UNIT8 */
                    941:                msubs(31);
                    942:                msubs(30);
                    943:                msubs(29);
                    944:                msubs(28);
                    945:                msubs(27);
                    946:                msubs(26);
                    947:                msubs(25);
                    948:                msubs(24);
                    949:                msubs(23);
                    950:                msubs(22);
                    951:                msubs(21);
                    952:                msubs(20);
                    953:                msubs(19);
                    954:                msubs(18);
                    955:                msubs(17);
                    956:                msubs(16);
                    957: #endif  /* not UNIT16 and not UNIT8 */
                    958:                msubs(15);
                    959:                msubs(14);
                    960:                msubs(13);
                    961:                msubs(12);
                    962:                msubs(11);
                    963:                msubs(10);
                    964:                msubs(9);
                    965:                msubs(8);
                    966: #endif  /* not UNIT8 */
                    967:                msubs(7);
                    968:                msubs(6);
                    969:                msubs(5);
                    970: #ifndef UNIT32
                    971:                msubs(4);
                    972: #ifndef UNIT16
                    973:                msubs(3);
                    974: #endif
                    975: #endif
                    976: 
                    977:                /*      Sniff each bit in the current unit of the multiplier, 
                    978:                        and conditionally add the corresponding preshifted 
                    979:                        image of the multiplicand to the product.
                    980:                        This is also part of the multiply phase of modmult.
                    981: 
                    982:                        The following loop is unrolled for speed, using macros...
                    983: 
                    984:                for (i=UNITSIZE-1; i>=0; i--)
                    985:                   if (*multiplier & power_of_2(i))
                    986:                                mp_add(prod,mpd[i]);
                    987:                */
                    988: #ifndef UNIT8
                    989: #ifndef UNIT16 /* and not UNIT8 */
                    990:                sniffadd(31);
                    991:                sniffadd(30);
                    992:                sniffadd(29);
                    993:                sniffadd(28);
                    994:                sniffadd(27);
                    995:                sniffadd(26);
                    996:                sniffadd(25);
                    997:                sniffadd(24);
                    998:                sniffadd(23);
                    999:                sniffadd(22);
                   1000:                sniffadd(21);
                   1001:                sniffadd(20);
                   1002:                sniffadd(19);
                   1003:                sniffadd(18);
                   1004:                sniffadd(17);
                   1005:                sniffadd(16);
                   1006: #endif /* not UNIT16 and not UNIT8 */
                   1007:                sniffadd(15);
                   1008:                sniffadd(14);
                   1009:                sniffadd(13);
                   1010:                sniffadd(12);
                   1011:                sniffadd(11);
                   1012:                sniffadd(10);
                   1013:                sniffadd(9);
                   1014:                sniffadd(8);
                   1015: #endif /* not UNIT8 */
                   1016:                sniffadd(7);
                   1017:                sniffadd(6);
                   1018:                sniffadd(5);
                   1019:                sniffadd(4);
                   1020:                sniffadd(3);
                   1021:                sniffadd(2);
                   1022:                sniffadd(1);
                   1023:                sniffadd(0);
                   1024: 
                   1025:                /*      The product may have grown by as many as LOG_UNITSIZE+1
                   1026:                        bits.
                   1027:                        Now reduce the product back down by conditionally 
                   1028:                        subtracting LOG_UNITSIZE+1 preshifted images of the 
                   1029:                        modulus.  This is the modulo reduction phase of modmult.
                   1030: 
                   1031:                        The following loop is unrolled for speed, using macros...
                   1032: 
                   1033:                for (i=LOG_UNITSIZE; i>=0; i--)
                   1034:                        if (mp_compare(prod,moduli[i]) >= 0) 
                   1035:                                mp_sub(prod,moduli[i]); 
                   1036:                */
                   1037: 
                   1038: #ifndef UNIT8
                   1039: #ifndef UNIT16
                   1040:                msubs(5); 
                   1041: #endif
                   1042:                msubs(4);
                   1043: #endif
                   1044:                msubs(3); 
                   1045:                msubs(2); 
                   1046:                msubs(1); 
                   1047:                msubs(0);
                   1048: 
                   1049:                /* Bump pointer to next lower unit of multiplier: */
                   1050:                post_lowerunit(multiplier);
                   1051: 
                   1052:        }       /* Loop for the number of units in the multiplier */
                   1053: 
                   1054:        return(0);      /* normal return */
                   1055: 
                   1056: }      /* merritt_modmult */
                   1057: 
                   1058: 
                   1059: #undef msubs
                   1060: #undef sniffadd
                   1061: 
                   1062: 
                   1063: /*     Merritt's mp_modmult function leaves some internal tables in memory,
                   1064:        so we have to call modmult_burn() at the end of mp_modexp.
                   1065:        This is so that no cryptographically sensitive data is left in memory
                   1066:        after the program exits.
                   1067: */
                   1068: void merritt_burn(void)
                   1069: /*     Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
                   1070: {      unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
                   1071:        unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
                   1072:        unitfill0(msu_moduli,UNITSIZE);
                   1073:        unitfill0(nmsu_moduli,UNITSIZE);
                   1074: } /* merritt_burn() */
                   1075: 
                   1076: /******* end of Merritt's MODMULT stuff. *******/
                   1077: /*=========================================================================*/
                   1078: #endif /* MERRITT */
                   1079: 
                   1080: 
                   1081: #ifdef UPTON_OR_SMITH  /* Used by Upton's and Smith's modmult algorithms */
                   1082: 
                   1083: /*     Jimmy Upton's multiprecision modmult algorithm in C.
                   1084:        Performs a multiply combined with a modulo operation.
                   1085: 
                   1086:        The following support functions and data structures
                   1087:        are used only by Upton's modmult algorithm...
                   1088: */
                   1089: 
                   1090: short munit_prec;      /* global_precision expressed in MULTUNITs */
                   1091: 
                   1092: /*     Note that since the SPARC CPU has no hardware integer multiply
                   1093:        instruction, there is not that much advantage in having an
                   1094:        assembly version of mp_smul on that machine.  It might be faster
                   1095:        to use Merritt's modmult instead of Upton's modmult on the SPARC.
                   1096: */
                   1097: 
                   1098: /*
                   1099:        Multiply the single-word multiplier times the multiprecision integer
                   1100:        in multiplicand, accumulating result in prod.  The resulting
                   1101:        multiprecision prod will be 1 word longer than the multiplicand.
                   1102:        multiplicand is munit_prec words long.  We add into prod, so caller
                   1103:        should zero it out first.  For best results, this time-critical
                   1104:        function should be implemented in assembly.
                   1105:        NOTE:  Unlike other functions in the multiprecision arithmetic
                   1106:        library, both multiplicand and prod are pointing at the LSB,
                   1107:        regardless of byte order of the machine.  On an 80x86, this makes
                   1108:        no difference.  But if this assembly function is implemented
                   1109:        on a 680x0, it becomes important.
                   1110: */
                   1111: /*     Note that this has been modified from the previous version to allow
                   1112:        better support for Smith's modmult:
                   1113:                The final carry bit is added to the existing product
                   1114:                array, rather than simply stored.
                   1115: */
                   1116: 
                   1117: #ifndef mp_smul
                   1118: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
                   1119: {
                   1120:        short i;
                   1121:        unsigned long p, carry;
                   1122: 
                   1123:        carry = 0;
                   1124:        for (i=0; i<munit_prec; ++i)
                   1125:        {       p = (unsigned long)multiplier * *post_higherunit(multiplicand);
                   1126:                p += *prod + carry;
                   1127:                *post_higherunit(prod) = (MULTUNIT)p;
                   1128:                carry = p >> MULTUNITSIZE;
                   1129:        }
                   1130:        /* Add carry to the next higher word of product / dividend */
                   1131:        *prod += (MULTUNIT)carry;
                   1132: }      /* mp_smul */
                   1133: #endif
                   1134: 
                   1135: /*     mp_dmul is a double-precision multiply multiplicand times multiplier,
                   1136:        result into prod.  prod must be pointing at a "double precision"
                   1137:        buffer.  E.g. If global_precision is 10 words, prod must be
                   1138:        pointing at a 20-word buffer.
                   1139: */
                   1140: #ifndef mp_dmul
                   1141: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1142: {
                   1143:        register        int i;
                   1144:        register        MULTUNIT *p_multiplicand, *p_multiplier;
                   1145:        register        MULTUNIT *prodp;
                   1146: 
                   1147: 
                   1148:        unitfill0(prod,global_precision*2);     /* Pre-zero prod */
                   1149:        /* Calculate precision in units of MULTUNIT */
                   1150:        munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
                   1151:        p_multiplicand = (MULTUNIT *)multiplicand;
                   1152:        p_multiplier = (MULTUNIT *)multiplier;
                   1153:        prodp = (MULTUNIT *)prod;
                   1154:        make_lsbptr(p_multiplicand,munit_prec);
                   1155:        make_lsbptr(p_multiplier,munit_prec);
                   1156:        make_lsbptr(prodp,munit_prec*2);
                   1157:        /* Multiply multiplicand by each word in multiplier, accumulating prod: */
                   1158:        for (i=0; i<munit_prec; ++i)
                   1159:                mp_smul (post_higherunit(prodp),
                   1160:                        p_multiplicand, *post_higherunit(p_multiplier));
                   1161: }      /* mp_dmul */
                   1162: #endif  /* mp_dmul */
                   1163: 
                   1164: static unit ALIGN modulus[MAX_UNIT_PRECISION];
                   1165: static short nbits;                    /* number of modulus significant bits */
                   1166: #endif /* UPTON_OR_SMITH */
                   1167: 
                   1168: #ifdef UPTON
                   1169: 
                   1170: /*     These scratchpad arrays are used only by upton_modmult (mp_modmult).
                   1171:        Some of them could be staticly declared inside of mp_modmult, but we
                   1172:        put them outside mp_modmult so that they can be wiped clean by
                   1173:        modmult_burn(), which is called at the end of mp_modexp.  This is
                   1174:        so that no sensitive data is left in memory after the program exits.
                   1175: */
                   1176: 
                   1177: static unit ALIGN reciprocal[MAX_UNIT_PRECISION];
                   1178: static unit ALIGN dhi[MAX_UNIT_PRECISION];
                   1179: static unit ALIGN d_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
                   1180: static unit ALIGN e_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
                   1181: static unit ALIGN f_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
                   1182: 
                   1183: static short nbitsDivUNITSIZE;
                   1184: static short nbitsModUNITSIZE;
                   1185: 
                   1186: /*     stage_upton_modulus() is aliased to stage_modulus().
                   1187:        Prepare for an Upton modmult.  Calculate the reciprocal of modulus,
                   1188:        and save both.  Note that reciprocal will have one more bit than
                   1189:        modulus.
                   1190:        Assumes that global_precision has already been adjusted to the
                   1191:        size of the modulus, plus SLOP_BITS.
                   1192: */
                   1193: int stage_upton_modulus(unitptr n)
                   1194: {
                   1195:        mp_move(modulus, n);
                   1196:        mp_recip(reciprocal, modulus);
                   1197:        nbits = countbits(modulus);
                   1198:        nbitsDivUNITSIZE = nbits / UNITSIZE;
                   1199:        nbitsModUNITSIZE = nbits % UNITSIZE;
                   1200:        return(0);      /* normal return */
                   1201: }      /* stage_upton_modulus */
                   1202: 
                   1203: 
                   1204: /*     Upton's algorithm performs a multiply combined with a modulo operation.
                   1205:        Computes:  prod = (multiplicand*multiplier) mod modulus
                   1206:        WARNING: All the arguments must be less than the modulus!
                   1207:        References global unitptr modulus and reciprocal.
                   1208:        The reciprocal of modulus is 1 bit longer than the modulus.
                   1209:        upton_modmult() is aliased to mp_modmult().
                   1210: */
                   1211: int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1212: {
                   1213:        unitptr d = d_data;
                   1214:        unitptr d1 = d_data;
                   1215:        unitptr e = e_data;
                   1216:        unitptr f = f_data;
                   1217:        short orig_precision;
                   1218: 
                   1219:        orig_precision = global_precision;
                   1220:        mp_dmul (d,multiplicand,multiplier);
                   1221: 
                   1222:        /* Throw off low nbits of d */
                   1223: #ifndef HIGHFIRST
                   1224:        d1 = d + nbitsDivUNITSIZE;
                   1225: #else
                   1226:        d1 = d + orig_precision - nbitsDivUNITSIZE;
                   1227: #endif
                   1228:        mp_move(dhi, d1);       /* Don't screw up d, we need it later */
                   1229:        mp_shift_right_bits(dhi, nbitsModUNITSIZE);
                   1230: 
                   1231:        mp_dmul (e,dhi,reciprocal);     /* Note - reciprocal has nbits+1 bits */
                   1232: 
                   1233: #ifndef HIGHFIRST
                   1234:        e += nbitsDivUNITSIZE;
                   1235: #else
                   1236:        e += orig_precision - nbitsDivUNITSIZE;
                   1237: #endif
                   1238:        mp_shift_right_bits(e, nbitsModUNITSIZE);
                   1239: 
                   1240:        mp_dmul (f,e,modulus);
                   1241: 
                   1242:        /* Now for the only double-precision call to mpilib: */
                   1243:        set_precision (orig_precision * 2);
                   1244:        mp_sub (d,f);
                   1245: 
                   1246:        /* d's precision should be <= orig_precision */
                   1247:        rescale (d, orig_precision*2, orig_precision);
                   1248:        set_precision (orig_precision);
                   1249: 
                   1250:        /* Should never have to do this final subtract more than twice: */
                   1251:        while (mp_compare(d,modulus) > 0)
                   1252:                mp_sub (d,modulus);
                   1253: 
                   1254:        mp_move(prod,d);
                   1255:        return(0);  /* normal return */
                   1256: }      /* upton_modmult */
                   1257: 
                   1258: 
                   1259: /*     Upton's mp_modmult function leaves some internal arrays in memory,
                   1260:        so we have to call modmult_burn() at the end of mp_modexp.
                   1261:        This is so that no cryptographically sensitive data is left in memory
                   1262:        after the program exits.
                   1263:        upton_burn() is aliased to modmult_burn().
                   1264: */
                   1265: void upton_burn(void)
                   1266: {
                   1267:        unitfill0(modulus,MAX_UNIT_PRECISION);
                   1268:        unitfill0(reciprocal,MAX_UNIT_PRECISION);
                   1269:        unitfill0(dhi,MAX_UNIT_PRECISION);
                   1270:        unitfill0(d_data,MAX_UNIT_PRECISION*2);
                   1271:        unitfill0(e_data,MAX_UNIT_PRECISION*2);
                   1272:        unitfill0(f_data,MAX_UNIT_PRECISION*2);
                   1273:        nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
                   1274: }      /* upton_burn */
                   1275: 
                   1276: /******* end of Upton's MODMULT stuff. *******/
                   1277: /*=========================================================================*/
                   1278: #endif  /* UPTON */
                   1279: 
                   1280: #ifdef SMITH   /* using Thad Smith's modmult algorithm */
                   1281: 
                   1282: /*     Thad Smith's implementation of multiprecision modmult algorithm in C.
                   1283:        Performs a multiply combined with a modulo operation.
                   1284:        The multiplication is done with mp_dmul, the same as for Upton's
                   1285:        modmult.  The modulus reduction is done by long division, in
                   1286:        which a trial quotient "digit" is determined, then the product of
                   1287:        that digit and the divisor are subtracted from the dividend.
                   1288: 
                   1289:        In this case, the digit is MULTUNIT in size and the subtraction
                   1290:        is done by adding the product to the one's complement of the
                   1291:        dividend, which allows use of the existing mp_smul routine.
                   1292: 
                   1293:        The following support functions and data structures
                   1294:        are used only by Smith's modmult algorithm...
                   1295: */
                   1296: 
                   1297: /*     These scratchpad arrays are used only by smith_modmult (mp_modmult).
                   1298:        Some of them could be statically declared inside of mp_modmult, but we
                   1299:        put them outside mp_modmult so that they can be wiped clean by
                   1300:        modmult_burn(), which is called at the end of mp_modexp.  This is
                   1301:        so that no sensitive data is left in memory after the program exits.
                   1302: */
                   1303: 
                   1304: static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2];
                   1305: 
                   1306: static unit mod_quotient  [4];
                   1307: static unit mod_divisor   [4]; /* 2 most signif. MULTUNITs of modulus */
                   1308: 
                   1309: static MULTUNIT *modmpl;               /* ptr to modulus least significant
                   1310:                                        ** MULTUNIT                         */
                   1311: static int  mshift;                    /* number of bits for
                   1312:                                        ** recip scaling          */
                   1313: static MULTUNIT reciph;                /* MSunit of scaled recip */
                   1314: static MULTUNIT recipl;                /* LSunit of scaled recip */
                   1315: 
                   1316: static short modlenMULTUNITS;          /* length of modulus in MULTUNITs */
                   1317: static MULTUNIT mutemp;                /* temporary */
                   1318: 
                   1319: /*     The routines mp_smul and mp_dmul are the same as for UPTON and
                   1320:        should be coded in assembly.  Note, however, that the previous
                   1321:        Upton's mp_smul version has been modified to compatible with
                   1322:        Smith's modmult.  The new version also still works for Upton's
                   1323:        modmult.
                   1324: */
                   1325: 
                   1326: #ifndef mp_set_recip
                   1327: #define mp_set_recip(rh,rl,m)     /* null */
                   1328: #else
                   1329: /* setup routine for external mp_quo_digit */
                   1330: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m);
                   1331: #endif
                   1332: MULTUNIT mp_quo_digit (MULTUNIT *dividend);
                   1333: 
                   1334: #ifdef MULTUNIT_SIZE_SAME
                   1335: #define mp_musubb mp_subb      /* use existing routine */
                   1336: #else  /* ! MULTUNIT_SIZE_SAME */
                   1337: 
                   1338: /*     This performs the same function as mp_subb, but with MULTUNITs.
                   1339:        Note: Processors without alignment requirements may be able to use
                   1340:        mp_subb, even though MULTUNITs are smaller than units.  In that case,
                   1341:        use mp_subb, since it would be faster if coded in assembly.  Note that
                   1342:        this implementation won't work for MULTUNITs which are long -- use
                   1343:        mp_subb in that case.
                   1344: */
                   1345: #ifndef mp_musubb
                   1346: boolean mp_musubb
                   1347:        (register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow)
                   1348:        /* multiprecision subtract of MULTUNITs with borrow, r2 from r1,
                   1349:        ** result in r1 */
                   1350:        /* borrow is incoming borrow flag-- value should be 0 or 1 */
                   1351: {      register ulint x;       /* won't work if sizeof(MULTUNIT)==
                   1352:                                                 sizeof(long)       */
                   1353:        short precision;        /* number of MULTUNITs to subtract */
                   1354:        precision = global_precision * MULTUNITs_perunit;
                   1355:        make_lsbptr(r1,precision);
                   1356:        make_lsbptr(r2,precision);
                   1357:        while (precision--)
                   1358:        {       x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
                   1359:                *post_higherunit(r1) = x;
                   1360:                borrow = (((1L << MULTUNITSIZE) & x) != 0L);
                   1361:        }
                   1362:        return (borrow);
                   1363: }      /* mp_musubb */
                   1364: #endif /* mp_musubb */
                   1365: #endif /* MULTUNIT_SIZE_SAME */
                   1366: 
                   1367: /*     The function mp_quo_digit is the heart of Smith's modulo reduction,
                   1368:        which uses a form of long division.  It computes a trial quotient
                   1369:        "digit" (MULTUNIT-sized digit) by multiplying the three most
                   1370:        significant MULTUNITs of the dividend by the two most significant
                   1371:        MULTUNITs of the reciprocal of the modulus.  Note that this function
                   1372:        requires that MULTUNITSIZE * 2 <= sizeof(unsigned long).
                   1373: 
                   1374:        An important part of this technique is that the quotient never be
                   1375:        too small, although it may occasionally be too large.  This was
                   1376:        done to eliminate the need to check and correct for a remainder
                   1377:        exceeding the divisor.  It is easier to check for a negative
                   1378:        remainder.  The following technique rarely needs correction for
                   1379:        MULTUNITs of at least 16 bits.
                   1380: 
                   1381:        The following routine has two implementations:
                   1382: 
                   1383:        ASM_PROTOTYPE defined: written to be an executable prototype for
                   1384:        an efficient assembly language implementation.  Note that several
                   1385:        of the following masks and shifts can be done by directly
                   1386:        manipulating single precision registers on some architectures.
                   1387: 
                   1388:        ASM_PROTOTYPE undefined: a slightly more efficient implementation
                   1389:        in C.  Although this version returns a result larger than the
                   1390:        optimum (which is corrected in smith_modmult) more often than the
                   1391:        prototype version, the faster execution in C more than makes up
                   1392:        for the difference.
                   1393:  
                   1394:        Parameter: dividend - points to the most significant MULTUNIT
                   1395:                of the dividend.  Note that dividend actually contains the 
                   1396:                one's complement of the actual dividend value (see comments for 
                   1397:                smith_modmult).
                   1398:        
                   1399:         Return: the trial quotient digit resulting from dividing the first
                   1400:                three MULTUNITs at dividend by the upper two MULTUNITs of the 
                   1401:                modulus.
                   1402: */
                   1403: 
                   1404: /* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */
                   1405: 
                   1406: #ifndef mp_quo_digit
                   1407: MULTUNIT mp_quo_digit (MULTUNIT *dividend) {
                   1408:        unsigned long q, q0, q1, q2;
                   1409:        unsigned short lsb_factor;
                   1410: 
                   1411: /*     Compute the least significant product group.
                   1412:        The last terms of q1 and q2 perform upward rounding, which is
                   1413:        needed to guarantee that the result not be too small.
                   1414: */
                   1415:        q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph
                   1416:             + reciph;
                   1417:        q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl
                   1418:             + (1L << MULTUNITSIZE) ;
                   1419: #ifdef ASM_PROTOTYPE
                   1420:        lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE);
                   1421:        q = q1 + q2;
                   1422: 
                   1423:        /* The following statement is equivalent to shifting the sum right
                   1424:           one bit while shifting in the carry bit.
                   1425:        */
                   1426:        q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
                   1427: #else  /* optimized C version */
                   1428:        q0 = (q1>>1) + (q2>>1) + 1;
                   1429: #endif
                   1430: 
                   1431: /*     Compute the middle significant product group.   */
                   1432: 
                   1433:        q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph;
                   1434:        q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl;
                   1435: #ifdef ASM_PROTOTYPE
                   1436:        q = q1 + q2;
                   1437:        q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
                   1438: 
                   1439: /*     Add in the most significant word of the first group.
                   1440:        The last term takes care of the carry from adding the lsb's
                   1441:        that were shifted out prior to addition.
                   1442: */
                   1443:        q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2));
                   1444: #else  /* optimized C version */
                   1445:        q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1;
                   1446: #endif
                   1447: 
                   1448: /*     Compute the most significant term and add in the others */
                   1449: 
                   1450:        q = (q >> (MULTUNITSIZE-2)) +
                   1451:            (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1);
                   1452:        q >>= mshift;
                   1453: 
                   1454: /*     Prevent overflow and then wipe out the intermediate results. */
                   1455: 
                   1456:        mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1);
                   1457:        q= q0 = q1 = q2 = 0;   lsb_factor = 0; (void)lsb_factor;
                   1458:        return mutemp;
                   1459: }
                   1460: #endif /* mp_quo_digit */
                   1461: 
                   1462: /*     stage_smith_modulus() - Prepare for a Smith modmult.
                   1463: 
                   1464:        Calculate the reciprocal of modulus with a precision of two MULTUNITs.
                   1465:        Assumes that global_precision has already been adjusted to the
                   1466:        size of the modulus, plus SLOP_BITS.
                   1467: 
                   1468:        Note: This routine was designed to work with large values and
                   1469:        doesn't have the necessary testing or handling to work with a
                   1470:        modulus having less than three significant units.  For such cases,
                   1471:        the separate multiply and modulus routines can be used.
                   1472: 
                   1473:        stage_smith_modulus() is aliased to stage_modulus().
                   1474: */
                   1475: 
                   1476: int stage_smith_modulus(unitptr n_modulus)
                   1477: {
                   1478:        int   original_precision;
                   1479:        int   sigmod;           /* significant units in modulus */
                   1480:        unitptr  mp;            /* modulus most significant pointer */
                   1481:        MULTUNIT *mpm;          /* reciprocal pointer */
                   1482:        int   prec;             /* precision of reciprocal calc in units */
                   1483: 
                   1484:        mp_move(modulus, n_modulus);
                   1485:        modmpl = (MULTUNIT*) modulus;
                   1486:        modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit);
                   1487:        nbits = countbits(modulus);
                   1488:        modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE;
                   1489: 
                   1490:        original_precision = global_precision;
                   1491: 
                   1492:        /* The following code copies the three most significant units of
                   1493:         * modulus to mod_divisor.
                   1494:        */
                   1495:        mp = modulus;
                   1496:        sigmod = significance (modulus);
                   1497:        rescale (mp, original_precision, sigmod);
                   1498: /* prec is the unit precision required for 3 MULTUNITs */
                   1499:        prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit;
                   1500:        set_precision (prec);
                   1501:  
                   1502:        /* set mp = ptr to most significant units of modulus, then move
                   1503:         * the most significant units to mp_divisor 
                   1504:        */
                   1505:        mp = msbptr(mp,sigmod) -tohigher(prec-1);
                   1506:        unmake_lsbptr (mp, prec);
                   1507:        mp_move (mod_divisor, mp);
                   1508:  
                   1509:        /* Keep 2*MULTUNITSIZE bits in mod_divisor.
                   1510:         * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits.
                   1511:        */
                   1512:        mshift = countbits (mod_divisor) - 2*MULTUNITSIZE;
                   1513:        mp_shift_right_bits (mod_divisor, mshift);
                   1514:        mp_recip(mod_quotient,mod_divisor);
                   1515:        mp_shift_right_bits (mod_quotient,1);
                   1516: 
                   1517:        /* Reduce to:   0 < mshift <= MULTUNITSIZE */
                   1518:        mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1;
                   1519:        /* round up, rescaling if necessary */
                   1520:        mp_inc (mod_quotient);
                   1521:        if (countbits (mod_quotient) > 2*MULTUNITSIZE) {
                   1522:                mp_shift_right_bits (mod_quotient,1);
                   1523:                mshift--;       /* now  0 <= mshift <= MULTUNITSIZE */
                   1524:        }
                   1525:        mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit);
                   1526:        recipl = *post_higherunit (mpm);
                   1527:        reciph = *mpm;
                   1528:        mp_set_recip (reciph, recipl, mshift);
                   1529:        set_precision (original_precision);
                   1530:        return(0);      /* normal return */
                   1531: }      /* stage_smith_modulus */
                   1532: 
                   1533: /*     Smith's algorithm performs a multiply combined with a modulo operation.
                   1534:        Computes:  prod = (multiplicand*multiplier) mod modulus
                   1535:        The modulus must first be set by stage_smith_modulus().
                   1536:        smith_modmult() is aliased to mp_modmult().
                   1537: */
                   1538: 
                   1539: int
                   1540: smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1541: {
                   1542:        unitptr    d;   /* ptr to product */ 
                   1543:        MULTUNIT   *dmph, *dmpl, *dmp;  /* ptrs to dividend (high, low, first)
                   1544:                                         * aligned for subtraction           */
                   1545: /*     Note that dmph points one MULTUNIT higher than indicated by
                   1546:        global precision.  This allows us to zero out a word one higher than
                   1547:        the normal precision.
                   1548: */
                   1549:        short       orig_precision;
                   1550:        short       nqd;        /* number of quotient digits remaining to
                   1551:                                 * be generated                              */
                   1552:        short       dmi;        /* number of significant MULTUNITs in product */
                   1553: 
                   1554:        d = ds_data + 1;        /* room for leading MSB if HIGHFIRST */
                   1555:        orig_precision = global_precision;
                   1556:        mp_dmul(d, multiplicand, multiplier);
                   1557: 
                   1558:        rescale(d, orig_precision * 2, orig_precision * 2 + 1);
                   1559:        set_precision(orig_precision * 2 + 1);
                   1560:        *msbptr(d, global_precision) = 0;       /* leading 0 unit */
                   1561: 
                   1562: /*     We now start working with MULTUNITs.
                   1563:        Determine the most significant MULTUNIT of the product so we don't
                   1564:        have to process leading zeros in our divide loop.
                   1565: */
                   1566:        dmi = significance(d) * MULTUNITs_perunit;
                   1567:        if (dmi >= modlenMULTUNITS)
                   1568:        {       /* Make dividend negative.  This allows the use of mp_smul to
                   1569:                 * "subtract" the product of the modulus and the trial divisor
                   1570:                 * by actually adding to a negative dividend.
                   1571:                 * The one's complement of the dividend is used, since it causes
                   1572:                 * a zero value to be represented as all ones.  This facilitates
                   1573:                 * testing the result for possible overflow, since a sign bit
                   1574:                 * indicates that no adjustment is necessary, and we should not
                   1575:                 * attempt to adjust if the result of the addition is zero.
                   1576:                 */
                   1577:                mp_inc(d);
                   1578:                mp_neg(d);
                   1579:                set_precision(orig_precision);
                   1580:                munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
                   1581: 
                   1582:                /* Set msb, lsb, and normal ptrs of dividend */
                   1583:                dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) *
                   1584:                            MULTUNITs_perunit) + tohigher(dmi);
                   1585:                nqd = dmi + 1 - modlenMULTUNITS;
                   1586:                dmpl = dmph - tohigher(modlenMULTUNITS);
                   1587: 
                   1588: /*     Divide loop.
                   1589:        Each iteration computes the next quotient MULTUNIT digit, then
                   1590:        multiplies the divisor (modulus) by the quotient digit and adds
                   1591:        it to the one's complement of the dividend (equivalent to
                   1592:        subtracting).  If the product was greater than the remaining dividend,
                   1593:        we get a non-negative result, in which case we subtract off the
                   1594:        modulus to get the proper negative remainder.
                   1595: */
                   1596:                for (; nqd; nqd--)
                   1597:                {       MULTUNIT    q;      /* quotient trial digit */
                   1598: 
                   1599:                        q = mp_quo_digit(dmph);
                   1600:                        if (q > 0)
                   1601:                        {       mp_smul(dmpl, modmpl, q);
                   1602: 
                   1603:                                /* Perform correction if q too large.
                   1604:                                *  This rarely occurs.
                   1605:                                */
                   1606:                                if (!(*dmph & MULTUNIT_msb))
                   1607:                                {       dmp = dmpl;
                   1608:                                        unmake_lsbptr(dmp, orig_precision * 
                   1609:                                                   MULTUNITs_perunit);
                   1610:                                        if (mp_musubb(dmp,
                   1611:                                                   (MULTUNIT *) modulus, 0))
                   1612:                                                (*dmph)  --;
                   1613:                                }
                   1614:                        }
                   1615:                        pre_lowerunit(dmph);
                   1616:                        pre_lowerunit(dmpl);
                   1617:                }
                   1618:                /* d contains the one's complement of the remainder. */
                   1619:                rescale(d, orig_precision * 2 + 1, orig_precision);
                   1620:                set_precision(orig_precision);
                   1621:                mp_neg(d);
                   1622:                mp_dec(d);
                   1623:        } else
                   1624:        {       /* Product was less than modulus.  Return it. */
                   1625:                rescale(d, orig_precision * 2 + 1, orig_precision);
                   1626:                set_precision(orig_precision);
                   1627:        }
                   1628:        mp_move(prod, d);
                   1629:        return (0);             /* normal return */
                   1630: }                              /* smith_modmult */
                   1631: 
                   1632: 
                   1633: /*     Smith's mp_modmult function leaves some internal arrays in memory,
                   1634:        so we have to call modmult_burn() at the end of mp_modexp.
                   1635:        This is so that no cryptographically sensitive data is left in memory
                   1636:        after the program exits.
                   1637:        smith_burn() is aliased to modmult_burn().
                   1638: */
                   1639: void smith_burn(void)
                   1640: {
                   1641:        empty_array (modulus);
                   1642:        empty_array (ds_data);
                   1643:        empty_array (mod_quotient);
                   1644:        empty_array (mod_divisor);
                   1645:        modmpl = 0;
                   1646:        mshift =nbits = 0;
                   1647:        reciph = recipl = 0;
                   1648:        modlenMULTUNITS = mutemp = 0;
                   1649:        mp_set_recip (0,0,0);
                   1650: }      /* smith_burn */
                   1651: 
                   1652: /*     End of Thad Smith's implementation of modmult. */
                   1653: 
                   1654: #endif /* SMITH */
                   1655: 
                   1656: 
                   1657: int countbits(unitptr r)
                   1658:        /* Returns number of significant bits in r */
                   1659: {      int bits;
                   1660:        short prec;
                   1661:        register unit bitmask;
                   1662:        init_bitsniffer(r,bitmask,prec,bits);
                   1663:        return(bits);
                   1664: } /* countbits */
                   1665: 
                   1666: 
                   1667: char *copyright_notice(void)
                   1668: /* force linker to include copyright notice in the executable object image. */
                   1669: { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */
                   1670: 
                   1671: 
                   1672: int mp_modexp(register unitptr expout,register unitptr expin,
                   1673:        register unitptr exponent,register unitptr modulus)
                   1674: {      /*      Russian peasant combined exponentiation/modulo algorithm.
                   1675:                Calls modmult instead of mult.
                   1676:                Computes:  expout = (expin**exponent) mod modulus
                   1677:                WARNING: All the arguments must be less than the modulus!
                   1678:        */
                   1679:        int bits;
                   1680:        short oldprecision;
                   1681:        register unit bitmask;
                   1682:        unit product[MAX_UNIT_PRECISION];
                   1683:        short eprec;
                   1684: 
                   1685: #ifdef COUNTMULTS
                   1686:        tally_modmults = 0;     /* clear "number of modmults" counter */
                   1687:        tally_modsquares = 0;   /* clear "number of modsquares" counter */
                   1688: #endif /* COUNTMULTS */
                   1689:        mp_init(expout,1);
                   1690:        if (testeq(exponent,0))
                   1691:        {       if (testeq(expin,0))
                   1692:                        return(-1); /* 0 to the 0th power means return error */
                   1693:                return(0);      /* otherwise, zero exponent means expout is 1 */
                   1694:        }
                   1695:        if (testeq(modulus,0))
                   1696:                return(-2);             /* zero modulus means error */
                   1697: #if SLOP_BITS > 0      /* if there's room for sign bits */
                   1698:        if (mp_tstminus(modulus))
                   1699:                return(-2);             /* negative modulus means error */
                   1700: #endif /* SLOP_BITS > 0 */
                   1701:        if (mp_compare(expin,modulus) >= 0)
                   1702:                return(-3); /* if expin >= modulus, return error */
                   1703:        if (mp_compare(exponent,modulus) >= 0)
                   1704:                return(-4); /* if exponent >= modulus, return error */
                   1705: 
                   1706:        oldprecision = global_precision;        /* save global_precision */
                   1707:        /* set smallest optimum precision for this modulus */
                   1708:        set_precision(bits2units(countbits(modulus)+SLOP_BITS));
                   1709:        /* rescale all these registers to global_precision we just defined */
                   1710:        rescale(modulus,oldprecision,global_precision);
                   1711:        rescale(expin,oldprecision,global_precision);
                   1712:        rescale(exponent,oldprecision,global_precision);
                   1713:        rescale(expout,oldprecision,global_precision);
                   1714: 
                   1715:        if (stage_modulus(modulus))
                   1716:        {       set_precision(oldprecision);    /* restore original precision */
                   1717:                return(-5);             /* unstageable modulus (STEWART algorithm) */
                   1718:        }
                   1719: 
                   1720:        /* normalize and compute number of bits in exponent first */
                   1721:        init_bitsniffer(exponent,bitmask,eprec,bits);
                   1722: 
                   1723:        /* We can "optimize out" the first modsquare and modmult: */
                   1724:        bits--;         /* We know for sure at this point that bits>0 */
                   1725:        mp_move(expout,expin);          /*  expout = (1*1)*expin; */
                   1726:        bump_bitsniffer(exponent,bitmask);
                   1727: 
                   1728:        while (bits--)
                   1729:        {
                   1730:                poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
                   1731: #ifdef COUNTMULTS
                   1732:                tally_modsquares++;     /* bump "number of modsquares" counter */
                   1733: #endif /* COUNTMULTS */
                   1734:                mp_modsquare(product,expout);
                   1735:                if (sniff_bit(exponent,bitmask))
                   1736:                {       mp_modmult(expout,product,expin);
                   1737: #ifdef COUNTMULTS
                   1738:                        tally_modmults++;       /* bump "number of modmults" counter */
                   1739: #endif /* COUNTMULTS */
                   1740:                } else
                   1741:                {
                   1742:                        mp_move(expout,product);
                   1743:                }
                   1744:                bump_bitsniffer(exponent,bitmask);
                   1745:        }       /* while bits-- */
                   1746:        mp_burn(product);       /* burn the evidence on the stack */
                   1747:        modmult_burn(); /* ask mp_modmult to also burn its own evidence */
                   1748: 
                   1749: #ifdef COUNTMULTS      /* diagnostic analysis */
                   1750:        {       long atomic_mults;
                   1751:                unsigned int unitcount,totalmults;
                   1752:                unitcount = bits2units(countbits(modulus));
                   1753:                /* calculation assumes modsquare takes as long as a modmult: */
                   1754:                atomic_mults = (long) tally_modmults * (unitcount * unitcount);
                   1755:                atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
                   1756:                printf("%ld atomic mults for ",atomic_mults);
                   1757:                printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
                   1758:                        tally_modsquares,tally_modmults,
                   1759:                        tally_modsquares+tally_modmults,
                   1760:                        countbits(modulus), unitcount);
                   1761:        }
                   1762: #endif /* COUNTMULTS */
                   1763: 
                   1764:        set_precision(oldprecision);    /* restore original precision */
                   1765: 
                   1766:        /* Do an explicit reference to the copyright notice so that the linker
                   1767:           will be forced to include it in the executable object image... */
                   1768:        copyright_notice();     /* has no real effect at run time */
                   1769: 
                   1770:        return(0);              /* normal return */
                   1771: }      /* mp_modexp */
                   1772: 
1.1.1.4 ! root     1773: int mp_modexp_crt(unitptr expout, unitptr expin,
        !          1774:        unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u)
        !          1775:        /*      This is a faster modexp for moduli with a known
        !          1776:                factorisation into two relatively prime factors p and q,
        !          1777:                and an input relatively prime to the modulus,
        !          1778:                the Chinese Remainder Theorem to do the computation
        !          1779:                mod p and mod q, and then combine the results.  This
        !          1780:                relies on a number of precomputed values, but does not
        !          1781:                actually require the modulus n or the exponent e.
        !          1782: 
        !          1783:                expout = expin ^ e mod (p*q).
        !          1784:                We form this by evaluating
        !          1785:                p2 = (expin ^ e) mod p and
        !          1786:                q2 = (expin ^ e) mod q
        !          1787:                and then combining the two by the CRT.
        !          1788: 
        !          1789:                Two optimisations of this are possible.  First, we can
        !          1790:                reduce expin modulo p and q before starting.
        !          1791: 
        !          1792:                Second, since we know the factorisation of p and q
        !          1793:                (trivially derived from the factorisation of n = p*q),
        !          1794:                and expin is relatively prime to both p and q,
        !          1795:                we can use Euler's theorem, expin^phi(m) = 1 (mod m),
        !          1796:                to throw away multiples of phi(p) or phi(q) in e.
        !          1797:                Letting ep = e mod phi(p) and
        !          1798:                        eq = e mod phi(q)
        !          1799:                then combining these two speedups, we only need to evaluate
        !          1800:                p2 = ((expin mod p) ^ ep) mod p and
        !          1801:                q2 = ((expin mod q) ^ eq) mod q.
        !          1802: 
        !          1803:                Now we need to apply the CRT.  Starting with
        !          1804:                expout = p2 (mod p) and
        !          1805:                expout = q2 (mod q)
        !          1806:                we can say that expout = p2 + p * k, and if we assume that
        !          1807:                0 <= p2 < p, then 0 <= expout < p*q for some 0 <= k < q.
        !          1808:                Since we want expout = q2 (mod q), then
        !          1809:                p*k = q2-p2 (mod q).  Since p and q are relatively
        !          1810:                prime, p has a multiplicative inverse u mod q.  In other
        !          1811:                words,
        !          1812:                       u = 1/p (mod q).
        !          1813:                Multiplying by u on both sides gives k = u*(q2-p2) (mod q).
        !          1814:                Since we want 0 <= k < q, we can thus find k as
        !          1815:                k = (u * (q2-p2)) mod q.
        !          1816: 
        !          1817:                Once we have k, evaluating p2 + p * k is easy, and
        !          1818:                that gives us the result.
        !          1819: 
        !          1820:                In the detailed implementation, there is a temporary, temp,
        !          1821:                used to hold intermediate results, p2 is held in expout,
        !          1822:                and q2 is used as a temporary in the final step when it is
        !          1823:                no longer needed.  With that, you should be able to
        !          1824:                understand the code below.
1.1.1.2   root     1825:        */
                   1826: {
                   1827:        unit q2[MAX_UNIT_PRECISION];
1.1.1.4 ! root     1828:        unit temp[MAX_UNIT_PRECISION];
1.1.1.2   root     1829:        int status;
                   1830: 
1.1.1.4 ! root     1831: /*     First, compiute p2 (physically held in M) */
1.1.1.2   root     1832: 
1.1.1.4 ! root     1833: /*     p2 = [ (expin mod p) ^ ep ] mod p               */
        !          1834:        mp_mod(temp,expin,p);           /* temp = expin mod p  */
        !          1835:        status = mp_modexp(expout,temp,ep,p);
1.1.1.2   root     1836:        if (status < 0) /* mp_modexp returned an error. */
1.1.1.4 ! root     1837:        {       mp_init(expout,1);
1.1.1.2   root     1838:                return(status); /* error return */
1.1.1.4 ! root     1839:        }
1.1.1.2   root     1840: 
1.1.1.4 ! root     1841: /*     And the same thing for q2 */
1.1.1.2   root     1842: 
1.1.1.4 ! root     1843: /*     q2 = [ (expin mod q) ^ eq ] mod q               */
        !          1844:        mp_mod(temp,expin,q);           /* temp = expin mod q  */
        !          1845:        status = mp_modexp(q2,temp,eq,q);
1.1.1.2   root     1846:        if (status < 0) /* mp_modexp returned an error. */
1.1.1.4 ! root     1847:        {       mp_init(expout,1);
1.1.1.2   root     1848:                return(status); /* error return */
1.1.1.4 ! root     1849:        }
1.1.1.2   root     1850: 
                   1851: /*     Now use the multiplicative inverse u to glue together the
1.1.1.4 ! root     1852:        two halves.
        !          1853: */
        !          1854: #if 0
        !          1855: /* This optimisation is useful if you commonly get small results,
        !          1856:    but for random results and large q, the odds of (1/q) of it
        !          1857:    being useful do not warrant the test.
        !          1858: */
        !          1859:        if (mp_compare(expout,q2) != 0)
        !          1860:        {
        !          1861: #endif
        !          1862:        /*      Find q2-p2 mod q */
        !          1863:        if (mp_sub(q2,expout))  /* if the result went negative */
        !          1864:                mp_add(q2,q);   /* add q to q2 */
        !          1865: 
        !          1866:        /*      expout = p2 + ( p * [(q2*u) mod q] )    */
        !          1867:        mp_mult(temp,q2,u);             /* q2*u  */
        !          1868:        mp_mod(q2,temp,q);              /* (q2*u) mod q */
        !          1869:        mp_mult(temp,p,q2);             /* p * [(q2*u) mod q] */
        !          1870:        mp_add(expout,temp);            /* expout = p2 + p * [...] */
        !          1871: #if 0
1.1.1.2   root     1872:        }
1.1.1.4 ! root     1873: #endif
1.1.1.2   root     1874: 
1.1.1.4 ! root     1875:        mp_burn(q2);    /* burn the evidence on the stack...*/
        !          1876:        mp_burn(temp);
1.1.1.2   root     1877:        return(0);      /* normal return */
1.1.1.4 ! root     1878: }      /* mp_modexp_crt */
1.1.1.2   root     1879: 
                   1880: 
                   1881: /****************** end of MPI library ****************************/

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.