Annotation of pgp/src/mpilib.c, revision 1.1.1.1

1.1       root        1: /*     C source code for multiprecision arithmetic library routines.
                      2:        Implemented Nov 86 by Philip Zimmermann
                      3:        Last revised 27 Nov 91 by PRZ
                      4: 
                      5:        Boulder Software Engineering
                      6:        3021 Eleventh Street
                      7:        Boulder, CO 80304
                      8:        (303) 541-0140
                      9: 
                     10:        (c) Copyright 1986-92 by Philip Zimmermann.  All rights reserved.
                     11:        The author assumes no liability for damages resulting from the use 
                     12:        of this software, even if the damage results from defects in this 
                     13:        software.  No warranty is expressed or implied.  The use of this 
                     14:        cryptographic software for developing weapon systems is expressly 
                     15:        forbidden.
                     16: 
                     17:        These routines implement all of the multiprecision arithmetic 
                     18:        necessary for number-theoretic cryptographic algorithms such as 
                     19:        ElGamal, Diffie-Hellman, Rabin, or factoring studies for large 
                     20:        composite numbers, as well as Rivest-Shamir-Adleman (RSA) public 
                     21:        key cryptography.
                     22: 
                     23:        Although originally developed in Microsoft C for the IBM PC, this code 
                     24:        contains few machine dependancies.  It assumes 2's complement 
                     25:        arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines, 
                     26:        lowbyte-highbyte order or highbyte-lowbyte order.  This version
                     27:        has been converted to ANSI C.
                     28: 
                     29: 
                     30:        The internal representation for these extended precision integer
                     31:        "registers" is an array of "units".  A unit is a machine word, which
                     32:        is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
                     33:        unsigned integer, depending on the machine's word size.  For example,
                     34:        an IBM PC or AT uses a unit size of 16 bits.  To perform arithmetic
                     35:        on these huge precision integers, we pass pointers to these unit
                     36:        arrays to various subroutines.  A pointer to an array of units is of
                     37:        type unitptr.  This is a pointer to a huge integer "register".
                     38: 
                     39:        When calling a subroutine, we always pass a pointer to the BEGINNING
                     40:        of the array of units, regardless of the byte order of the machine.
                     41:        On a lowbyte-first machine, such as the Intel 80x86, this unitptr
                     42:        points to the LEAST significant unit, and the array of units increases
                     43:        significance to the right.  On a highbyte-first machine, such as the
                     44:        Motorola 680x0, this unitptr points to the MOST significant unit, and
                     45:        the array of units decreases significance to the right.
                     46: 
                     47:        Modified 8 Apr 92 - HAJK
                     48:        Implement new VAX/VMS primitive support.
                     49: 
                     50: */
                     51: 
                     52: /* #define COUNTMULTS */ /* count modmults for performance studies */
                     53: 
                     54: #ifdef DEBUG
                     55: #ifdef MSDOS
                     56: #include <conio.h>
                     57: #define poll_for_break() {while (kbhit()) getch();}
                     58: #endif /* MSDOS */
                     59: #endif /* DEBUG */
                     60: 
                     61: #ifndef poll_for_break
                     62: #define poll_for_break()  /* stub */
                     63: #endif
                     64: 
                     65: #include "mpilib.h"
                     66: 
                     67: short global_precision = 0; /* units of precision for all routines */
                     68: /*     global_precision is the unit precision last set by set_precision.
                     69:        Initially, set_precision() should be called to define global_precision
                     70:        before using any of these other multiprecision library routines.
                     71:        i.e.:   set_precision(MAX_UNIT_PRECISION);
                     72: */
                     73: 
                     74: #ifdef PORTABLE
                     75: /*************** multiprecision library primitives ****************/
                     76: /*     The following portable C primitives should be recoded in assembly.
                     77:        The equivalent assembly primitives are externally defined under
                     78:        different names, unless PORTABLE is defined.  See the header file
                     79:        "mpilib.h" for further details.
                     80: 
                     81:        The carry bit mechanism we use for these portable primitives
                     82:        will only work if the size of unit is smaller than the size of
                     83:        a long integer.  In most cases, this means UNITSIZE must be
                     84:        less than 32 bits -- if you use the C portable primitives.
                     85: */
                     86: 
                     87: typedef unsigned long int ulint;
                     88: #define carrybit ((ulint) 1 << UNITSIZE)
                     89: /* ...assumes sizeof(unit) < sizeof(unsigned long) */
                     90: 
                     91: boolean mp_addc
                     92:        (register unitptr r1,register unitptr r2,register boolean carry)
                     93:        /* multiprecision add with carry r2 to r1, result in r1 */
                     94:        /* carry is incoming carry flag-- value should be 0 or 1 */
                     95: {      register ulint x;       /* won't work if sizeof(unit)==sizeof(long) */
                     96:        short precision;        /* number of units to add */
                     97:        precision = global_precision;
                     98:        make_lsbptr(r1,precision);
                     99:        make_lsbptr(r2,precision);
                    100:        while (precision--)     
                    101:        {       x = (ulint) *r1 + (ulint) *post_higherunit(r2) + (ulint) carry;
                    102:                *post_higherunit(r1) = x;
                    103:                carry = ((x & carrybit) != 0L);
                    104:        }
                    105:        return(carry);          /* return the final carry flag bit */
                    106: }      /* mp_addc */
                    107: 
                    108: 
                    109: boolean mp_subb
                    110:        (register unitptr r1,register unitptr r2,register boolean borrow)
                    111:        /* multiprecision subtract with borrow, r2 from r1, result in r1 */
                    112:        /* borrow is incoming borrow flag-- value should be 0 or 1 */
                    113: {      register ulint x;       /* won't work if sizeof(unit)==sizeof(long) */
                    114:        short precision;        /* number of units to subtract */
                    115:        precision = global_precision;
                    116:        make_lsbptr(r1,precision);
                    117:        make_lsbptr(r2,precision);
                    118:        while (precision--)     
                    119:        {       x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
                    120:                *post_higherunit(r1) = x;
                    121:                borrow = ((x & carrybit) != 0L);
                    122:        }
                    123:        return(borrow);         /* return the final carry/borrow flag bit */
                    124: }      /* mp_subb */
                    125: 
                    126: #undef carrybit
                    127: 
                    128: 
                    129: boolean mp_rotate_left(register unitptr r1,register boolean carry)
                    130:        /* multiprecision rotate left 1 bit with carry, result in r1. */
                    131:        /* carry is incoming carry flag-- value should be 0 or 1 */
                    132: {      register short precision;       /* number of units to rotate */
                    133:        register boolean nextcarry;
                    134:        precision = global_precision;
                    135:        make_lsbptr(r1,precision);
                    136:        while (precision--)     
                    137:        {       
                    138:                nextcarry = (((signedunit) *r1) < 0);
                    139:                /* nextcarry = ((*r1 & uppermostbit) != 0); */
                    140:                *r1 <<= 1 ;
                    141:                if (carry) *r1 |= 1;
                    142:                carry = nextcarry;
                    143:                pre_higherunit(r1);
                    144:        }
                    145:        return(nextcarry);      /* return the final carry flag bit */
                    146: }      /* mp_rotate_left */
                    147: 
                    148: /************** end of primitives that should be in assembly *************/
                    149: #endif /* PORTABLE */
                    150: 
                    151: 
                    152: /*     The mp_shift_right_bits function is not called in any time-critical
                    153:        situations in public-key cryptographic functions, so it doesn't 
                    154:        need to be coded in assembly language.
                    155: */
                    156: void mp_shift_right_bits(register unitptr r1,register short bits)
                    157:        /*      multiprecision shift right bits, result in r1. 
                    158:                bits is how many bits to shift, must be < UNITSIZE.
                    159:        */
                    160: {      register short precision;       /* number of units to shift */
                    161:        register unit carry,nextcarry,bitmask;
                    162:        register short unbits;
                    163:        if (bits==0) return;    /* shift zero bits is a no-op */
                    164:        carry = 0;
                    165:        bitmask = power_of_2(bits)-1;
                    166:        unbits = UNITSIZE-bits;         /* shift bits must be < UNITSIZE */
                    167:        precision = global_precision;
                    168:        make_msbptr(r1,precision);
                    169:        while (precision--)     
                    170:        {       nextcarry = *r1 & bitmask;
                    171:                *r1 >>= bits;
                    172:                *r1 |= carry << unbits;
                    173:                carry = nextcarry;
                    174:                pre_lowerunit(r1);
                    175:        }
                    176: }      /* mp_shift_right_bits */
                    177: 
                    178: 
                    179: #ifndef VMS
                    180: 
                    181: short mp_compare(register unitptr r1,register unitptr r2)
                    182: /*     Compares multiprecision integers *r1, *r2, and returns: 
                    183:        -1 iff *r1 < *r2
                    184:         0 iff *r1 == *r2
                    185:        +1 iff *r1 > *r2
                    186: */
                    187: {      register short precision;       /* number of units to compare */
                    188: 
                    189:        precision = global_precision;
                    190:        make_msbptr(r1,precision);
                    191:        make_msbptr(r2,precision);
                    192:        do      
                    193:        {       if (*r1 < *r2) 
                    194:                        return(-1);
                    195:                if (*post_lowerunit(r1) > *post_lowerunit(r2)) 
                    196:                        return(1);
                    197:        } while (--precision);  
                    198:        return(0);      /*  *r1 == *r2  */
                    199: }      /* mp_compare */
                    200: 
                    201: #endif /* VMS */
                    202: 
                    203: boolean mp_inc(register unitptr r)
                    204:        /* Increment multiprecision integer r. */
                    205: {      register short precision;
                    206:        precision = global_precision;
                    207:        make_lsbptr(r,precision);
                    208:        do      
                    209:        {       if ( ++(*r) ) return(0);        /* no carry */
                    210:                post_higherunit(r);     
                    211:        } while (--precision);
                    212:        return(1);              /* return carry set */
                    213: }      /* mp_inc */
                    214: 
                    215: 
                    216: boolean mp_dec(register unitptr r)
                    217:        /* Decrement multiprecision integer r. */
                    218: {      register short precision;
                    219:        precision = global_precision;
                    220:        make_lsbptr(r,precision);
                    221:        do      
                    222:        {       if ( (signedunit) (--(*r)) != (signedunit) -1 )
                    223:                        return(0);      /* no borrow */
                    224:                post_higherunit(r);     
                    225:        } while (--precision);
                    226:        return(1);              /* return borrow set */
                    227: }      /* mp_dec */
                    228: 
                    229: 
                    230: void mp_neg(register unitptr r)
                    231:        /* Compute 2's complement, the arithmetic negative, of r */
                    232: {      register short precision;       /* number of units to negate */
                    233:        precision = global_precision;
                    234:        mp_dec(r);      /* 2's complement is 1's complement plus 1 */
                    235:        do      /* now do 1's complement */
                    236:        {       *r = ~(*r);
                    237:                r++;    
                    238:        } while (--precision);
                    239: }      /* mp_neg */
                    240: 
                    241: #ifndef VAXC /* Replaced by a macro under VAX C */
                    242: 
                    243: void mp_move(register unitptr dst,register unitptr src)
                    244: {      register short precision;       /* number of units to move */
                    245:        precision = global_precision;
                    246:        do { *dst++ = *src++; } while (--precision);
                    247: }      /* mp_move */
                    248: 
                    249: #endif /* VAXC */
                    250: void mp_init(register unitptr r, word16 value)
                    251:        /* Init multiprecision register r with short value. */
                    252: {      /* Note that mp_init doesn't extend sign bit for >32767 */
                    253:        register short precision;       /* number of units to init */
                    254:        precision = global_precision;
                    255:        make_lsbptr(r,precision);
                    256:        *post_higherunit(r) = value; precision--;
                    257: #ifdef UNIT8
                    258:        *post_higherunit(r) = value >> UNITSIZE; precision--;
                    259: #endif
                    260: #ifdef VAXC
                    261: 
                    262:        unitfill0( r, precision); /* Use character insts. on a VAX */
                    263: 
                    264: #else /* VAXC */
                    265: 
                    266:        while (precision--)
                    267:                *post_higherunit(r) = 0;
                    268: 
                    269: #endif /* VAXC */
                    270: 
                    271: }      /* mp_init */
                    272: 
                    273: 
                    274: short significance(register unitptr r)
                    275:        /* Returns number of significant units in r */
                    276: {      register short precision;
                    277:        precision = global_precision;
                    278:        make_msbptr(r,precision);
                    279:        do      
                    280:        {       if (*post_lowerunit(r)) 
                    281:                        return(precision);
                    282:        } while (--precision);
                    283:        return(precision);
                    284: }      /* significance */
                    285: 
                    286: 
                    287: #ifndef VAXC   /* Replaced by a macro under VAX C */
                    288: 
                    289: void unitfill0(unitptr r,word16 unitcount)
                    290:        /* Zero-fill the unit buffer r. */
                    291: {      while (unitcount--) *r++ = 0;
                    292: }      /* unitfill0 */
                    293: 
                    294: #endif /* VAXC */
                    295: 
                    296: /* The macro normalize(r,precision) "normalizes" a multiprecision integer 
                    297:    by adjusting r and precision to new values.  For Motorola-style processors 
                    298:    (MSB-first), r is a pointer to the MSB of the register, and must
                    299:    be adjusted to point to the first nonzero unit.  For Intel/VAX-style
                    300:    (LSB-first) processors, r is a  pointer to the LSB of the register,
                    301:    and must be left unchanged.  The precision counter is always adjusted,
                    302:    regardless of processor type.  In the case of precision = 0,
                    303:    r becomes undefined.
                    304: */
                    305: 
                    306: 
                    307: /* The macro rescale(r,current_precision,new_precision) rescales
                    308:    a multiprecision integer by adjusting r and its precision to new values.  
                    309:    It can be used to reverse the effects of the normalize 
                    310:    routine given above.  See the comments in normalize concerning 
                    311:    Intel vs. Motorola LSB/MSB conventions.
                    312:    WARNING:  You can only safely call rescale on registers that 
                    313:    you have previously normalized with the above normalize routine, 
                    314:    or are known to be big enough for the new precision.  You may
                    315:    specify a new precision that is smaller than the current precision.
                    316:    You must be careful not to specify a new_precision value that is 
                    317:    too big, or which adjusts the r pointer out of range.
                    318: */
                    319: 
                    320: 
                    321: /*     The "bit sniffer" macros all begin sniffing at the MSB.
                    322:        They are used internally by all the various multiply, divide, 
                    323:        modulo, exponentiation, and square root functions.
                    324: */
                    325: 
                    326: 
                    327: int mp_udiv(register unitptr remainder,register unitptr quotient,
                    328:        register unitptr dividend,register unitptr divisor)
                    329:        /* Unsigned divide, treats both operands as positive. */
                    330: {      int bits;
                    331:        short dprec;
                    332:        register unit bitmask;
                    333:        if (testeq(divisor,0))
                    334:                return(-1);     /* zero divisor means divide error */
                    335:        mp_init0(remainder);
                    336:        mp_init0(quotient);
                    337:        /* normalize and compute number of bits in dividend first */
                    338:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    339:        /* rescale quotient to same precision (dprec) as dividend */
                    340:        rescale(quotient,global_precision,dprec);
                    341:        make_msbptr(quotient,dprec); 
                    342: 
                    343:        while (bits--)
                    344:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
                    345:                if (mp_compare(remainder,divisor) >= 0)
                    346:                {       mp_sub(remainder,divisor);
                    347:                        stuff_bit(quotient,bitmask);
                    348:                }
                    349:                bump_2bitsniffers(dividend,quotient,bitmask); 
                    350:        }
                    351:        return(0);
                    352: } /* mp_udiv */
                    353: 
                    354: 
                    355: #define RECIPMARGIN 0  /* extra margin bits used by mp_recip() */
                    356: 
                    357: int mp_recip(register unitptr quotient,register unitptr divisor)
                    358:        /* Compute reciprocal (quotient) as 1/divisor.  Used by faster modmult. */
                    359: {      int bits;
                    360:        short qprec;
                    361:        register unit bitmask;
                    362:        unit remainder[MAX_UNIT_PRECISION];
                    363:        if (testeq(divisor,0))
                    364:                return(-1);     /* zero divisor means divide error */
                    365:        mp_init0(remainder);
                    366:        mp_init0(quotient);
                    367: 
                    368:        /* normalize and compute number of bits in quotient first */
                    369:        bits = countbits(divisor) + RECIPMARGIN;
                    370:        bitmask = bitmsk(bits);         /* bitmask within a single unit */
                    371:        qprec = bits2units(bits+1);
                    372:        mp_setbit(remainder,(bits-RECIPMARGIN)-1);
                    373:        /* rescale quotient to precision of divisor + RECIPMARGIN bits */
                    374:        rescale(quotient,global_precision,qprec);
                    375:        make_msbptr(quotient,qprec); 
                    376: 
                    377:        while (bits--)
                    378:        {       mp_shift_left(remainder);
                    379:                if (mp_compare(remainder,divisor) >= 0)
                    380:                {       mp_sub(remainder,divisor);
                    381:                        stuff_bit(quotient,bitmask);
                    382:                }
                    383:                bump_bitsniffer(quotient,bitmask);
                    384:        }
                    385:        mp_init0(remainder);    /* burn sensitive data left on stack */
                    386:        return(0);
                    387: } /* mp_recip */
                    388: 
                    389: 
                    390: int mp_div(register unitptr remainder,register unitptr quotient,
                    391:        register unitptr dividend,register unitptr divisor)
                    392:        /* Signed divide, either or both operands may be negative. */
                    393: {      boolean dvdsign,dsign;
                    394:        int status;
                    395:        dvdsign = (mp_tstminus(dividend)!=0);
                    396:        dsign = (mp_tstminus(divisor)!=0);
                    397:        if (dvdsign) mp_neg(dividend);
                    398:        if (dsign) mp_neg(divisor);
                    399:        status = mp_udiv(remainder,quotient,dividend,divisor);
                    400:        if (dvdsign) mp_neg(dividend);          /* repair caller's dividend */
                    401:        if (dsign) mp_neg(divisor);             /* repair caller's divisor */
                    402:        if (status<0) return(status);           /* divide error? */
                    403:        if (dvdsign) mp_neg(remainder);
                    404:        if (dvdsign ^ dsign) mp_neg(quotient);
                    405:        return(status);
                    406: } /* mp_div */
                    407: 
                    408: 
                    409: word16 mp_shortdiv(register unitptr quotient,
                    410:        register unitptr dividend,register word16 divisor)
                    411: /*     This function does a fast divide and mod on a multiprecision dividend
                    412:        using a short integer divisor returning a short integer remainder.
                    413:        This is an unsigned divide.  It treats both operands as positive.
                    414:        It is used mainly for faster printing of large numbers in base 10. 
                    415: */
                    416: {      int bits;
                    417:        short dprec;
                    418:        register unit bitmask;
                    419:        register word16 remainder;
                    420:        if (!divisor)   /* if divisor == 0 */
                    421:                return(-1);     /* zero divisor means divide error */
                    422:        remainder=0;
                    423:        mp_init0(quotient);
                    424:        /* normalize and compute number of bits in dividend first */
                    425:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    426:        /* rescale quotient to same precision (dprec) as dividend */
                    427:        rescale(quotient,global_precision,dprec);
                    428:        make_msbptr(quotient,dprec); 
                    429: 
                    430:        while (bits--)
                    431:        {       remainder <<= 1;
                    432:                if (sniff_bit(dividend,bitmask))
                    433:                        remainder++;
                    434:                if (remainder >= divisor)
                    435:                {       remainder -= divisor;
                    436:                        stuff_bit(quotient,bitmask);
                    437:                }
                    438:                bump_2bitsniffers(dividend,quotient,bitmask); 
                    439:        }       
                    440:        return(remainder);
                    441: } /* mp_shortdiv */
                    442: 
                    443: 
                    444: int mp_mod(register unitptr remainder,
                    445:        register unitptr dividend,register unitptr divisor)
                    446:        /* Unsigned divide, treats both operands as positive. */
                    447: {      int bits;
                    448:        short dprec;
                    449:        register unit bitmask;
                    450:        if (testeq(divisor,0))
                    451:                return(-1);     /* zero divisor means divide error */
                    452:        mp_init0(remainder);
                    453:        /* normalize and compute number of bits in dividend first */
                    454:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    455: 
                    456:        while (bits--)
                    457:        {       mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
                    458:                msub(remainder,divisor);
                    459:                bump_bitsniffer(dividend,bitmask);
                    460:        }       
                    461:        return(0);
                    462: } /* mp_mod */
                    463: 
                    464: 
                    465: word16 mp_shortmod(register unitptr dividend,register word16 divisor)
                    466: /*     This function does a fast mod operation on a multprecision dividend
                    467:        using a short integer modulus returning a short integer remainder.
                    468:        This is an unsigned divide.  It treats both operands as positive.
                    469:        It is used mainly for fast sieve searches for large primes. 
                    470: */
                    471: {      int bits;
                    472:        short dprec;
                    473:        register unit bitmask;
                    474:        register word16 remainder;
                    475:        if (!divisor)   /* if divisor == 0 */
                    476:                return(-1);     /* zero divisor means divide error */
                    477:        remainder=0;
                    478:        /* normalize and compute number of bits in dividend first */
                    479:        init_bitsniffer(dividend,bitmask,dprec,bits);
                    480: 
                    481:        while (bits--)
                    482:        {       remainder <<= 1;
                    483:                if (sniff_bit(dividend,bitmask))
                    484:                        remainder++;
                    485:                if (remainder >= divisor) remainder -= divisor;
                    486:                bump_bitsniffer(dividend,bitmask);
                    487:        }       
                    488:        return(remainder);
                    489: } /* mp_shortmod */
                    490: 
                    491: 
                    492: 
                    493: #ifdef COMB_MULT       /* use faster "comb" multiply algorithm */
                    494:        /* We are skipping this code because it has a bug... */
                    495: 
                    496: int mp_mult(register unitptr prod,
                    497:        register unitptr multiplicand, register unitptr multiplier)
                    498:        /*      Computes multiprecision prod = multiplicand * multiplier */
                    499: {      /*      Uses interleaved comb multiply algorithm.
                    500:                This improved multiply more than twice as fast as a Russian 
                    501:                peasant multiply, because it does a lot fewer shifts. 
                    502:                Must have global_precision set to the size of the multiplicand 
                    503:                plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum 
                    504:                of the lengths of the multiplier and multiplicand.
                    505: 
                    506:                BUG ALERT:  Unfortunately, this code has a bug.  It fails for 
                    507:                some numbers.  One such example:
                    508:                x=   59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 
                    509:                x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D 
                    510:              52C8 CDC7 AFB3 61C8 243C 741B
                    511:                --which is obviously wrong.  The answer should be:
                    512:                x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A 
                    513:              C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 
                    514:                We'll have to fix this some day.  In the meantime, we'll 
                    515:                just have the compiler skip over this code. 
                    516:        */
                    517:        int bits;
                    518:        register unit bitmask;
                    519:        unitptr product, mplier, temp;
                    520:        short mprec,mprec2;
                    521:        unit mplicand[MAX_UNIT_PRECISION];
                    522:        mp_init(prod,0);                /* better clear full width--double precision */
                    523:        if (testeq(multiplicand,0))
                    524:                return(0);      /* zero multiplicand means zero product */
                    525: 
                    526:        mp_move(mplicand,multiplicand); /* save it from damage */
                    527: 
                    528:        normalize(multiplier,mprec);
                    529:        if (!mprec)
                    530:                return(0);
                    531: 
                    532:        make_lsbptr(multiplier,mprec);
                    533:        bitmask = 1;    /* start scan at LSB of multiplier */
                    534: 
                    535:        do      /* UNITSIZE times */
                    536:        {       /* do for bits 0-15 */
                    537:                product = prod;
                    538:                mplier = multiplier;
                    539:                mprec2 = mprec;
                    540:                while (mprec2--)        /* do for each word in multiplier */
                    541:                {
                    542:                        if (sniff_bit(mplier,bitmask))
                    543:                        {       if (mp_addc(product,multiplicand,0))    /* ripple carry */
                    544:                                {       /* After 1st time thru, this is rarely encountered. */
                    545:                                        temp = msbptr(product,global_precision);
                    546:                                        pre_higherunit(temp);
                    547:                                        /* temp now points to LSU of carry region. */
                    548:                                        unmake_lsbptr(temp,global_precision);
                    549:                                        mp_inc(temp);
                    550:                                }       /* ripple carry */
                    551:                        }
                    552:                        pre_higherunit(mplier);
                    553:                        pre_higherunit(product);
                    554:                }
                    555:                if (!(bitmask <<= 1))
                    556:                        break;
                    557:                mp_shift_left(multiplicand);
                    558: 
                    559:        } while (TRUE);
                    560: 
                    561:        mp_move(multiplicand,mplicand); /* recover */
                    562: 
                    563:        return(0);      /* normal return */     
                    564: }      /* mp_mult */
                    565: 
                    566: #endif /* COMB_MULT */
                    567: 
                    568: 
                    569: /*     Because the "comb" multiply has a bug, we will use the slower
                    570:        Russian peasant multiply instead.  Fortunately, the mp_mult 
                    571:        function is not called from any time-critical code.
                    572: */
                    573: 
                    574: int mp_mult(register unitptr prod,
                    575:        register unitptr multiplicand,register unitptr multiplier)
                    576:        /* Computes multiprecision prod = multiplicand * multiplier */
                    577: {      /* Uses "Russian peasant" multiply algorithm. */
                    578:        int bits;
                    579:        register unit bitmask;
                    580:        short mprec;
                    581:        mp_init(prod,0);
                    582:        if (testeq(multiplicand,0))
                    583:                return(0);      /* zero multiplicand means zero product */
                    584:        /* normalize and compute number of bits in multiplier first */
                    585:        init_bitsniffer(multiplier,bitmask,mprec,bits);
                    586: 
                    587:        while (bits--)
                    588:        {       mp_shift_left(prod);
                    589:                if (sniff_bit(multiplier,bitmask))
                    590:                        mp_add(prod,multiplicand);
                    591:                bump_bitsniffer(multiplier,bitmask);
                    592:        }
                    593:        return(0);      
                    594: }      /* mp_mult */
                    595: 
                    596: 
                    597: 
                    598: /*     mp_modmult computes a multiprecision multiply combined with a 
                    599:        modulo operation.  This is the most time-critical function in
                    600:        this multiprecision arithmetic library for performing modulo
                    601:        exponentiation.  We experimented with different versions of modmult,
                    602:        depending on the machine architecture and performance requirements.
                    603:        We will either use a Russian Peasant modmult (peasant_modmult), 
                    604:        Charlie Merritt's modmult (merritt_modmult) or Jimmy Upton's
                    605:        modmult (upton_modmult).  On machines with a hardware atomic 
                    606:        multiply instruction, Upton's modmult is fastest, which depends
                    607:        on an assembly subroutine to speed up the hardware multiply logic.
                    608:        If the machine lacks a fast hardware multiply, Merritt's modmult
                    609:        is preferred, which doesn't call any assembly multiply routine.
                    610:        We use the alias names mp_modmult, stage_modulus, and modmult_burn 
                    611:        for the corresponding true names, which depend on what flavor of 
                    612:        modmult we are using.
                    613: 
                    614:        Before making the first call to mp_modmult, you must set up the 
                    615:        modulus-dependant precomputated tables by calling stage_modulus.
                    616:        After making all the calls to mp_modmult, you call modmult_burn to 
                    617:        erase the tables created by stage_modulus that were left in memory.
                    618: */
                    619: 
                    620: #ifdef COUNTMULTS
                    621: /* "number of modmults" counters, used for performance studies. */
                    622: static unsigned int tally_modmults = 0;
                    623: static unsigned int tally_modsquares = 0;
                    624: #endif /* COUNTMULTS */
                    625: 
                    626: 
                    627: #ifdef PEASANT
                    628: /* Conventional Russian peasant multiply with modulo algorithm. */
                    629: 
                    630: static unitptr modulus = 0;    /* used only by mp_modmult */
                    631: 
                    632: int stage_peasant_modulus(unitptr n)
                    633: /*     Must pass modulus to stage_modulus before calling modmult.
                    634:        Assumes that global_precision has already been adjusted to the
                    635:        size of the modulus, plus SLOP_BITS.
                    636: */
                    637: {      /* For this simple version of modmult, just copy unit pointer. */
                    638:        modulus = n;
                    639:        return(0);      /* normal return */
                    640: }      /* stage_peasant_modulus */
                    641: 
                    642: 
                    643: int peasant_modmult(register unitptr prod,
                    644:        unitptr multiplicand,register unitptr multiplier)
                    645: {      /*      "Russian peasant" multiply algorithm, combined with a modulo 
                    646:                operation.  This is a simple naive replacement for Merritt's 
                    647:                faster modmult algorithm.  References global unitptr "modulus".
                    648:                Computes:  prod = (multiplicand*multiplier) mod modulus
                    649:                WARNING: All the arguments must be less than the modulus!
                    650:        */
                    651:        int bits;
                    652:        register unit bitmask;
                    653:        short mprec;
                    654:        mp_init(prod,0);
                    655: /*     if (testeq(multiplicand,0))
                    656:                return(0); */   /* zero multiplicand means zero product */
                    657:        /* normalize and compute number of bits in multiplier first */
                    658:        init_bitsniffer(multiplier,bitmask,mprec,bits);
                    659: 
                    660:        while (bits--)
                    661:        {       mp_shift_left(prod);
                    662:                msub(prod,modulus);     /* turns mult into modmult */
                    663:                if (sniff_bit(multiplier,bitmask))
                    664:                {       mp_add(prod,multiplicand);
                    665:                        msub(prod,modulus);     /* turns mult into modmult */
                    666:                }
                    667:                bump_bitsniffer(multiplier,bitmask);
                    668:        }
                    669:        return(0);
                    670: }      /* peasant_modmult */
                    671: 
                    672: 
                    673: /*     If we are using a version of mp_modmult that uses internal tables 
                    674:        in memory, we have to call modmult_burn() at the end of mp_modexp.
                    675:        This is so that no sensitive data is left in memory after the program 
                    676:        exits.  The Russian peasant method doesn't use any such tables.
                    677: */
                    678: static void peasant_burn(void)
                    679: /*     Alias for modmult_burn, called only from mp_modexp().  Destroys
                    680:        internal modmult tables.  This version does nothing because no 
                    681:        tables are used by the Russian peasant modmult. */
                    682: { }    /* peasant_burn */
                    683: 
                    684: #endif /* PEASANT */
                    685: 
                    686: 
                    687: #ifdef MERRITT
                    688: /*=========================================================================*/
                    689: /*
                    690:        This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
                    691:        Also refined by Zhahai Stewart to reduce the number of subtracts.
                    692:        It performs a multiply combined with a modulo operation, without 
                    693:        going into "double precision".  It is faster than the Russian peasant
                    694:        method, and still works well on machines that lack a fast hardware 
                    695:        multiply instruction.  
                    696: */
                    697: 
                    698: /*     The following support functions, macros, and data structures
                    699:        are used only by Merritt's modmult algorithm... */
                    700: 
                    701: static void mp_lshift_unit(register unitptr r1)
                    702: /*     Shift r1 1 whole unit to the left.  Used only by modmult function. */
                    703: {      register short precision;
                    704:        register unitptr r2;
                    705:        precision = global_precision;
                    706:        make_msbptr(r1,precision);
                    707:        r2 = r1;
                    708:        while (--precision)
                    709:                *post_lowerunit(r1) = *pre_lowerunit(r2);
                    710:        *r1 = 0;
                    711: }      /* mp_lshift_unit */
                    712: 
                    713: 
                    714: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
                    715: static unit moduli_buf[UNITSIZE][MAX_UNIT_PRECISION] = {0};
                    716: static unitptr moduli[UNITSIZE+1] = /* contains pointers into moduli_buf */
                    717: {      0
                    718:        ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0], 
                    719:         &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0], &moduli_buf[ 7][0]
                    720: #ifndef UNIT8
                    721:        ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0], 
                    722:         &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0], &moduli_buf[15][0] 
                    723: #ifndef UNIT16 /* and not UNIT8 */
                    724:        ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0], 
                    725:         &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0], 
                    726:         &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0], 
                    727:         &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0], &moduli_buf[31][0]
                    728: #endif /* UNIT16 and UNIT8 not defined */
                    729: #endif /* UNIT8 not defined */
                    730: };
                    731: 
                    732: /*     To optimize msubs, need following 2 unit arrays, each filled
                    733:        with the most significant unit and next-to-most significant unit
                    734:        of the preshifted images of the modulus. */
                    735: static unit msu_moduli[UNITSIZE+1] = {0}; /* most signif. unit */
                    736: static unit nmsu_moduli[UNITSIZE+1] = {0}; /* next-most signif. unit */
                    737: 
                    738: /*     mpdbuf contains preshifted images of the multiplicand, mod n.
                    739:        It is used only by mp_modmult.  It could be staticly declared
                    740:        inside of mp_modmult, but we put it outside mp_modmult so that 
                    741:        it can be wiped clean by modmult_burn(), which is called at the
                    742:        end of mp_modexp.  This is so that no sensitive data is left in 
                    743:        memory after the program exits.
                    744: */
                    745: static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
                    746: 
                    747: 
                    748: static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
                    749: /*     Computes UNITSIZE images of r, each shifted left 1 more bit.
                    750:        Used only by modmult function. 
                    751: */
                    752: {      short int i;
                    753:        images[0] = r;  /* no need to move the first image, just copy ptr */
                    754:        for (i=1; i<UNITSIZE; i++)
                    755:        {       mp_move(images[i],images[i-1]);
                    756:                mp_shift_left(images[i]);
                    757:        }
                    758: }      /* stage_mp_images */
                    759: 
                    760: 
                    761: int stage_merritt_modulus(unitptr n)
                    762: /*     Computes UNITSIZE+1 images of modulus, each shifted left 1 more bit.
                    763:        Before calling mp_modmult, you must first stage the moduli images by
                    764:        calling stage_modulus.  n is the pointer to the modulus.
                    765:        Assumes that global_precision has already been adjusted to the
                    766:        size of the modulus, plus SLOP_BITS.
                    767: */
                    768: {      short int i;
                    769:        unitptr msu;    /* ptr to most significant unit, for faster msubs */
                    770:        moduli[0] = n;  /* no need to move the first image, just copy ptr */
                    771: 
                    772:        /* used by optimized msubs macro... */
                    773:        msu = msbptr(n,global_precision);       /* needed by msubs */
                    774:        msu_moduli[0] = *post_lowerunit(msu);
                    775:        nmsu_moduli[0] = *msu;
                    776: 
                    777:        for (i=1; i<UNITSIZE+1; i++)
                    778:        {       mp_move(moduli[i],moduli[i-1]);
                    779:                mp_shift_left(moduli[i]);
                    780: 
                    781:                /* used by optimized msubs macro... */
                    782:                msu = msbptr(moduli[i],global_precision);       /* needed by msubs */
                    783:                msu_moduli[i] = *post_lowerunit(msu);   /* for faster msubs */
                    784:                nmsu_moduli[i] = *msu;
                    785:        }
                    786:        return(0);      /* normal return */
                    787: }      /* stage_merritt_modulus */
                    788: 
                    789: 
                    790: /* The following macros, sniffadd and msubs, are used by modmult... */
                    791: #define sniffadd(i) if (*multiplier & power_of_2(i))  mp_add(prod,mpd[i])
                    792: 
                    793: /* Unoptimized msubs macro (msubs0) follows... */
                    794: /* #define msubs0(i) msub(prod,moduli[i]) 
                    795: */
                    796: 
                    797: /*     To optimize msubs, compare the most significant units of the 
                    798:        product and the shifted modulus before deciding to call the full 
                    799:        mp_compare routine.  Better still, compare the upper two units
                    800:        before deciding to call mp_compare.
                    801:        Optimization of msubs relies heavily on standard C left-to-right 
                    802:        parsimonious evaluation of logical expressions. 
                    803: */
                    804: 
                    805: /* Partially-optimized msubs macro (msubs1) follows... */
                    806: /* #define msubs1(i) if ( \
                    807:   ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
                    808:   (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
                    809:   ) mp_sub(prod,moduli[i])
                    810: */
                    811: 
                    812: /* Fully-optimized msubs macro (msubs2) follows... */
                    813: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
                    814:  (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
                    815:  (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
                    816:  mp_sub(prod,moduli[i])
                    817: 
                    818: 
                    819: int merritt_modmult(register unitptr prod,
                    820:        unitptr multiplicand,register unitptr multiplier)
                    821:        /*      Performs combined multiply/modulo operation.  
                    822:                Computes:  prod = (multiplicand*multiplier) mod modulus
                    823:                WARNING: All the arguments must be less than the modulus!
                    824:                Assumes the modulus has been predefined by first calling 
                    825:                stage_modulus.
                    826:        */
                    827: {
                    828:        /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
                    829:        register signedunit p_m;
                    830:        register unitptr msu_prod;      /* ptr to most significant unit of product */
                    831:        register unitptr nmsu_prod;     /* next-most signif. unit of product */
                    832:        short mprec;            /* precision of multiplier, in units */
                    833:        /*      Array mpd contains a list of pointers to preshifted images of 
                    834:                the multiplicand: */
                    835:        static unitptr mpd[UNITSIZE] =
                    836:        {        0,              &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
                    837:                 &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
                    838: #ifndef UNIT8
                    839:                ,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
                    840:                 &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
                    841: #ifndef UNIT16 /* and not UNIT8 */
                    842:                ,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
                    843:                 &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
                    844:                 &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
                    845:                 &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
                    846: #endif /* UNIT16 and UNIT8 not defined */
                    847: #endif /* UNIT8 not defined */
                    848:        };
                    849: 
                    850:        /* Compute preshifted images of multiplicand, mod n: */
                    851:        stage_mp_images(mpd,multiplicand);
                    852: 
                    853:        /* To optimize msubs, set up msu_prod and nmsu_prod: */
                    854:        msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
                    855:        nmsu_prod = msu_prod;
                    856:        post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
                    857: 
                    858:        /*      To understand this algorithm, it would be helpful to first 
                    859:                study the conventional Russian peasant modmult algorithm.
                    860:                This one does about the same thing as Russian peasant, but
                    861:                is organized differently to save some steps.  It loops 
                    862:                through the multiplier a word (unit) at a time, instead of 
                    863:                a bit at a time.  It word-shifts the product instead of 
                    864:                bit-shifting it, so it should be faster.  It also does about 
                    865:                half as many subtracts as Russian peasant.
                    866:        */
                    867: 
                    868:        mp_init(prod,0);        /* Initialize product to 0. */
                    869: 
                    870:        /*      The way mp_modmult is actually used in cryptographic 
                    871:                applications, there will NEVER be a zero multiplier or 
                    872:                multiplicand.  So there is no need to optimize for that 
                    873:                condition.
                    874:        */
                    875: /*     if (testeq(multiplicand,0))
                    876:                return(0); */   /* zero multiplicand means zero product */
                    877:        /* Normalize and compute number of units in multiplier first: */
                    878:        normalize(multiplier,mprec);
                    879:        if (mprec==0)   /* if precision of multiplier is 0 */
                    880:                return(0);      /* zero multiplier means zero product */
                    881:        make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
                    882: 
                    883:        while (mprec--) /* Loop for the number of units in the multiplier */
                    884:        {
                    885:                /*      Shift the product one whole unit to the left.
                    886:                        This is part of the multiply phase of modmult. 
                    887:                */
                    888: 
                    889:                mp_lshift_unit(prod);
                    890: 
                    891:                /*      Sniff each bit in the current unit of the multiplier, 
                    892:                        and conditionally add the the corresponding preshifted 
                    893:                        image of the multiplicand to the product.
                    894:                        This is also part of the multiply phase of modmult.
                    895: 
                    896:                        The following loop is unrolled for speed, using macros...
                    897: 
                    898:                for (i=UNITSIZE-1; i>=0; i--)
                    899:                   if (*multiplier & power_of_2(i)) 
                    900:                                mp_add(prod,mpd[i]);
                    901:                */
                    902: #ifndef UNIT8
                    903: #ifndef UNIT16 /* and not UNIT8 */
                    904:                sniffadd(31); 
                    905:                sniffadd(30); 
                    906:                sniffadd(29); 
                    907:                sniffadd(28);
                    908:                sniffadd(27); 
                    909:                sniffadd(26); 
                    910:                sniffadd(25); 
                    911:                sniffadd(24);
                    912:                sniffadd(23); 
                    913:                sniffadd(22); 
                    914:                sniffadd(21); 
                    915:                sniffadd(20);
                    916:                sniffadd(19); 
                    917:                sniffadd(18); 
                    918:                sniffadd(17); 
                    919:                sniffadd(16);
                    920: #endif /* not UNIT16 and not UNIT8 */
                    921:                sniffadd(15); 
                    922:                sniffadd(14); 
                    923:                sniffadd(13); 
                    924:                sniffadd(12);
                    925:                sniffadd(11); 
                    926:                sniffadd(10); 
                    927:                sniffadd(9); 
                    928:                sniffadd(8);
                    929: #endif /* not UNIT8 */
                    930:                sniffadd(7); 
                    931:                sniffadd(6); 
                    932:                sniffadd(5); 
                    933:                sniffadd(4);
                    934:                sniffadd(3); 
                    935:                sniffadd(2); 
                    936:                sniffadd(1); 
                    937:                sniffadd(0);
                    938: 
                    939:                /*      The product may have grown by as many as UNITSIZE+1 
                    940:                        bits.  That's why we have global_precision set to the
                    941:                        size of the modulus plus UNITSIZE+1 slop bits.  
                    942:                        Now reduce the product back down by conditionally 
                    943:                        subtracting     the UNITSIZE+1 preshifted images of the 
                    944:                        modulus.  This is the modulo reduction phase of modmult.
                    945: 
                    946:                        The following loop is unrolled for speed, using macros...
                    947: 
                    948:                for (i=UNITSIZE; i>=0; i--)
                    949:                        if (mp_compare(prod,moduli[i]) >= 0) 
                    950:                                mp_sub(prod,moduli[i]); 
                    951:                */
                    952: #ifndef UNIT8
                    953: #ifndef UNIT16 /* and not UNIT8 */
                    954:                msubs(32); 
                    955:                msubs(31); 
                    956:                msubs(30); 
                    957:                msubs(29); 
                    958:                msubs(28);
                    959:                msubs(27); 
                    960:                msubs(26); 
                    961:                msubs(25); 
                    962:                msubs(24);
                    963:                msubs(23); 
                    964:                msubs(22); 
                    965:                msubs(21); 
                    966:                msubs(20);
                    967:                msubs(19); 
                    968:                msubs(18); 
                    969:                msubs(17); 
                    970: #endif /* not UNIT16 and not UNIT8 */
                    971:                msubs(16);
                    972:                msubs(15); 
                    973:                msubs(14); 
                    974:                msubs(13); 
                    975:                msubs(12);
                    976:                msubs(11); 
                    977:                msubs(10); 
                    978:                msubs(9); 
                    979: #endif /* not UNIT8 */
                    980:                msubs(8);
                    981:                msubs(7); 
                    982:                msubs(6); 
                    983:                msubs(5); 
                    984:                msubs(4);
                    985:                msubs(3); 
                    986:                msubs(2); 
                    987:                msubs(1); 
                    988:                msubs(0);
                    989: 
                    990:                /* Bump pointer to next lower unit of multiplier: */
                    991:                post_lowerunit(multiplier);
                    992: 
                    993:        }       /* Loop for the number of units in the multiplier */
                    994: 
                    995:        return(0);      /* normal return */
                    996: 
                    997: }      /* merritt_modmult */
                    998: 
                    999: 
                   1000: #undef msubs
                   1001: #undef sniffadd
                   1002: 
                   1003: 
                   1004: /*     Merritt's mp_modmult function leaves some internal tables in memory,
                   1005:        so we have to call modmult_burn() at the end of mp_modexp.  
                   1006:        This is so that no cryptographically sensitive data is left in memory 
                   1007:        after the program exits.
                   1008: */
                   1009: static void merritt_burn(void)
                   1010: /*     Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
                   1011: {      unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
                   1012:        unitfill0(&(moduli_buf[0][0]),(UNITSIZE)*MAX_UNIT_PRECISION);
                   1013:        unitfill0(msu_moduli,UNITSIZE+1);
                   1014:        unitfill0(nmsu_moduli,UNITSIZE+1);
                   1015: } /* merritt_burn() */
                   1016: 
                   1017: /******* end of Merritt's MODMULT stuff. *******/
                   1018: /*=========================================================================*/
                   1019: #endif /* MERRITT */
                   1020: 
                   1021: 
                   1022: #ifdef UPTON   /* using Jimmy Upton's modmult algorithm */
                   1023: 
                   1024: /*     Jimmy Upton's multiprecision modmult algorithm in C.
                   1025:        Performs a multiply combined with a modulo operation.  
                   1026: 
                   1027:        The following support functions and data structures
                   1028:        are used only by Upton's modmult algorithm...
                   1029: */
                   1030: 
                   1031: /*     The MULTUNIT word is the biggest word size for the native atomic 
                   1032:        multiply instruction.  It may or may not be smaller than UNITSIZE. 
                   1033:        Many CPU's have 16-by-16-bit multiplies, yielding a 32-bit product.
                   1034:        In that case, MULTUINIT should be a 16-bit word, even if the rest of
                   1035:        the multiprecision arithmetic functions assume a 32-bit UNIT word.
                   1036: */
                   1037: typedef unsigned short MULTUNIT;
                   1038: #define        MULTUNITSIZE    (8*sizeof(MULTUNIT))    /* size in bits */
                   1039: 
                   1040: static short unit_prec;        /* global_precision expressed in MULTUNITs */
                   1041: 
                   1042: 
                   1043: /*     Define MPORTABLE if there is no assembly version of the mp_smul
                   1044:        function available.  An assembly version is much faster.
                   1045:        Note that since the SPARC CPU has no hardware integer multiply 
                   1046:        instruction, there is not that much advantage in having an 
                   1047:        assembly version of mp_smul on that machine.  It might be faster
                   1048:        to use Merritt's modmult instead of Upton's modmult on the SPARC.
                   1049: */
                   1050: #ifdef MSDOS   /* we do have an assembly version available on the 8086 */
                   1051: #undef MPORTABLE
                   1052: #endif
                   1053: 
                   1054: #ifdef MPORTABLE  /* use slow portable C version instead of assembly */
                   1055: 
                   1056: /* 
                   1057:        Multiply the single-word multiplier times the multiprecision integer 
                   1058:        in multiplicand, accumulating result in prod.  The resulting 
                   1059:        multiprecision prod will be 1 word longer than the multiplicand.   
                   1060:        multiplicand is unit_prec words long.  We add into prod, so caller 
                   1061:        should zero it out first.  For best results, this time-critical 
                   1062:        function should be implemented in assembly.
                   1063:        NOTE:  Unlike other functions in the multiprecision arithmetic 
                   1064:        library, both multiplicand and prod are pointing at the LSB, 
                   1065:        regardless of byte order of the machine.  On an 80x86, this makes 
                   1066:        no difference.  But if this assembly function is implemented
                   1067:        on a 680x0, it becomes important.
                   1068: */
                   1069: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
                   1070: {
                   1071:        short i;
                   1072:        unsigned long p, carry;
                   1073: 
                   1074:        carry = 0;
                   1075:        for (i=0; i<unit_prec; ++i)
                   1076:        {       p = (unsigned long)multiplier * *post_higherunit(multiplicand);
                   1077:                p += *prod + carry;
                   1078:                *post_higherunit(prod) = (MULTUNIT)p;
                   1079:                carry = p >> MULTUNITSIZE;
                   1080:        }
                   1081:        /* We know that the high-order word of prod will always be 0 */
                   1082:        *prod = (MULTUNIT)carry;        /* copy carry to empty high word of prod */
                   1083: }      /* mp_smul */
                   1084: 
                   1085: #else  /* not MPORTABLE-- use assembly routine */
                   1086: 
                   1087: #define mp_smul P_SMUL /* use external assembly routine */
                   1088: 
                   1089: #endif /* MPORTABLE */
                   1090: 
                   1091: #ifdef VMS
                   1092: 
                   1093: #define mp_dmul p_dmul /* use external assembly routine */
                   1094: 
                   1095: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
                   1096: 
                   1097: #else /* VMS */
                   1098: 
                   1099: /*     mp_dmul is a double-precision multiply multiplicand times multiplier, 
                   1100:        result into prod.  prod must be pointing at a "double precision" 
                   1101:        buffer.  E.g. If global_precision is 10 words, prod must be 
                   1102:        pointing at a 20-word buffer.
                   1103: */
                   1104: static void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1105: {
                   1106:        register        int i;
                   1107:        register        MULTUNIT *p_multiplicand, *p_multiplier;
                   1108:        register        MULTUNIT *prodp;
                   1109: 
                   1110:        unitfill0(prod,global_precision*2);     /* Pre-zero prod */
                   1111:        /* Calculate precision in units of MULTUNIT */
                   1112:        unit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
                   1113:        p_multiplicand = (MULTUNIT *)multiplicand;
                   1114:        p_multiplier = (MULTUNIT *)multiplier;
                   1115:        prodp = (MULTUNIT *)prod;
                   1116:        make_lsbptr(p_multiplicand,unit_prec);
                   1117:        make_lsbptr(p_multiplier,unit_prec);
                   1118:        make_lsbptr(prodp,unit_prec*2);
                   1119:        /* Multiply multiplicand by each word in multiplier, accumulating prod: */
                   1120:        for (i=0; i<unit_prec; ++i)
                   1121:                mp_smul (post_higherunit(prodp), 
                   1122:                        p_multiplicand, *post_higherunit(p_multiplier));
                   1123: }      /* mp_dmul */
                   1124: 
                   1125: #endif /* VMS */
                   1126: 
                   1127: /*     These scratchpad arrays are used only by upton_modmult (mp_modmult).
                   1128:        Some of them could be staticly declared inside of mp_modmult, but we 
                   1129:        put them outside mp_modmult so that they can be wiped clean by 
                   1130:        modmult_burn(), which is called at the end of mp_modexp.  This is 
                   1131:        so that no sensitive data is left in memory after the program exits.
                   1132: */
                   1133: 
                   1134: #ifdef VAXC
                   1135: /*
                   1136:  * Operations on arrays are sped up on a VAX if the data is quadword (64-bit)
                   1137:  * aligned. This is because many VAXen use a minimum of 64-bit data paths to
                   1138:  * cache and memory. This technique is VAX C private, hence the conditional.
                   1139:  */
                   1140: static unit _align( quadword ) modulus[MAX_UNIT_PRECISION];
                   1141: static unit _align( quadword ) reciprocal[MAX_UNIT_PRECISION];
                   1142: static unit _align( quadword ) dhi[MAX_UNIT_PRECISION];
                   1143: static unit _align( quadword ) d_data[MAX_UNIT_PRECISION*2];    /* double-wide register */
                   1144: static unit _align( quadword ) e_data[MAX_UNIT_PRECISION*2];    /* double-wide register */
                   1145: static unit _align( quadword ) f_data[MAX_UNIT_PRECISION*2];    /* double-wide register */
                   1146: 
                   1147: #else /* VAXC */
                   1148: 
                   1149: static unit modulus[MAX_UNIT_PRECISION];
                   1150: static unit reciprocal[MAX_UNIT_PRECISION];
                   1151: static unit dhi[MAX_UNIT_PRECISION];
                   1152: static unit d_data[MAX_UNIT_PRECISION*2];      /* double-wide register */
                   1153: static unit e_data[MAX_UNIT_PRECISION*2];      /* double-wide register */
                   1154: static unit f_data[MAX_UNIT_PRECISION*2];      /* double-wide register */
                   1155: 
                   1156: #endif /* VAXC */
                   1157: 
                   1158: static short nbits;
                   1159: static short nbitsDivUNITSIZE;
                   1160: static short nbitsModUNITSIZE;
                   1161: 
                   1162: /*     stage_upton_modulus() is aliased to stage_modulus().
                   1163:        Prepare for an Upton modmult.  Calculate the reciprocal of modulus,
                   1164:        and save both.  Note that reciprocal will have one more bit than 
                   1165:        modulus.
                   1166:        Assumes that global_precision has already been adjusted to the
                   1167:        size of the modulus, plus SLOP_BITS.
                   1168: */
                   1169: int stage_upton_modulus(unitptr n)
                   1170: {
                   1171:        mp_move(modulus, n);
                   1172:        mp_recip(reciprocal, modulus);
                   1173:        nbits = countbits(modulus);
                   1174:        nbitsDivUNITSIZE = nbits / UNITSIZE;
                   1175:        nbitsModUNITSIZE = nbits % UNITSIZE;
                   1176:        return(0);      /* normal return */
                   1177: }      /* stage_upton_modulus */
                   1178: 
                   1179: 
                   1180: /*     Upton's algorithm performs a multiply combined with a modulo operation.  
                   1181:        Computes:  prod = (multiplicand*multiplier) mod modulus
                   1182:        WARNING: All the arguments must be less than the modulus!
                   1183:        References global unitptr modulus and reciprocal.
                   1184:        The reciprocal of modulus is 1 bit longer than the modulus.  
                   1185:        upton_modmult() is aliased to mp_modmult().
                   1186: */
                   1187: int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
                   1188: {
                   1189:        unitptr d = d_data;
                   1190:        unitptr d1 = d_data;
                   1191:        unitptr e = e_data;
                   1192:        unitptr f = f_data;
                   1193:        short orig_precision, dprec;
                   1194:        short i;
                   1195: 
                   1196:        orig_precision = global_precision;
                   1197:        mp_dmul (d,multiplicand,multiplier);
                   1198:        
                   1199:        /* Throw off low nbits of d */
                   1200: #ifndef HIGHFIRST
                   1201:        d1 = d + nbitsDivUNITSIZE;
                   1202: #else
                   1203:        d1 = d + orig_precision - nbitsDivUNITSIZE;
                   1204: #endif
                   1205:        mp_move(dhi, d1);       /* Don't screw up d, we need it later */
                   1206:        mp_shift_right_bits(dhi, nbitsModUNITSIZE);
                   1207: 
                   1208:        mp_dmul (e,dhi,reciprocal);     /* Note - reciprocal has nbits+1 bits */
                   1209: 
                   1210: #ifndef HIGHFIRST
                   1211:        e += nbitsDivUNITSIZE;
                   1212: #else
                   1213:        e += orig_precision - nbitsDivUNITSIZE;
                   1214: #endif
                   1215:        mp_shift_right_bits(e, nbitsModUNITSIZE);
                   1216: 
                   1217:        mp_dmul (f,e,modulus);
                   1218: 
                   1219:        /* Now for the only double-precision call to mpilib: */
                   1220:        set_precision (orig_precision * 2);
                   1221:        mp_sub (d,f);
                   1222: 
                   1223:        /* d's precision should be <= orig_precision */
                   1224:        rescale (d, orig_precision*2, orig_precision);
                   1225:        set_precision (orig_precision);
                   1226: 
                   1227:        /* Should never have to do this final subtract more than twice: */
                   1228:        while (mp_compare(d,modulus) > 0)
                   1229:                mp_sub (d,modulus);
                   1230: 
                   1231:        mp_move(prod,d);
                   1232:        return(0);      /* normal return */
                   1233: }      /* upton_modmult */
                   1234: 
                   1235: 
                   1236: /*     Upton's mp_modmult function leaves some internal arrays in memory,
                   1237:        so we have to call modmult_burn() at the end of mp_modexp.  
                   1238:        This is so that no cryptographically sensitive data is left in memory 
                   1239:        after the program exits.
                   1240:        upton_burn() is aliased to modmult_burn().
                   1241: */
                   1242: void upton_burn(void)
                   1243: {
                   1244:        unitfill0(modulus,MAX_UNIT_PRECISION);
                   1245:        unitfill0(reciprocal,MAX_UNIT_PRECISION);
                   1246:        unitfill0(dhi,MAX_UNIT_PRECISION);
                   1247:        unitfill0(d_data,MAX_UNIT_PRECISION*2);
                   1248:        unitfill0(e_data,MAX_UNIT_PRECISION*2);
                   1249:        unitfill0(f_data,MAX_UNIT_PRECISION*2);
                   1250:        nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
                   1251: }      /* upton_burn */
                   1252: 
                   1253: #endif /* UPTON */
                   1254: 
                   1255: 
                   1256: int countbits(unitptr r)
                   1257:        /* Returns number of significant bits in r */
                   1258: {      int bits;
                   1259:        short prec;
                   1260:        register unit bitmask;
                   1261:        init_bitsniffer(r,bitmask,prec,bits);
                   1262:        return(bits);
                   1263: } /* countbits */
                   1264: 
                   1265: 
                   1266: char *copyright_notice(void)
                   1267: /* force linker to include copyright notice in the executable object image. */
                   1268: { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */
                   1269: 
                   1270: 
                   1271: int mp_modexp(register unitptr expout,register unitptr expin,
                   1272:        register unitptr exponent,register unitptr modulus)
                   1273: {      /*      Russian peasant combined exponentiation/modulo algorithm.
                   1274:                Calls modmult instead of mult. 
                   1275:                Computes:  expout = (expin**exponent) mod modulus
                   1276:                WARNING: All the arguments must be less than the modulus!
                   1277:        */
                   1278:        int bits;
                   1279:        short oldprecision;
                   1280:        register unit bitmask;
                   1281:        unit product[MAX_UNIT_PRECISION];
                   1282:        short eprec;
                   1283: 
                   1284: #ifdef COUNTMULTS
                   1285:        tally_modmults = 0;     /* clear "number of modmults" counter */
                   1286:        tally_modsquares = 0;   /* clear "number of modsquares" counter */
                   1287: #endif /* COUNTMULTS */
                   1288:        mp_init(expout,1);
                   1289:        if (testeq(exponent,0))
                   1290:        {       if (testeq(expin,0))
                   1291:                        return(-1); /* 0 to the 0th power means return error */
                   1292:                return(0);      /* otherwise, zero exponent means expout is 1 */
                   1293:        }
                   1294:        if (testeq(modulus,0))
                   1295:                return(-2);             /* zero modulus means error */
                   1296: #if SLOP_BITS > 0      /* if there's room for sign bits */
                   1297:        if (mp_tstminus(modulus))
                   1298:                return(-2);             /* negative modulus means error */
                   1299: #endif /* SLOP_BITS > 0 */
                   1300:        if (mp_compare(expin,modulus) >= 0)
                   1301:                return(-3); /* if expin >= modulus, return error */
                   1302:        if (mp_compare(exponent,modulus) >= 0)
                   1303:                return(-4); /* if exponent >= modulus, return error */
                   1304: 
                   1305:        oldprecision = global_precision;        /* save global_precision */
                   1306:        /* set smallest optimum precision for this modulus */
                   1307:        set_precision(bits2units(countbits(modulus)+SLOP_BITS));
                   1308:        /* rescale all these registers to global_precision we just defined */
                   1309:        rescale(modulus,oldprecision,global_precision);
                   1310:        rescale(expin,oldprecision,global_precision);
                   1311:        rescale(exponent,oldprecision,global_precision);
                   1312:        rescale(expout,oldprecision,global_precision);
                   1313: 
                   1314:        if (stage_modulus(modulus))
                   1315:        {       set_precision(oldprecision);    /* restore original precision */
                   1316:                return(-5);             /* unstageable modulus (STEWART algorithm) */
                   1317:        }
                   1318: 
                   1319:        /* normalize and compute number of bits in exponent first */
                   1320:        init_bitsniffer(exponent,bitmask,eprec,bits);
                   1321: 
                   1322:        /* We can "optimize out" the first modsquare and modmult: */
                   1323:        bits--;         /* We know for sure at this point that bits>0 */
                   1324:        mp_move(expout,expin);          /*  expout = (1*1)*expin; */
                   1325:        bump_bitsniffer(exponent,bitmask);
                   1326:        
                   1327:        while (bits--)
                   1328:        {
                   1329:                poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
                   1330: #ifdef COUNTMULTS
                   1331:                tally_modsquares++;     /* bump "number of modsquares" counter */
                   1332: #endif /* COUNTMULTS */
                   1333:                mp_modsquare(product,expout);
                   1334:                mp_move(expout,product);
                   1335:                if (sniff_bit(exponent,bitmask))
                   1336:                {       mp_modmult(product,expout,expin);
                   1337:                        mp_move(expout,product);
                   1338: #ifdef COUNTMULTS
                   1339:                        tally_modmults++;       /* bump "number of modmults" counter */
                   1340: #endif /* COUNTMULTS */
                   1341:                }
                   1342:                bump_bitsniffer(exponent,bitmask);
                   1343:        }       /* while bits-- */
                   1344:        mp_burn(product);       /* burn the evidence on the stack */
                   1345:        modmult_burn(); /* ask mp_modmult to also burn its own evidence */
                   1346: 
                   1347: #ifdef COUNTMULTS      /* diagnostic analysis */
                   1348:        {       long atomic_mults;
                   1349:                unsigned int unitcount,totalmults;
                   1350:                unitcount = bits2units(countbits(modulus));
                   1351:                /* calculation assumes modsquare takes as long as a modmult: */
                   1352:                atomic_mults = (long) tally_modmults * (unitcount * unitcount);
                   1353:                atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
                   1354:                printf("%ld atomic mults for ",atomic_mults);
                   1355:                printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
                   1356:                        tally_modsquares,tally_modmults,
                   1357:                        tally_modsquares+tally_modmults,
                   1358:                        countbits(modulus), unitcount);
                   1359:        }
                   1360: #endif /* COUNTMULTS */
                   1361: 
                   1362:        set_precision(oldprecision);    /* restore original precision */
                   1363: 
                   1364:        /* Do an explicit reference to the copyright notice so that the linker
                   1365:           will be forced to include it in the executable object image... */
                   1366:        copyright_notice();     /* has no real effect at run time */
                   1367: 
                   1368:        return(0);              /* normal return */
                   1369: }      /* mp_modexp */
                   1370: 
                   1371: 
                   1372: 
                   1373: 
                   1374: /*********************************************************************
                   1375:        RSA-specific routines follow.  These are the only functions that 
                   1376:        are specific to the RSA public key cryptosystem.  The other
                   1377:        multiprecision integer math functions may be used for non-RSA
                   1378:        applications.  
                   1379: 
                   1380:        The RSA public key cryptosystem is patented by the Massachusetts
                   1381:        Institute of Technology (U.S. patent #4,405,829).  This patent does
                   1382:        not apply outside the USA.  Public Key Partners (PKP) holds the
                   1383:        exclusive commercial license to sell and sub-license the RSA public
                   1384:        key cryptosystem.  The author of this software implementation of the
                   1385:        RSA algorithm is providing this software for educational use only. 
                   1386:        Licensing this algorithm from PKP is the responsibility of you, the
                   1387:        user, not Philip Zimmermann, the author of this software.  The author
                   1388:        assumes no liability for any breach of patent law resulting from the
                   1389:        unlicensed use of this software by the user.
                   1390: *********************************************************************/
                   1391: 
                   1392: 
                   1393: int rsa_decrypt(unitptr M, unitptr C,
                   1394:        unitptr d, unitptr p, unitptr q, unitptr u)
                   1395:        /*      Uses Quisquater's Chinese Remainder Theorem shortcut 
                   1396:                for RSA decryption.
                   1397:                M is the output plaintext message.
                   1398:                C is the input ciphertext message.
                   1399:                d is the secret decryption exponent.
                   1400:                p and q are the secret prime factors of n.
                   1401:                u is the multiplicative inverse of p, mod q.
                   1402:                Note that u is precomputed on the assumption that p<q.
                   1403:                n, the common modulus, is not used here because of the
                   1404:                Chinese Remainder Theorem shortcut.
                   1405:        */
                   1406: {
                   1407:        unit p2[MAX_UNIT_PRECISION];
                   1408:        unit q2[MAX_UNIT_PRECISION];
                   1409:        unit temp1[MAX_UNIT_PRECISION];
                   1410:        unit temp2[MAX_UNIT_PRECISION];
                   1411:        int status;
                   1412: 
                   1413:        mp_init(M,1);   /* initialize result in case of error */
                   1414: 
                   1415:        if (mp_compare(p,q) >= 0)       /* ensure that p<q */
                   1416:        {       /* swap the pointers p and q */
                   1417:                unitptr t;
                   1418:                t = p;  p = q; q = t;
                   1419:        }
                   1420: 
                   1421: /*     Rather than decrypting by computing modexp with full mod n
                   1422:        precision, compute a shorter modexp with mod p precision... */
                   1423: 
                   1424: /*     p2 = [ (C mod p)**( d mod (p-1) ) ] mod p               */
                   1425:        mp_move(temp1,p);
                   1426:        mp_dec(temp1);                  /* temp1 = p-1 */
                   1427:        mp_mod(temp2,d,temp1);  /* temp2 = d mod (p-1) ) */
                   1428:        mp_mod(temp1,C,p);              /* temp1 = C mod p  */
                   1429:        status = mp_modexp(p2,temp1,temp2,p);
                   1430:        if (status < 0) /* mp_modexp returned an error. */
                   1431:                return(status); /* error return */
                   1432: 
                   1433: /*     Now compute a short modexp with mod q precision... */
                   1434: 
                   1435: /*     q2 = [ (C mod q)**( d mod (q-1) ) ] mod q               */
                   1436:        mp_move(temp1,q);
                   1437:        mp_dec(temp1);                  /* temp1 = q-1 */
                   1438:        mp_mod(temp2,d,temp1);  /* temp2 = d mod (q-1) */
                   1439:        mp_mod(temp1,C,q);              /* temp1 = C mod q  */
                   1440:        status = mp_modexp(q2,temp1,temp2,q);
                   1441:        if (status < 0) /* mp_modexp returned an error. */
                   1442:                return(status); /* error return */
                   1443: 
                   1444: /*     Now use the multiplicative inverse u to glue together the
                   1445:        two halves, saving a lot of time by avoiding a full mod n
                   1446:        exponentiation.  At key generation time, u was computed
                   1447:        with the secret key as the multiplicative inverse of
                   1448:        p, mod q such that:  (p*u) mod q = 1, assuming p<q.
                   1449: */
                   1450:        if (mp_compare(p2,q2) == 0)     /* only happens if C<p */
                   1451:                mp_move(M,p2);
                   1452:        else
                   1453:        {       /*      q2 = q2 - p2;  if q2<0 then q2 = q2 + q  */
                   1454:                if (mp_sub(q2,p2))      /* if the result went negative... */
                   1455:                        mp_add(q2,q);           /* add q to q2 */
                   1456: 
                   1457:                /*      M = p2 + ( p * [(q2*u) mod q] )         */
                   1458:                mp_mult(temp1,q2,u);            /* temp1 = q2*u  */
                   1459:                mp_mod(temp2,temp1,q);          /* temp2 = temp1 mod q */
                   1460:                mp_mult(temp1,p,temp2); /* temp1 = p * temp2 */
                   1461:                mp_add(temp1,p2);               /* temp1 = temp1 + p2 */
                   1462:                mp_move(M,temp1);               /* M = temp1 */
                   1463:        }
                   1464: 
                   1465:        mp_burn(p2);    /* burn the evidence on the stack...*/
                   1466:        mp_burn(q2);
                   1467:        mp_burn(temp1);
                   1468:        mp_burn(temp2);
                   1469:        return(0);      /* normal return */
                   1470: }      /* rsa_decrypt */
                   1471: 
                   1472: 
                   1473: /****************** end of MPI library ****************************/

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.