|
|
1.1 root 1: /* C source code for multiprecision arithmetic library routines.
2: Implemented Nov 86 by Philip Zimmermann
3: Last revised 27 Nov 91 by PRZ
4:
5: Boulder Software Engineering
6: 3021 Eleventh Street
7: Boulder, CO 80304
8: (303) 541-0140
9:
10: (c) Copyright 1986-92 by Philip Zimmermann. All rights reserved.
11: The author assumes no liability for damages resulting from the use
12: of this software, even if the damage results from defects in this
13: software. No warranty is expressed or implied. The use of this
14: cryptographic software for developing weapon systems is expressly
15: forbidden.
16:
17: These routines implement all of the multiprecision arithmetic
18: necessary for number-theoretic cryptographic algorithms such as
19: ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
20: composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
21: key cryptography.
22:
23: Although originally developed in Microsoft C for the IBM PC, this code
24: contains few machine dependancies. It assumes 2's complement
25: arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines,
26: lowbyte-highbyte order or highbyte-lowbyte order. This version
27: has been converted to ANSI C.
28:
29:
30: The internal representation for these extended precision integer
31: "registers" is an array of "units". A unit is a machine word, which
32: is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
33: unsigned integer, depending on the machine's word size. For example,
34: an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic
35: on these huge precision integers, we pass pointers to these unit
36: arrays to various subroutines. A pointer to an array of units is of
37: type unitptr. This is a pointer to a huge integer "register".
38:
39: When calling a subroutine, we always pass a pointer to the BEGINNING
40: of the array of units, regardless of the byte order of the machine.
41: On a lowbyte-first machine, such as the Intel 80x86, this unitptr
42: points to the LEAST significant unit, and the array of units increases
43: significance to the right. On a highbyte-first machine, such as the
44: Motorola 680x0, this unitptr points to the MOST significant unit, and
45: the array of units decreases significance to the right.
46:
47: Modified 8 Apr 92 - HAJK
48: Implement new VAX/VMS primitive support.
49:
50: */
51:
52: /* #define COUNTMULTS */ /* count modmults for performance studies */
53:
54: #ifdef DEBUG
55: #ifdef MSDOS
56: #include <conio.h>
57: #define poll_for_break() {while (kbhit()) getch();}
58: #endif /* MSDOS */
59: #endif /* DEBUG */
60:
61: #ifndef poll_for_break
62: #define poll_for_break() /* stub */
63: #endif
64:
65: #include "mpilib.h"
66:
67: short global_precision = 0; /* units of precision for all routines */
68: /* global_precision is the unit precision last set by set_precision.
69: Initially, set_precision() should be called to define global_precision
70: before using any of these other multiprecision library routines.
71: i.e.: set_precision(MAX_UNIT_PRECISION);
72: */
73:
74: #ifdef PORTABLE
75: /*************** multiprecision library primitives ****************/
76: /* The following portable C primitives should be recoded in assembly.
77: The equivalent assembly primitives are externally defined under
78: different names, unless PORTABLE is defined. See the header file
79: "mpilib.h" for further details.
80:
81: The carry bit mechanism we use for these portable primitives
82: will only work if the size of unit is smaller than the size of
83: a long integer. In most cases, this means UNITSIZE must be
84: less than 32 bits -- if you use the C portable primitives.
85: */
86:
87: typedef unsigned long int ulint;
88: #define carrybit ((ulint) 1 << UNITSIZE)
89: /* ...assumes sizeof(unit) < sizeof(unsigned long) */
90:
91: boolean mp_addc
92: (register unitptr r1,register unitptr r2,register boolean carry)
93: /* multiprecision add with carry r2 to r1, result in r1 */
94: /* carry is incoming carry flag-- value should be 0 or 1 */
95: { register ulint x; /* won't work if sizeof(unit)==sizeof(long) */
96: short precision; /* number of units to add */
97: precision = global_precision;
98: make_lsbptr(r1,precision);
99: make_lsbptr(r2,precision);
100: while (precision--)
101: { x = (ulint) *r1 + (ulint) *post_higherunit(r2) + (ulint) carry;
102: *post_higherunit(r1) = x;
103: carry = ((x & carrybit) != 0L);
104: }
105: return(carry); /* return the final carry flag bit */
106: } /* mp_addc */
107:
108:
109: boolean mp_subb
110: (register unitptr r1,register unitptr r2,register boolean borrow)
111: /* multiprecision subtract with borrow, r2 from r1, result in r1 */
112: /* borrow is incoming borrow flag-- value should be 0 or 1 */
113: { register ulint x; /* won't work if sizeof(unit)==sizeof(long) */
114: short precision; /* number of units to subtract */
115: precision = global_precision;
116: make_lsbptr(r1,precision);
117: make_lsbptr(r2,precision);
118: while (precision--)
119: { x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
120: *post_higherunit(r1) = x;
121: borrow = ((x & carrybit) != 0L);
122: }
123: return(borrow); /* return the final carry/borrow flag bit */
124: } /* mp_subb */
125:
126: #undef carrybit
127:
128:
129: boolean mp_rotate_left(register unitptr r1,register boolean carry)
130: /* multiprecision rotate left 1 bit with carry, result in r1. */
131: /* carry is incoming carry flag-- value should be 0 or 1 */
132: { register short precision; /* number of units to rotate */
133: register boolean nextcarry;
134: precision = global_precision;
135: make_lsbptr(r1,precision);
136: while (precision--)
137: {
138: nextcarry = (((signedunit) *r1) < 0);
139: /* nextcarry = ((*r1 & uppermostbit) != 0); */
140: *r1 <<= 1 ;
141: if (carry) *r1 |= 1;
142: carry = nextcarry;
143: pre_higherunit(r1);
144: }
145: return(nextcarry); /* return the final carry flag bit */
146: } /* mp_rotate_left */
147:
148: /************** end of primitives that should be in assembly *************/
149: #endif /* PORTABLE */
150:
151:
152: /* The mp_shift_right_bits function is not called in any time-critical
153: situations in public-key cryptographic functions, so it doesn't
154: need to be coded in assembly language.
155: */
156: void mp_shift_right_bits(register unitptr r1,register short bits)
157: /* multiprecision shift right bits, result in r1.
158: bits is how many bits to shift, must be < UNITSIZE.
159: */
160: { register short precision; /* number of units to shift */
161: register unit carry,nextcarry,bitmask;
162: register short unbits;
163: if (bits==0) return; /* shift zero bits is a no-op */
164: carry = 0;
165: bitmask = power_of_2(bits)-1;
166: unbits = UNITSIZE-bits; /* shift bits must be < UNITSIZE */
167: precision = global_precision;
168: make_msbptr(r1,precision);
169: while (precision--)
170: { nextcarry = *r1 & bitmask;
171: *r1 >>= bits;
172: *r1 |= carry << unbits;
173: carry = nextcarry;
174: pre_lowerunit(r1);
175: }
176: } /* mp_shift_right_bits */
177:
178:
179: #ifndef VMS
180:
181: short mp_compare(register unitptr r1,register unitptr r2)
182: /* Compares multiprecision integers *r1, *r2, and returns:
183: -1 iff *r1 < *r2
184: 0 iff *r1 == *r2
185: +1 iff *r1 > *r2
186: */
187: { register short precision; /* number of units to compare */
188:
189: precision = global_precision;
190: make_msbptr(r1,precision);
191: make_msbptr(r2,precision);
192: do
193: { if (*r1 < *r2)
194: return(-1);
195: if (*post_lowerunit(r1) > *post_lowerunit(r2))
196: return(1);
197: } while (--precision);
198: return(0); /* *r1 == *r2 */
199: } /* mp_compare */
200:
201: #endif /* VMS */
202:
203: boolean mp_inc(register unitptr r)
204: /* Increment multiprecision integer r. */
205: { register short precision;
206: precision = global_precision;
207: make_lsbptr(r,precision);
208: do
209: { if ( ++(*r) ) return(0); /* no carry */
210: post_higherunit(r);
211: } while (--precision);
212: return(1); /* return carry set */
213: } /* mp_inc */
214:
215:
216: boolean mp_dec(register unitptr r)
217: /* Decrement multiprecision integer r. */
218: { register short precision;
219: precision = global_precision;
220: make_lsbptr(r,precision);
221: do
222: { if ( (signedunit) (--(*r)) != (signedunit) -1 )
223: return(0); /* no borrow */
224: post_higherunit(r);
225: } while (--precision);
226: return(1); /* return borrow set */
227: } /* mp_dec */
228:
229:
230: void mp_neg(register unitptr r)
231: /* Compute 2's complement, the arithmetic negative, of r */
232: { register short precision; /* number of units to negate */
233: precision = global_precision;
234: mp_dec(r); /* 2's complement is 1's complement plus 1 */
235: do /* now do 1's complement */
236: { *r = ~(*r);
237: r++;
238: } while (--precision);
239: } /* mp_neg */
240:
241: #ifndef VAXC /* Replaced by a macro under VAX C */
242:
243: void mp_move(register unitptr dst,register unitptr src)
244: { register short precision; /* number of units to move */
245: precision = global_precision;
246: do { *dst++ = *src++; } while (--precision);
247: } /* mp_move */
248:
249: #endif /* VAXC */
250: void mp_init(register unitptr r, word16 value)
251: /* Init multiprecision register r with short value. */
252: { /* Note that mp_init doesn't extend sign bit for >32767 */
253: register short precision; /* number of units to init */
254: precision = global_precision;
255: make_lsbptr(r,precision);
256: *post_higherunit(r) = value; precision--;
257: #ifdef UNIT8
258: *post_higherunit(r) = value >> UNITSIZE; precision--;
259: #endif
260: #ifdef VAXC
261:
262: unitfill0( r, precision); /* Use character insts. on a VAX */
263:
264: #else /* VAXC */
265:
266: while (precision--)
267: *post_higherunit(r) = 0;
268:
269: #endif /* VAXC */
270:
271: } /* mp_init */
272:
273:
274: short significance(register unitptr r)
275: /* Returns number of significant units in r */
276: { register short precision;
277: precision = global_precision;
278: make_msbptr(r,precision);
279: do
280: { if (*post_lowerunit(r))
281: return(precision);
282: } while (--precision);
283: return(precision);
284: } /* significance */
285:
286:
287: #ifndef VAXC /* Replaced by a macro under VAX C */
288:
289: void unitfill0(unitptr r,word16 unitcount)
290: /* Zero-fill the unit buffer r. */
291: { while (unitcount--) *r++ = 0;
292: } /* unitfill0 */
293:
294: #endif /* VAXC */
295:
296: /* The macro normalize(r,precision) "normalizes" a multiprecision integer
297: by adjusting r and precision to new values. For Motorola-style processors
298: (MSB-first), r is a pointer to the MSB of the register, and must
299: be adjusted to point to the first nonzero unit. For Intel/VAX-style
300: (LSB-first) processors, r is a pointer to the LSB of the register,
301: and must be left unchanged. The precision counter is always adjusted,
302: regardless of processor type. In the case of precision = 0,
303: r becomes undefined.
304: */
305:
306:
307: /* The macro rescale(r,current_precision,new_precision) rescales
308: a multiprecision integer by adjusting r and its precision to new values.
309: It can be used to reverse the effects of the normalize
310: routine given above. See the comments in normalize concerning
311: Intel vs. Motorola LSB/MSB conventions.
312: WARNING: You can only safely call rescale on registers that
313: you have previously normalized with the above normalize routine,
314: or are known to be big enough for the new precision. You may
315: specify a new precision that is smaller than the current precision.
316: You must be careful not to specify a new_precision value that is
317: too big, or which adjusts the r pointer out of range.
318: */
319:
320:
321: /* The "bit sniffer" macros all begin sniffing at the MSB.
322: They are used internally by all the various multiply, divide,
323: modulo, exponentiation, and square root functions.
324: */
325:
326:
327: int mp_udiv(register unitptr remainder,register unitptr quotient,
328: register unitptr dividend,register unitptr divisor)
329: /* Unsigned divide, treats both operands as positive. */
330: { int bits;
331: short dprec;
332: register unit bitmask;
333: if (testeq(divisor,0))
334: return(-1); /* zero divisor means divide error */
335: mp_init0(remainder);
336: mp_init0(quotient);
337: /* normalize and compute number of bits in dividend first */
338: init_bitsniffer(dividend,bitmask,dprec,bits);
339: /* rescale quotient to same precision (dprec) as dividend */
340: rescale(quotient,global_precision,dprec);
341: make_msbptr(quotient,dprec);
342:
343: while (bits--)
344: { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
345: if (mp_compare(remainder,divisor) >= 0)
346: { mp_sub(remainder,divisor);
347: stuff_bit(quotient,bitmask);
348: }
349: bump_2bitsniffers(dividend,quotient,bitmask);
350: }
351: return(0);
352: } /* mp_udiv */
353:
354:
355: #define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */
356:
357: int mp_recip(register unitptr quotient,register unitptr divisor)
358: /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */
359: { int bits;
360: short qprec;
361: register unit bitmask;
362: unit remainder[MAX_UNIT_PRECISION];
363: if (testeq(divisor,0))
364: return(-1); /* zero divisor means divide error */
365: mp_init0(remainder);
366: mp_init0(quotient);
367:
368: /* normalize and compute number of bits in quotient first */
369: bits = countbits(divisor) + RECIPMARGIN;
370: bitmask = bitmsk(bits); /* bitmask within a single unit */
371: qprec = bits2units(bits+1);
372: mp_setbit(remainder,(bits-RECIPMARGIN)-1);
373: /* rescale quotient to precision of divisor + RECIPMARGIN bits */
374: rescale(quotient,global_precision,qprec);
375: make_msbptr(quotient,qprec);
376:
377: while (bits--)
378: { mp_shift_left(remainder);
379: if (mp_compare(remainder,divisor) >= 0)
380: { mp_sub(remainder,divisor);
381: stuff_bit(quotient,bitmask);
382: }
383: bump_bitsniffer(quotient,bitmask);
384: }
385: mp_init0(remainder); /* burn sensitive data left on stack */
386: return(0);
387: } /* mp_recip */
388:
389:
390: int mp_div(register unitptr remainder,register unitptr quotient,
391: register unitptr dividend,register unitptr divisor)
392: /* Signed divide, either or both operands may be negative. */
393: { boolean dvdsign,dsign;
394: int status;
395: dvdsign = (mp_tstminus(dividend)!=0);
396: dsign = (mp_tstminus(divisor)!=0);
397: if (dvdsign) mp_neg(dividend);
398: if (dsign) mp_neg(divisor);
399: status = mp_udiv(remainder,quotient,dividend,divisor);
400: if (dvdsign) mp_neg(dividend); /* repair caller's dividend */
401: if (dsign) mp_neg(divisor); /* repair caller's divisor */
402: if (status<0) return(status); /* divide error? */
403: if (dvdsign) mp_neg(remainder);
404: if (dvdsign ^ dsign) mp_neg(quotient);
405: return(status);
406: } /* mp_div */
407:
408:
409: word16 mp_shortdiv(register unitptr quotient,
410: register unitptr dividend,register word16 divisor)
411: /* This function does a fast divide and mod on a multiprecision dividend
412: using a short integer divisor returning a short integer remainder.
413: This is an unsigned divide. It treats both operands as positive.
414: It is used mainly for faster printing of large numbers in base 10.
415: */
416: { int bits;
417: short dprec;
418: register unit bitmask;
419: register word16 remainder;
420: if (!divisor) /* if divisor == 0 */
421: return(-1); /* zero divisor means divide error */
422: remainder=0;
423: mp_init0(quotient);
424: /* normalize and compute number of bits in dividend first */
425: init_bitsniffer(dividend,bitmask,dprec,bits);
426: /* rescale quotient to same precision (dprec) as dividend */
427: rescale(quotient,global_precision,dprec);
428: make_msbptr(quotient,dprec);
429:
430: while (bits--)
431: { remainder <<= 1;
432: if (sniff_bit(dividend,bitmask))
433: remainder++;
434: if (remainder >= divisor)
435: { remainder -= divisor;
436: stuff_bit(quotient,bitmask);
437: }
438: bump_2bitsniffers(dividend,quotient,bitmask);
439: }
440: return(remainder);
441: } /* mp_shortdiv */
442:
443:
444: int mp_mod(register unitptr remainder,
445: register unitptr dividend,register unitptr divisor)
446: /* Unsigned divide, treats both operands as positive. */
447: { int bits;
448: short dprec;
449: register unit bitmask;
450: if (testeq(divisor,0))
451: return(-1); /* zero divisor means divide error */
452: mp_init0(remainder);
453: /* normalize and compute number of bits in dividend first */
454: init_bitsniffer(dividend,bitmask,dprec,bits);
455:
456: while (bits--)
457: { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
458: msub(remainder,divisor);
459: bump_bitsniffer(dividend,bitmask);
460: }
461: return(0);
462: } /* mp_mod */
463:
464:
465: word16 mp_shortmod(register unitptr dividend,register word16 divisor)
466: /* This function does a fast mod operation on a multprecision dividend
467: using a short integer modulus returning a short integer remainder.
468: This is an unsigned divide. It treats both operands as positive.
469: It is used mainly for fast sieve searches for large primes.
470: */
471: { int bits;
472: short dprec;
473: register unit bitmask;
474: register word16 remainder;
475: if (!divisor) /* if divisor == 0 */
476: return(-1); /* zero divisor means divide error */
477: remainder=0;
478: /* normalize and compute number of bits in dividend first */
479: init_bitsniffer(dividend,bitmask,dprec,bits);
480:
481: while (bits--)
482: { remainder <<= 1;
483: if (sniff_bit(dividend,bitmask))
484: remainder++;
485: if (remainder >= divisor) remainder -= divisor;
486: bump_bitsniffer(dividend,bitmask);
487: }
488: return(remainder);
489: } /* mp_shortmod */
490:
491:
492:
493: #ifdef COMB_MULT /* use faster "comb" multiply algorithm */
494: /* We are skipping this code because it has a bug... */
495:
496: int mp_mult(register unitptr prod,
497: register unitptr multiplicand, register unitptr multiplier)
498: /* Computes multiprecision prod = multiplicand * multiplier */
499: { /* Uses interleaved comb multiply algorithm.
500: This improved multiply more than twice as fast as a Russian
501: peasant multiply, because it does a lot fewer shifts.
502: Must have global_precision set to the size of the multiplicand
503: plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum
504: of the lengths of the multiplier and multiplicand.
505:
506: BUG ALERT: Unfortunately, this code has a bug. It fails for
507: some numbers. One such example:
508: x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
509: x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
510: 52C8 CDC7 AFB3 61C8 243C 741B
511: --which is obviously wrong. The answer should be:
512: x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
513: C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
514: We'll have to fix this some day. In the meantime, we'll
515: just have the compiler skip over this code.
516: */
517: int bits;
518: register unit bitmask;
519: unitptr product, mplier, temp;
520: short mprec,mprec2;
521: unit mplicand[MAX_UNIT_PRECISION];
522: mp_init(prod,0); /* better clear full width--double precision */
523: if (testeq(multiplicand,0))
524: return(0); /* zero multiplicand means zero product */
525:
526: mp_move(mplicand,multiplicand); /* save it from damage */
527:
528: normalize(multiplier,mprec);
529: if (!mprec)
530: return(0);
531:
532: make_lsbptr(multiplier,mprec);
533: bitmask = 1; /* start scan at LSB of multiplier */
534:
535: do /* UNITSIZE times */
536: { /* do for bits 0-15 */
537: product = prod;
538: mplier = multiplier;
539: mprec2 = mprec;
540: while (mprec2--) /* do for each word in multiplier */
541: {
542: if (sniff_bit(mplier,bitmask))
543: { if (mp_addc(product,multiplicand,0)) /* ripple carry */
544: { /* After 1st time thru, this is rarely encountered. */
545: temp = msbptr(product,global_precision);
546: pre_higherunit(temp);
547: /* temp now points to LSU of carry region. */
548: unmake_lsbptr(temp,global_precision);
549: mp_inc(temp);
550: } /* ripple carry */
551: }
552: pre_higherunit(mplier);
553: pre_higherunit(product);
554: }
555: if (!(bitmask <<= 1))
556: break;
557: mp_shift_left(multiplicand);
558:
559: } while (TRUE);
560:
561: mp_move(multiplicand,mplicand); /* recover */
562:
563: return(0); /* normal return */
564: } /* mp_mult */
565:
566: #endif /* COMB_MULT */
567:
568:
569: /* Because the "comb" multiply has a bug, we will use the slower
570: Russian peasant multiply instead. Fortunately, the mp_mult
571: function is not called from any time-critical code.
572: */
573:
574: int mp_mult(register unitptr prod,
575: register unitptr multiplicand,register unitptr multiplier)
576: /* Computes multiprecision prod = multiplicand * multiplier */
577: { /* Uses "Russian peasant" multiply algorithm. */
578: int bits;
579: register unit bitmask;
580: short mprec;
581: mp_init(prod,0);
582: if (testeq(multiplicand,0))
583: return(0); /* zero multiplicand means zero product */
584: /* normalize and compute number of bits in multiplier first */
585: init_bitsniffer(multiplier,bitmask,mprec,bits);
586:
587: while (bits--)
588: { mp_shift_left(prod);
589: if (sniff_bit(multiplier,bitmask))
590: mp_add(prod,multiplicand);
591: bump_bitsniffer(multiplier,bitmask);
592: }
593: return(0);
594: } /* mp_mult */
595:
596:
597:
598: /* mp_modmult computes a multiprecision multiply combined with a
599: modulo operation. This is the most time-critical function in
600: this multiprecision arithmetic library for performing modulo
601: exponentiation. We experimented with different versions of modmult,
602: depending on the machine architecture and performance requirements.
603: We will either use a Russian Peasant modmult (peasant_modmult),
604: Charlie Merritt's modmult (merritt_modmult) or Jimmy Upton's
605: modmult (upton_modmult). On machines with a hardware atomic
606: multiply instruction, Upton's modmult is fastest, which depends
607: on an assembly subroutine to speed up the hardware multiply logic.
608: If the machine lacks a fast hardware multiply, Merritt's modmult
609: is preferred, which doesn't call any assembly multiply routine.
610: We use the alias names mp_modmult, stage_modulus, and modmult_burn
611: for the corresponding true names, which depend on what flavor of
612: modmult we are using.
613:
614: Before making the first call to mp_modmult, you must set up the
615: modulus-dependant precomputated tables by calling stage_modulus.
616: After making all the calls to mp_modmult, you call modmult_burn to
617: erase the tables created by stage_modulus that were left in memory.
618: */
619:
620: #ifdef COUNTMULTS
621: /* "number of modmults" counters, used for performance studies. */
622: static unsigned int tally_modmults = 0;
623: static unsigned int tally_modsquares = 0;
624: #endif /* COUNTMULTS */
625:
626:
627: #ifdef PEASANT
628: /* Conventional Russian peasant multiply with modulo algorithm. */
629:
630: static unitptr modulus = 0; /* used only by mp_modmult */
631:
632: int stage_peasant_modulus(unitptr n)
633: /* Must pass modulus to stage_modulus before calling modmult.
634: Assumes that global_precision has already been adjusted to the
635: size of the modulus, plus SLOP_BITS.
636: */
637: { /* For this simple version of modmult, just copy unit pointer. */
638: modulus = n;
639: return(0); /* normal return */
640: } /* stage_peasant_modulus */
641:
642:
643: int peasant_modmult(register unitptr prod,
644: unitptr multiplicand,register unitptr multiplier)
645: { /* "Russian peasant" multiply algorithm, combined with a modulo
646: operation. This is a simple naive replacement for Merritt's
647: faster modmult algorithm. References global unitptr "modulus".
648: Computes: prod = (multiplicand*multiplier) mod modulus
649: WARNING: All the arguments must be less than the modulus!
650: */
651: int bits;
652: register unit bitmask;
653: short mprec;
654: mp_init(prod,0);
655: /* if (testeq(multiplicand,0))
656: return(0); */ /* zero multiplicand means zero product */
657: /* normalize and compute number of bits in multiplier first */
658: init_bitsniffer(multiplier,bitmask,mprec,bits);
659:
660: while (bits--)
661: { mp_shift_left(prod);
662: msub(prod,modulus); /* turns mult into modmult */
663: if (sniff_bit(multiplier,bitmask))
664: { mp_add(prod,multiplicand);
665: msub(prod,modulus); /* turns mult into modmult */
666: }
667: bump_bitsniffer(multiplier,bitmask);
668: }
669: return(0);
670: } /* peasant_modmult */
671:
672:
673: /* If we are using a version of mp_modmult that uses internal tables
674: in memory, we have to call modmult_burn() at the end of mp_modexp.
675: This is so that no sensitive data is left in memory after the program
676: exits. The Russian peasant method doesn't use any such tables.
677: */
678: static void peasant_burn(void)
679: /* Alias for modmult_burn, called only from mp_modexp(). Destroys
680: internal modmult tables. This version does nothing because no
681: tables are used by the Russian peasant modmult. */
682: { } /* peasant_burn */
683:
684: #endif /* PEASANT */
685:
686:
687: #ifdef MERRITT
688: /*=========================================================================*/
689: /*
690: This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
691: Also refined by Zhahai Stewart to reduce the number of subtracts.
692: It performs a multiply combined with a modulo operation, without
693: going into "double precision". It is faster than the Russian peasant
694: method, and still works well on machines that lack a fast hardware
695: multiply instruction.
696: */
697:
698: /* The following support functions, macros, and data structures
699: are used only by Merritt's modmult algorithm... */
700:
701: static void mp_lshift_unit(register unitptr r1)
702: /* Shift r1 1 whole unit to the left. Used only by modmult function. */
703: { register short precision;
704: register unitptr r2;
705: precision = global_precision;
706: make_msbptr(r1,precision);
707: r2 = r1;
708: while (--precision)
709: *post_lowerunit(r1) = *pre_lowerunit(r2);
710: *r1 = 0;
711: } /* mp_lshift_unit */
712:
713:
714: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
715: static unit moduli_buf[UNITSIZE][MAX_UNIT_PRECISION] = {0};
716: static unitptr moduli[UNITSIZE+1] = /* contains pointers into moduli_buf */
717: { 0
718: ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0],
719: &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0], &moduli_buf[ 7][0]
720: #ifndef UNIT8
721: ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0],
722: &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0], &moduli_buf[15][0]
723: #ifndef UNIT16 /* and not UNIT8 */
724: ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0],
725: &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0],
726: &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0],
727: &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0], &moduli_buf[31][0]
728: #endif /* UNIT16 and UNIT8 not defined */
729: #endif /* UNIT8 not defined */
730: };
731:
732: /* To optimize msubs, need following 2 unit arrays, each filled
733: with the most significant unit and next-to-most significant unit
734: of the preshifted images of the modulus. */
735: static unit msu_moduli[UNITSIZE+1] = {0}; /* most signif. unit */
736: static unit nmsu_moduli[UNITSIZE+1] = {0}; /* next-most signif. unit */
737:
738: /* mpdbuf contains preshifted images of the multiplicand, mod n.
739: It is used only by mp_modmult. It could be staticly declared
740: inside of mp_modmult, but we put it outside mp_modmult so that
741: it can be wiped clean by modmult_burn(), which is called at the
742: end of mp_modexp. This is so that no sensitive data is left in
743: memory after the program exits.
744: */
745: static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
746:
747:
748: static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
749: /* Computes UNITSIZE images of r, each shifted left 1 more bit.
750: Used only by modmult function.
751: */
752: { short int i;
753: images[0] = r; /* no need to move the first image, just copy ptr */
754: for (i=1; i<UNITSIZE; i++)
755: { mp_move(images[i],images[i-1]);
756: mp_shift_left(images[i]);
757: }
758: } /* stage_mp_images */
759:
760:
761: int stage_merritt_modulus(unitptr n)
762: /* Computes UNITSIZE+1 images of modulus, each shifted left 1 more bit.
763: Before calling mp_modmult, you must first stage the moduli images by
764: calling stage_modulus. n is the pointer to the modulus.
765: Assumes that global_precision has already been adjusted to the
766: size of the modulus, plus SLOP_BITS.
767: */
768: { short int i;
769: unitptr msu; /* ptr to most significant unit, for faster msubs */
770: moduli[0] = n; /* no need to move the first image, just copy ptr */
771:
772: /* used by optimized msubs macro... */
773: msu = msbptr(n,global_precision); /* needed by msubs */
774: msu_moduli[0] = *post_lowerunit(msu);
775: nmsu_moduli[0] = *msu;
776:
777: for (i=1; i<UNITSIZE+1; i++)
778: { mp_move(moduli[i],moduli[i-1]);
779: mp_shift_left(moduli[i]);
780:
781: /* used by optimized msubs macro... */
782: msu = msbptr(moduli[i],global_precision); /* needed by msubs */
783: msu_moduli[i] = *post_lowerunit(msu); /* for faster msubs */
784: nmsu_moduli[i] = *msu;
785: }
786: return(0); /* normal return */
787: } /* stage_merritt_modulus */
788:
789:
790: /* The following macros, sniffadd and msubs, are used by modmult... */
791: #define sniffadd(i) if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i])
792:
793: /* Unoptimized msubs macro (msubs0) follows... */
794: /* #define msubs0(i) msub(prod,moduli[i])
795: */
796:
797: /* To optimize msubs, compare the most significant units of the
798: product and the shifted modulus before deciding to call the full
799: mp_compare routine. Better still, compare the upper two units
800: before deciding to call mp_compare.
801: Optimization of msubs relies heavily on standard C left-to-right
802: parsimonious evaluation of logical expressions.
803: */
804:
805: /* Partially-optimized msubs macro (msubs1) follows... */
806: /* #define msubs1(i) if ( \
807: ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
808: (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
809: ) mp_sub(prod,moduli[i])
810: */
811:
812: /* Fully-optimized msubs macro (msubs2) follows... */
813: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
814: (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
815: (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
816: mp_sub(prod,moduli[i])
817:
818:
819: int merritt_modmult(register unitptr prod,
820: unitptr multiplicand,register unitptr multiplier)
821: /* Performs combined multiply/modulo operation.
822: Computes: prod = (multiplicand*multiplier) mod modulus
823: WARNING: All the arguments must be less than the modulus!
824: Assumes the modulus has been predefined by first calling
825: stage_modulus.
826: */
827: {
828: /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
829: register signedunit p_m;
830: register unitptr msu_prod; /* ptr to most significant unit of product */
831: register unitptr nmsu_prod; /* next-most signif. unit of product */
832: short mprec; /* precision of multiplier, in units */
833: /* Array mpd contains a list of pointers to preshifted images of
834: the multiplicand: */
835: static unitptr mpd[UNITSIZE] =
836: { 0, &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
837: &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
838: #ifndef UNIT8
839: ,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
840: &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
841: #ifndef UNIT16 /* and not UNIT8 */
842: ,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
843: &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
844: &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
845: &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
846: #endif /* UNIT16 and UNIT8 not defined */
847: #endif /* UNIT8 not defined */
848: };
849:
850: /* Compute preshifted images of multiplicand, mod n: */
851: stage_mp_images(mpd,multiplicand);
852:
853: /* To optimize msubs, set up msu_prod and nmsu_prod: */
854: msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
855: nmsu_prod = msu_prod;
856: post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
857:
858: /* To understand this algorithm, it would be helpful to first
859: study the conventional Russian peasant modmult algorithm.
860: This one does about the same thing as Russian peasant, but
861: is organized differently to save some steps. It loops
862: through the multiplier a word (unit) at a time, instead of
863: a bit at a time. It word-shifts the product instead of
864: bit-shifting it, so it should be faster. It also does about
865: half as many subtracts as Russian peasant.
866: */
867:
868: mp_init(prod,0); /* Initialize product to 0. */
869:
870: /* The way mp_modmult is actually used in cryptographic
871: applications, there will NEVER be a zero multiplier or
872: multiplicand. So there is no need to optimize for that
873: condition.
874: */
875: /* if (testeq(multiplicand,0))
876: return(0); */ /* zero multiplicand means zero product */
877: /* Normalize and compute number of units in multiplier first: */
878: normalize(multiplier,mprec);
879: if (mprec==0) /* if precision of multiplier is 0 */
880: return(0); /* zero multiplier means zero product */
881: make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
882:
883: while (mprec--) /* Loop for the number of units in the multiplier */
884: {
885: /* Shift the product one whole unit to the left.
886: This is part of the multiply phase of modmult.
887: */
888:
889: mp_lshift_unit(prod);
890:
891: /* Sniff each bit in the current unit of the multiplier,
892: and conditionally add the the corresponding preshifted
893: image of the multiplicand to the product.
894: This is also part of the multiply phase of modmult.
895:
896: The following loop is unrolled for speed, using macros...
897:
898: for (i=UNITSIZE-1; i>=0; i--)
899: if (*multiplier & power_of_2(i))
900: mp_add(prod,mpd[i]);
901: */
902: #ifndef UNIT8
903: #ifndef UNIT16 /* and not UNIT8 */
904: sniffadd(31);
905: sniffadd(30);
906: sniffadd(29);
907: sniffadd(28);
908: sniffadd(27);
909: sniffadd(26);
910: sniffadd(25);
911: sniffadd(24);
912: sniffadd(23);
913: sniffadd(22);
914: sniffadd(21);
915: sniffadd(20);
916: sniffadd(19);
917: sniffadd(18);
918: sniffadd(17);
919: sniffadd(16);
920: #endif /* not UNIT16 and not UNIT8 */
921: sniffadd(15);
922: sniffadd(14);
923: sniffadd(13);
924: sniffadd(12);
925: sniffadd(11);
926: sniffadd(10);
927: sniffadd(9);
928: sniffadd(8);
929: #endif /* not UNIT8 */
930: sniffadd(7);
931: sniffadd(6);
932: sniffadd(5);
933: sniffadd(4);
934: sniffadd(3);
935: sniffadd(2);
936: sniffadd(1);
937: sniffadd(0);
938:
939: /* The product may have grown by as many as UNITSIZE+1
940: bits. That's why we have global_precision set to the
941: size of the modulus plus UNITSIZE+1 slop bits.
942: Now reduce the product back down by conditionally
943: subtracting the UNITSIZE+1 preshifted images of the
944: modulus. This is the modulo reduction phase of modmult.
945:
946: The following loop is unrolled for speed, using macros...
947:
948: for (i=UNITSIZE; i>=0; i--)
949: if (mp_compare(prod,moduli[i]) >= 0)
950: mp_sub(prod,moduli[i]);
951: */
952: #ifndef UNIT8
953: #ifndef UNIT16 /* and not UNIT8 */
954: msubs(32);
955: msubs(31);
956: msubs(30);
957: msubs(29);
958: msubs(28);
959: msubs(27);
960: msubs(26);
961: msubs(25);
962: msubs(24);
963: msubs(23);
964: msubs(22);
965: msubs(21);
966: msubs(20);
967: msubs(19);
968: msubs(18);
969: msubs(17);
970: #endif /* not UNIT16 and not UNIT8 */
971: msubs(16);
972: msubs(15);
973: msubs(14);
974: msubs(13);
975: msubs(12);
976: msubs(11);
977: msubs(10);
978: msubs(9);
979: #endif /* not UNIT8 */
980: msubs(8);
981: msubs(7);
982: msubs(6);
983: msubs(5);
984: msubs(4);
985: msubs(3);
986: msubs(2);
987: msubs(1);
988: msubs(0);
989:
990: /* Bump pointer to next lower unit of multiplier: */
991: post_lowerunit(multiplier);
992:
993: } /* Loop for the number of units in the multiplier */
994:
995: return(0); /* normal return */
996:
997: } /* merritt_modmult */
998:
999:
1000: #undef msubs
1001: #undef sniffadd
1002:
1003:
1004: /* Merritt's mp_modmult function leaves some internal tables in memory,
1005: so we have to call modmult_burn() at the end of mp_modexp.
1006: This is so that no cryptographically sensitive data is left in memory
1007: after the program exits.
1008: */
1009: static void merritt_burn(void)
1010: /* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
1011: { unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
1012: unitfill0(&(moduli_buf[0][0]),(UNITSIZE)*MAX_UNIT_PRECISION);
1013: unitfill0(msu_moduli,UNITSIZE+1);
1014: unitfill0(nmsu_moduli,UNITSIZE+1);
1015: } /* merritt_burn() */
1016:
1017: /******* end of Merritt's MODMULT stuff. *******/
1018: /*=========================================================================*/
1019: #endif /* MERRITT */
1020:
1021:
1022: #ifdef UPTON /* using Jimmy Upton's modmult algorithm */
1023:
1024: /* Jimmy Upton's multiprecision modmult algorithm in C.
1025: Performs a multiply combined with a modulo operation.
1026:
1027: The following support functions and data structures
1028: are used only by Upton's modmult algorithm...
1029: */
1030:
1031: /* The MULTUNIT word is the biggest word size for the native atomic
1032: multiply instruction. It may or may not be smaller than UNITSIZE.
1033: Many CPU's have 16-by-16-bit multiplies, yielding a 32-bit product.
1034: In that case, MULTUINIT should be a 16-bit word, even if the rest of
1035: the multiprecision arithmetic functions assume a 32-bit UNIT word.
1036: */
1037: typedef unsigned short MULTUNIT;
1038: #define MULTUNITSIZE (8*sizeof(MULTUNIT)) /* size in bits */
1039:
1040: static short unit_prec; /* global_precision expressed in MULTUNITs */
1041:
1042:
1043: /* Define MPORTABLE if there is no assembly version of the mp_smul
1044: function available. An assembly version is much faster.
1045: Note that since the SPARC CPU has no hardware integer multiply
1046: instruction, there is not that much advantage in having an
1047: assembly version of mp_smul on that machine. It might be faster
1048: to use Merritt's modmult instead of Upton's modmult on the SPARC.
1049: */
1050: #ifdef MSDOS /* we do have an assembly version available on the 8086 */
1051: #undef MPORTABLE
1052: #endif
1053:
1054: #ifdef MPORTABLE /* use slow portable C version instead of assembly */
1055:
1056: /*
1057: Multiply the single-word multiplier times the multiprecision integer
1058: in multiplicand, accumulating result in prod. The resulting
1059: multiprecision prod will be 1 word longer than the multiplicand.
1060: multiplicand is unit_prec words long. We add into prod, so caller
1061: should zero it out first. For best results, this time-critical
1062: function should be implemented in assembly.
1063: NOTE: Unlike other functions in the multiprecision arithmetic
1064: library, both multiplicand and prod are pointing at the LSB,
1065: regardless of byte order of the machine. On an 80x86, this makes
1066: no difference. But if this assembly function is implemented
1067: on a 680x0, it becomes important.
1068: */
1069: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
1070: {
1071: short i;
1072: unsigned long p, carry;
1073:
1074: carry = 0;
1075: for (i=0; i<unit_prec; ++i)
1076: { p = (unsigned long)multiplier * *post_higherunit(multiplicand);
1077: p += *prod + carry;
1078: *post_higherunit(prod) = (MULTUNIT)p;
1079: carry = p >> MULTUNITSIZE;
1080: }
1081: /* We know that the high-order word of prod will always be 0 */
1082: *prod = (MULTUNIT)carry; /* copy carry to empty high word of prod */
1083: } /* mp_smul */
1084:
1085: #else /* not MPORTABLE-- use assembly routine */
1086:
1087: #define mp_smul P_SMUL /* use external assembly routine */
1088:
1089: #endif /* MPORTABLE */
1090:
1091: #ifdef VMS
1092:
1093: #define mp_dmul p_dmul /* use external assembly routine */
1094:
1095: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
1096:
1097: #else /* VMS */
1098:
1099: /* mp_dmul is a double-precision multiply multiplicand times multiplier,
1100: result into prod. prod must be pointing at a "double precision"
1101: buffer. E.g. If global_precision is 10 words, prod must be
1102: pointing at a 20-word buffer.
1103: */
1104: static void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
1105: {
1106: register int i;
1107: register MULTUNIT *p_multiplicand, *p_multiplier;
1108: register MULTUNIT *prodp;
1109:
1110: unitfill0(prod,global_precision*2); /* Pre-zero prod */
1111: /* Calculate precision in units of MULTUNIT */
1112: unit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
1113: p_multiplicand = (MULTUNIT *)multiplicand;
1114: p_multiplier = (MULTUNIT *)multiplier;
1115: prodp = (MULTUNIT *)prod;
1116: make_lsbptr(p_multiplicand,unit_prec);
1117: make_lsbptr(p_multiplier,unit_prec);
1118: make_lsbptr(prodp,unit_prec*2);
1119: /* Multiply multiplicand by each word in multiplier, accumulating prod: */
1120: for (i=0; i<unit_prec; ++i)
1121: mp_smul (post_higherunit(prodp),
1122: p_multiplicand, *post_higherunit(p_multiplier));
1123: } /* mp_dmul */
1124:
1125: #endif /* VMS */
1126:
1127: /* These scratchpad arrays are used only by upton_modmult (mp_modmult).
1128: Some of them could be staticly declared inside of mp_modmult, but we
1129: put them outside mp_modmult so that they can be wiped clean by
1130: modmult_burn(), which is called at the end of mp_modexp. This is
1131: so that no sensitive data is left in memory after the program exits.
1132: */
1133:
1134: #ifdef VAXC
1135: /*
1136: * Operations on arrays are sped up on a VAX if the data is quadword (64-bit)
1137: * aligned. This is because many VAXen use a minimum of 64-bit data paths to
1138: * cache and memory. This technique is VAX C private, hence the conditional.
1139: */
1140: static unit _align( quadword ) modulus[MAX_UNIT_PRECISION];
1141: static unit _align( quadword ) reciprocal[MAX_UNIT_PRECISION];
1142: static unit _align( quadword ) dhi[MAX_UNIT_PRECISION];
1143: static unit _align( quadword ) d_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1144: static unit _align( quadword ) e_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1145: static unit _align( quadword ) f_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1146:
1147: #else /* VAXC */
1148:
1149: static unit modulus[MAX_UNIT_PRECISION];
1150: static unit reciprocal[MAX_UNIT_PRECISION];
1151: static unit dhi[MAX_UNIT_PRECISION];
1152: static unit d_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1153: static unit e_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1154: static unit f_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1155:
1156: #endif /* VAXC */
1157:
1158: static short nbits;
1159: static short nbitsDivUNITSIZE;
1160: static short nbitsModUNITSIZE;
1161:
1162: /* stage_upton_modulus() is aliased to stage_modulus().
1163: Prepare for an Upton modmult. Calculate the reciprocal of modulus,
1164: and save both. Note that reciprocal will have one more bit than
1165: modulus.
1166: Assumes that global_precision has already been adjusted to the
1167: size of the modulus, plus SLOP_BITS.
1168: */
1169: int stage_upton_modulus(unitptr n)
1170: {
1171: mp_move(modulus, n);
1172: mp_recip(reciprocal, modulus);
1173: nbits = countbits(modulus);
1174: nbitsDivUNITSIZE = nbits / UNITSIZE;
1175: nbitsModUNITSIZE = nbits % UNITSIZE;
1176: return(0); /* normal return */
1177: } /* stage_upton_modulus */
1178:
1179:
1180: /* Upton's algorithm performs a multiply combined with a modulo operation.
1181: Computes: prod = (multiplicand*multiplier) mod modulus
1182: WARNING: All the arguments must be less than the modulus!
1183: References global unitptr modulus and reciprocal.
1184: The reciprocal of modulus is 1 bit longer than the modulus.
1185: upton_modmult() is aliased to mp_modmult().
1186: */
1187: int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
1188: {
1189: unitptr d = d_data;
1190: unitptr d1 = d_data;
1191: unitptr e = e_data;
1192: unitptr f = f_data;
1193: short orig_precision, dprec;
1194: short i;
1195:
1196: orig_precision = global_precision;
1197: mp_dmul (d,multiplicand,multiplier);
1198:
1199: /* Throw off low nbits of d */
1200: #ifndef HIGHFIRST
1201: d1 = d + nbitsDivUNITSIZE;
1202: #else
1203: d1 = d + orig_precision - nbitsDivUNITSIZE;
1204: #endif
1205: mp_move(dhi, d1); /* Don't screw up d, we need it later */
1206: mp_shift_right_bits(dhi, nbitsModUNITSIZE);
1207:
1208: mp_dmul (e,dhi,reciprocal); /* Note - reciprocal has nbits+1 bits */
1209:
1210: #ifndef HIGHFIRST
1211: e += nbitsDivUNITSIZE;
1212: #else
1213: e += orig_precision - nbitsDivUNITSIZE;
1214: #endif
1215: mp_shift_right_bits(e, nbitsModUNITSIZE);
1216:
1217: mp_dmul (f,e,modulus);
1218:
1219: /* Now for the only double-precision call to mpilib: */
1220: set_precision (orig_precision * 2);
1221: mp_sub (d,f);
1222:
1223: /* d's precision should be <= orig_precision */
1224: rescale (d, orig_precision*2, orig_precision);
1225: set_precision (orig_precision);
1226:
1227: /* Should never have to do this final subtract more than twice: */
1228: while (mp_compare(d,modulus) > 0)
1229: mp_sub (d,modulus);
1230:
1231: mp_move(prod,d);
1232: return(0); /* normal return */
1233: } /* upton_modmult */
1234:
1235:
1236: /* Upton's mp_modmult function leaves some internal arrays in memory,
1237: so we have to call modmult_burn() at the end of mp_modexp.
1238: This is so that no cryptographically sensitive data is left in memory
1239: after the program exits.
1240: upton_burn() is aliased to modmult_burn().
1241: */
1242: void upton_burn(void)
1243: {
1244: unitfill0(modulus,MAX_UNIT_PRECISION);
1245: unitfill0(reciprocal,MAX_UNIT_PRECISION);
1246: unitfill0(dhi,MAX_UNIT_PRECISION);
1247: unitfill0(d_data,MAX_UNIT_PRECISION*2);
1248: unitfill0(e_data,MAX_UNIT_PRECISION*2);
1249: unitfill0(f_data,MAX_UNIT_PRECISION*2);
1250: nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
1251: } /* upton_burn */
1252:
1253: #endif /* UPTON */
1254:
1255:
1256: int countbits(unitptr r)
1257: /* Returns number of significant bits in r */
1258: { int bits;
1259: short prec;
1260: register unit bitmask;
1261: init_bitsniffer(r,bitmask,prec,bits);
1262: return(bits);
1263: } /* countbits */
1264:
1265:
1266: char *copyright_notice(void)
1267: /* force linker to include copyright notice in the executable object image. */
1268: { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */
1269:
1270:
1271: int mp_modexp(register unitptr expout,register unitptr expin,
1272: register unitptr exponent,register unitptr modulus)
1273: { /* Russian peasant combined exponentiation/modulo algorithm.
1274: Calls modmult instead of mult.
1275: Computes: expout = (expin**exponent) mod modulus
1276: WARNING: All the arguments must be less than the modulus!
1277: */
1278: int bits;
1279: short oldprecision;
1280: register unit bitmask;
1281: unit product[MAX_UNIT_PRECISION];
1282: short eprec;
1283:
1284: #ifdef COUNTMULTS
1285: tally_modmults = 0; /* clear "number of modmults" counter */
1286: tally_modsquares = 0; /* clear "number of modsquares" counter */
1287: #endif /* COUNTMULTS */
1288: mp_init(expout,1);
1289: if (testeq(exponent,0))
1290: { if (testeq(expin,0))
1291: return(-1); /* 0 to the 0th power means return error */
1292: return(0); /* otherwise, zero exponent means expout is 1 */
1293: }
1294: if (testeq(modulus,0))
1295: return(-2); /* zero modulus means error */
1296: #if SLOP_BITS > 0 /* if there's room for sign bits */
1297: if (mp_tstminus(modulus))
1298: return(-2); /* negative modulus means error */
1299: #endif /* SLOP_BITS > 0 */
1300: if (mp_compare(expin,modulus) >= 0)
1301: return(-3); /* if expin >= modulus, return error */
1302: if (mp_compare(exponent,modulus) >= 0)
1303: return(-4); /* if exponent >= modulus, return error */
1304:
1305: oldprecision = global_precision; /* save global_precision */
1306: /* set smallest optimum precision for this modulus */
1307: set_precision(bits2units(countbits(modulus)+SLOP_BITS));
1308: /* rescale all these registers to global_precision we just defined */
1309: rescale(modulus,oldprecision,global_precision);
1310: rescale(expin,oldprecision,global_precision);
1311: rescale(exponent,oldprecision,global_precision);
1312: rescale(expout,oldprecision,global_precision);
1313:
1314: if (stage_modulus(modulus))
1315: { set_precision(oldprecision); /* restore original precision */
1316: return(-5); /* unstageable modulus (STEWART algorithm) */
1317: }
1318:
1319: /* normalize and compute number of bits in exponent first */
1320: init_bitsniffer(exponent,bitmask,eprec,bits);
1321:
1322: /* We can "optimize out" the first modsquare and modmult: */
1323: bits--; /* We know for sure at this point that bits>0 */
1324: mp_move(expout,expin); /* expout = (1*1)*expin; */
1325: bump_bitsniffer(exponent,bitmask);
1326:
1327: while (bits--)
1328: {
1329: poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
1330: #ifdef COUNTMULTS
1331: tally_modsquares++; /* bump "number of modsquares" counter */
1332: #endif /* COUNTMULTS */
1333: mp_modsquare(product,expout);
1334: mp_move(expout,product);
1335: if (sniff_bit(exponent,bitmask))
1336: { mp_modmult(product,expout,expin);
1337: mp_move(expout,product);
1338: #ifdef COUNTMULTS
1339: tally_modmults++; /* bump "number of modmults" counter */
1340: #endif /* COUNTMULTS */
1341: }
1342: bump_bitsniffer(exponent,bitmask);
1343: } /* while bits-- */
1344: mp_burn(product); /* burn the evidence on the stack */
1345: modmult_burn(); /* ask mp_modmult to also burn its own evidence */
1346:
1347: #ifdef COUNTMULTS /* diagnostic analysis */
1348: { long atomic_mults;
1349: unsigned int unitcount,totalmults;
1350: unitcount = bits2units(countbits(modulus));
1351: /* calculation assumes modsquare takes as long as a modmult: */
1352: atomic_mults = (long) tally_modmults * (unitcount * unitcount);
1353: atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
1354: printf("%ld atomic mults for ",atomic_mults);
1355: printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
1356: tally_modsquares,tally_modmults,
1357: tally_modsquares+tally_modmults,
1358: countbits(modulus), unitcount);
1359: }
1360: #endif /* COUNTMULTS */
1361:
1362: set_precision(oldprecision); /* restore original precision */
1363:
1364: /* Do an explicit reference to the copyright notice so that the linker
1365: will be forced to include it in the executable object image... */
1366: copyright_notice(); /* has no real effect at run time */
1367:
1368: return(0); /* normal return */
1369: } /* mp_modexp */
1370:
1371:
1372:
1373:
1374: /*********************************************************************
1375: RSA-specific routines follow. These are the only functions that
1376: are specific to the RSA public key cryptosystem. The other
1377: multiprecision integer math functions may be used for non-RSA
1378: applications.
1379:
1380: The RSA public key cryptosystem is patented by the Massachusetts
1381: Institute of Technology (U.S. patent #4,405,829). This patent does
1382: not apply outside the USA. Public Key Partners (PKP) holds the
1383: exclusive commercial license to sell and sub-license the RSA public
1384: key cryptosystem. The author of this software implementation of the
1385: RSA algorithm is providing this software for educational use only.
1386: Licensing this algorithm from PKP is the responsibility of you, the
1387: user, not Philip Zimmermann, the author of this software. The author
1388: assumes no liability for any breach of patent law resulting from the
1389: unlicensed use of this software by the user.
1390: *********************************************************************/
1391:
1392:
1393: int rsa_decrypt(unitptr M, unitptr C,
1394: unitptr d, unitptr p, unitptr q, unitptr u)
1395: /* Uses Quisquater's Chinese Remainder Theorem shortcut
1396: for RSA decryption.
1397: M is the output plaintext message.
1398: C is the input ciphertext message.
1399: d is the secret decryption exponent.
1400: p and q are the secret prime factors of n.
1401: u is the multiplicative inverse of p, mod q.
1402: Note that u is precomputed on the assumption that p<q.
1403: n, the common modulus, is not used here because of the
1404: Chinese Remainder Theorem shortcut.
1405: */
1406: {
1407: unit p2[MAX_UNIT_PRECISION];
1408: unit q2[MAX_UNIT_PRECISION];
1409: unit temp1[MAX_UNIT_PRECISION];
1410: unit temp2[MAX_UNIT_PRECISION];
1411: int status;
1412:
1413: mp_init(M,1); /* initialize result in case of error */
1414:
1415: if (mp_compare(p,q) >= 0) /* ensure that p<q */
1416: { /* swap the pointers p and q */
1417: unitptr t;
1418: t = p; p = q; q = t;
1419: }
1420:
1421: /* Rather than decrypting by computing modexp with full mod n
1422: precision, compute a shorter modexp with mod p precision... */
1423:
1424: /* p2 = [ (C mod p)**( d mod (p-1) ) ] mod p */
1425: mp_move(temp1,p);
1426: mp_dec(temp1); /* temp1 = p-1 */
1427: mp_mod(temp2,d,temp1); /* temp2 = d mod (p-1) ) */
1428: mp_mod(temp1,C,p); /* temp1 = C mod p */
1429: status = mp_modexp(p2,temp1,temp2,p);
1430: if (status < 0) /* mp_modexp returned an error. */
1431: return(status); /* error return */
1432:
1433: /* Now compute a short modexp with mod q precision... */
1434:
1435: /* q2 = [ (C mod q)**( d mod (q-1) ) ] mod q */
1436: mp_move(temp1,q);
1437: mp_dec(temp1); /* temp1 = q-1 */
1438: mp_mod(temp2,d,temp1); /* temp2 = d mod (q-1) */
1439: mp_mod(temp1,C,q); /* temp1 = C mod q */
1440: status = mp_modexp(q2,temp1,temp2,q);
1441: if (status < 0) /* mp_modexp returned an error. */
1442: return(status); /* error return */
1443:
1444: /* Now use the multiplicative inverse u to glue together the
1445: two halves, saving a lot of time by avoiding a full mod n
1446: exponentiation. At key generation time, u was computed
1447: with the secret key as the multiplicative inverse of
1448: p, mod q such that: (p*u) mod q = 1, assuming p<q.
1449: */
1450: if (mp_compare(p2,q2) == 0) /* only happens if C<p */
1451: mp_move(M,p2);
1452: else
1453: { /* q2 = q2 - p2; if q2<0 then q2 = q2 + q */
1454: if (mp_sub(q2,p2)) /* if the result went negative... */
1455: mp_add(q2,q); /* add q to q2 */
1456:
1457: /* M = p2 + ( p * [(q2*u) mod q] ) */
1458: mp_mult(temp1,q2,u); /* temp1 = q2*u */
1459: mp_mod(temp2,temp1,q); /* temp2 = temp1 mod q */
1460: mp_mult(temp1,p,temp2); /* temp1 = p * temp2 */
1461: mp_add(temp1,p2); /* temp1 = temp1 + p2 */
1462: mp_move(M,temp1); /* M = temp1 */
1463: }
1464:
1465: mp_burn(p2); /* burn the evidence on the stack...*/
1466: mp_burn(q2);
1467: mp_burn(temp1);
1468: mp_burn(temp2);
1469: return(0); /* normal return */
1470: } /* rsa_decrypt */
1471:
1472:
1473: /****************** end of MPI library ****************************/
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.