|
|
1.1.1.2 root 1: /* C source code for multiprecision arithmetic library routines.
2: Implemented Nov 86 by Philip Zimmermann
3: Last revised 27 Nov 91 by PRZ
4:
5: Boulder Software Engineering
6: 3021 Eleventh Street
7: Boulder, CO 80304
8: (303) 541-0140
9:
10: (c) Copyright 1986-92 by Philip Zimmermann. All rights reserved.
11: The author assumes no liability for damages resulting from the use
12: of this software, even if the damage results from defects in this
13: software. No warranty is expressed or implied. The use of this
14: cryptographic software for developing weapon systems is expressly
15: forbidden.
16:
17: These routines implement all of the multiprecision arithmetic
18: necessary for number-theoretic cryptographic algorithms such as
19: ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
20: composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
21: key cryptography.
22:
23: Although originally developed in Microsoft C for the IBM PC, this code
24: contains few machine dependencies. It assumes 2's complement
25: arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines,
26: lowbyte-highbyte order or highbyte-lowbyte order. This version
27: has been converted to ANSI C.
28:
29:
30: The internal representation for these extended precision integer
31: "registers" is an array of "units". A unit is a machine word, which
32: is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
33: unsigned integer, depending on the machine's word size. For example,
34: an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic
35: on these huge precision integers, we pass pointers to these unit
36: arrays to various subroutines. A pointer to an array of units is of
37: type unitptr. This is a pointer to a huge integer "register".
38:
39: When calling a subroutine, we always pass a pointer to the BEGINNING
40: of the array of units, regardless of the byte order of the machine.
41: On a lowbyte-first machine, such as the Intel 80x86, this unitptr
42: points to the LEAST significant unit, and the array of units increases
43: significance to the right. On a highbyte-first machine, such as the
44: Motorola 680x0, this unitptr points to the MOST significant unit, and
45: the array of units decreases significance to the right.
46:
47: Modified 8 Apr 92 - HAJK
48: Implement new VAX/VMS primitive support.
49:
50: Modified 30 Sep 92 -Castor Fu <[email protected]>
51: Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long)
52:
53: Modified 28 Nov 92 - Thad Smith
54: Added Smith modmult, generalized non-portable support.
55: */
56:
57: /* #define COUNTMULTS */ /* count modmults for performance studies */
58:
59: #ifdef DEBUG
60: #ifdef MSDOS
1.1.1.3 root 61: #ifdef __GO32__ /* DJGPP */
62: #include <pc.h>
63: #else
1.1.1.2 root 64: #include <conio.h>
1.1.1.3 root 65: #endif /* __GO32__ */
1.1.1.2 root 66: #define poll_for_break() {while (kbhit()) getch();}
67: #endif /* MSDOS */
68: #endif /* DEBUG */
69:
70: #ifndef poll_for_break
71: #define poll_for_break() /* stub */
72: #endif
73:
74: #include "mpilib.h"
75:
76: #ifdef mp_smula
77: #ifdef mp_smul
78: Error: Both mp_smula and mp_smul cannot be defined.
79: #else
80: #define mp_smul mp_smula
81: #endif
82: #endif
83:
84: /* set macros for MULTUNIT */
85: #ifdef MUNIT8
86: #define MULTUNITSIZE 8
87: typedef unsigned char MULTUNIT;
88: #ifdef UNIT8
89: #define MULTUNIT_SIZE_SAME
90: #endif
91: #else /* not MUNIT8 */
92: #ifdef MUNIT32
93: #define MULTUNITSIZE 32
94: typedef unsigned long MULTUNIT;
95: #ifdef UNIT32
96: #define MULTUNIT_SIZE_SAME
97: #else
98: /* #error is not portable, this has the same effect */
99: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
100: #endif
101: #else /* assume MUNIT16 */
102: #define MULTUNITSIZE 16
103: typedef unsigned short MULTUNIT;
104: #ifdef UNIT16
105: #define MULTUNIT_SIZE_SAME
106: #endif /* UNIT16 */
107: #ifdef UNIT8
108: #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
109: #endif /* UNIT8 */
110: #endif /* MUNIT32 */
111: #endif /* MUNIT8 */
112:
113: #define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */
114: #define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1))
115: #define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1))
116: #define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE)
117:
118:
119: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier);
120: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
121:
122: short global_precision = 0; /* units of precision for all routines */
123: /* global_precision is the unit precision last set by set_precision.
124: Initially, set_precision() should be called to define global_precision
125: before using any of these other multiprecision library routines.
126: i.e.: set_precision(MAX_UNIT_PRECISION);
127: */
128:
129: /*************** multiprecision library primitives ****************/
130: /* The following portable C primitives should be recoded in assembly.
131: The entry point name should be defined, in "mpilib.h" to the external
132: entry point name. If undefined, the C version will be used.
133: */
134:
135: typedef unsigned long int ulint;
136:
137: #ifndef mp_addc
138: boolean mp_addc
139: (register unitptr r1,register unitptr r2,register boolean carry)
140: /* multiprecision add with carry r2 to r1, result in r1 */
141: /* carry is incoming carry flag-- value should be 0 or 1 */
142: { register unit x;
143: short precision; /* number of units to add */
144: precision = global_precision;
145: make_lsbptr(r1,precision);
146: make_lsbptr(r2,precision);
147: while (precision--)
148: {
149: if (carry)
150: { x = *r1 + *r2 + 1;
151: carry = (*r2 >= (unit)(~ *r1));
152: } else
153: { x = *r1 + *r2;
154: carry = (x < *r1) ;
155: }
156: post_higherunit(r2);
157: *post_higherunit(r1) = x;
158: }
159: return(carry); /* return the final carry flag bit */
160: } /* mp_addc */
161: #endif /* mp_addc */
162:
163: #ifndef mp_subb
164: boolean mp_subb
165: (register unitptr r1,register unitptr r2,register boolean borrow)
166: /* multiprecision subtract with borrow, r2 from r1, result in r1 */
167: /* borrow is incoming borrow flag-- value should be 0 or 1 */
168: { register unit x;
169: short precision; /* number of units to subtract */
170: precision = global_precision;
171: make_lsbptr(r1,precision);
172: make_lsbptr(r2,precision);
173: while (precision--)
174: { if (borrow)
175: { x = *r1 - *r2 - 1;
176: borrow = (*r1 <= *r2);
177: } else
178: { x = *r1 - *r2;
179: borrow = (*r1 < *r2);
180: }
181: post_higherunit(r2);
182: *post_higherunit(r1) = x;
183: }
184: return(borrow); /* return the final carry/borrow flag bit */
185: } /* mp_subb */
186: #endif /* mp_subb */
187:
188: #ifndef mp_rotate_left
189: boolean mp_rotate_left(register unitptr r1,register boolean carry)
190: /* multiprecision rotate left 1 bit with carry, result in r1. */
191: /* carry is incoming carry flag-- value should be 0 or 1 */
192: { register int precision; /* number of units to rotate */
193: unsigned int mcarry = carry, nextcarry; /* int is supposed to be
194: * the efficient size for ops*/
195: precision = global_precision;
196: make_lsbptr(r1,precision);
197: while (precision--)
198: {
199: nextcarry = (((signedunit) *r1) < 0);
200: *r1 = (*r1 << 1) | mcarry;
201: mcarry = nextcarry;
202: pre_higherunit(r1);
203: }
204: return(nextcarry); /* return the final carry flag bit */
205: } /* mp_rotate_left */
206: #endif /* mp_rotate_left */
207:
208: /************** end of primitives that should be in assembly *************/
209:
210:
211: /* The mp_shift_right_bits function is not called in any time-critical
212: situations in public-key cryptographic functions, so it doesn't
213: need to be coded in assembly language.
214: */
215: void mp_shift_right_bits(register unitptr r1,register short bits)
216: /* multiprecision shift right bits, result in r1.
217: bits is how many bits to shift, must be <= UNITSIZE.
218: */
219: { register short precision; /* number of units to shift */
220: register unit carry,nextcarry,bitmask;
221: register short unbits;
222: if (bits==0) return; /* shift zero bits is a no-op */
223: carry = 0;
224: bitmask = power_of_2(bits)-1;
225: unbits = UNITSIZE-bits; /* shift bits must be <= UNITSIZE */
226: precision = global_precision;
227: make_msbptr(r1,precision);
228: if (bits == UNITSIZE) {
229: while (precision--) {
230: nextcarry = *r1;
231: *r1 = carry;
232: carry = nextcarry;
233: pre_lowerunit(r1);
234: }
235: } else {
236: while (precision--)
237: { nextcarry = *r1 & bitmask;
238: *r1 >>= bits;
239: *r1 |= carry << unbits;
240: carry = nextcarry;
241: pre_lowerunit(r1);
242: }
243: }
244: } /* mp_shift_right_bits */
245:
246:
247: #ifndef mp_compare
248: short mp_compare(register unitptr r1,register unitptr r2)
249: /* Compares multiprecision integers *r1, *r2, and returns:
250: -1 iff *r1 < *r2
251: 0 iff *r1 == *r2
252: +1 iff *r1 > *r2
253: */
254: { register short precision; /* number of units to compare */
255:
256: precision = global_precision;
257: make_msbptr(r1,precision);
258: make_msbptr(r2,precision);
259: do
260: { if (*r1 < *r2)
261: return(-1);
262: if (*post_lowerunit(r1) > *post_lowerunit(r2))
263: return(1);
264: } while (--precision);
265: return(0); /* *r1 == *r2 */
266: } /* mp_compare */
267: #endif /* mp_compare */
268:
269:
270: boolean mp_inc(register unitptr r)
271: /* Increment multiprecision integer r. */
272: { register short precision;
273: precision = global_precision;
274: make_lsbptr(r,precision);
275: do
276: { if ( ++(*r) ) return(0); /* no carry */
277: post_higherunit(r);
278: } while (--precision);
279: return(1); /* return carry set */
280: } /* mp_inc */
281:
282:
283: boolean mp_dec(register unitptr r)
284: /* Decrement multiprecision integer r. */
285: { register short precision;
286: precision = global_precision;
287: make_lsbptr(r,precision);
288: do
289: { if ( (signedunit) (--(*r)) != (signedunit) -1 )
290: return(0); /* no borrow */
291: post_higherunit(r);
292: } while (--precision);
293: return(1); /* return borrow set */
294: } /* mp_dec */
295:
296:
297: void mp_neg(register unitptr r)
298: /* Compute 2's complement, the arithmetic negative, of r */
299: { register short precision; /* number of units to negate */
300: precision = global_precision;
301: mp_dec(r); /* 2's complement is 1's complement plus 1 */
302: do /* now do 1's complement */
303: { *r = ~(*r);
304: r++;
305: } while (--precision);
306: } /* mp_neg */
307:
308: #ifndef mp_move
309: void mp_move(register unitptr dst,register unitptr src)
310: { register short precision; /* number of units to move */
311: precision = global_precision;
312: do { *dst++ = *src++; } while (--precision);
313: } /* mp_move */
314: #endif /* mp_move */
315:
316: void mp_init(register unitptr r, word16 value)
317: /* Init multiprecision register r with short value. */
318: { /* Note that mp_init doesn't extend sign bit for >32767 */
319:
320: unitfill0( r, global_precision);
321: make_lsbptr(r,global_precision);
322: *post_higherunit(r) = value;
323: #ifdef UNIT8
324: *post_higherunit(r) = value >> UNITSIZE;
325: #endif
326: } /* mp_init */
327:
328:
329: short significance(register unitptr r)
330: /* Returns number of significant units in r */
331: { register short precision;
332: precision = global_precision;
333: make_msbptr(r,precision);
334: do
335: { if (*post_lowerunit(r))
336: return(precision);
337: } while (--precision);
338: return(precision);
339: } /* significance */
340:
341:
342: #ifndef unitfill0
343: void unitfill0(unitptr r,word16 unitcount)
344: /* Zero-fill the unit buffer r. */
345: { while (unitcount--) *r++ = 0;
346: } /* unitfill0 */
347: #endif /* unitfill0 */
348:
349:
350: int mp_udiv(register unitptr remainder,register unitptr quotient,
351: register unitptr dividend,register unitptr divisor)
352: /* Unsigned divide, treats both operands as positive. */
353: { int bits;
354: short dprec;
355: register unit bitmask;
356: if (testeq(divisor,0))
357: return(-1); /* zero divisor means divide error */
358: mp_init0(remainder);
359: mp_init0(quotient);
360: /* normalize and compute number of bits in dividend first */
361: init_bitsniffer(dividend,bitmask,dprec,bits);
362: /* rescale quotient to same precision (dprec) as dividend */
363: rescale(quotient,global_precision,dprec);
364: make_msbptr(quotient,dprec);
365:
366: while (bits--)
367: { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
368: if (mp_compare(remainder,divisor) >= 0)
369: { mp_sub(remainder,divisor);
370: stuff_bit(quotient,bitmask);
371: }
372: bump_2bitsniffers(dividend,quotient,bitmask);
373: }
374: return(0);
375: } /* mp_udiv */
376:
377:
378: #ifdef UPTON_OR_SMITH
379: #define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */
380:
381: int mp_recip(register unitptr quotient,register unitptr divisor)
382: /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */
383: { int bits;
384: short qprec;
385: register unit bitmask;
386: unit remainder[MAX_UNIT_PRECISION];
387: if (testeq(divisor,0))
388: return(-1); /* zero divisor means divide error */
389: mp_init0(remainder);
390: mp_init0(quotient);
391:
392: /* normalize and compute number of bits in quotient first */
393: bits = countbits(divisor) + RECIPMARGIN;
394: bitmask = bitmsk(bits); /* bitmask within a single unit */
395: qprec = bits2units(bits+1);
396: mp_setbit(remainder,(bits-RECIPMARGIN)-1);
397: /* rescale quotient to precision of divisor + RECIPMARGIN bits */
398: rescale(quotient,global_precision,qprec);
399: make_msbptr(quotient,qprec);
400:
401: while (bits--)
402: { mp_shift_left(remainder);
403: if (mp_compare(remainder,divisor) >= 0)
404: { mp_sub(remainder,divisor);
405: stuff_bit(quotient,bitmask);
406: }
407: bump_bitsniffer(quotient,bitmask);
408: }
409: mp_init0(remainder); /* burn sensitive data left on stack */
410: return(0);
411: } /* mp_recip */
412: #endif /* UPTON_OR_SMITH */
413:
414:
415: int mp_div(register unitptr remainder,register unitptr quotient,
416: register unitptr dividend,register unitptr divisor)
417: /* Signed divide, either or both operands may be negative. */
418: { boolean dvdsign,dsign;
419: int status;
420: dvdsign = (boolean)(mp_tstminus(dividend)!=0);
421: dsign = (boolean)(mp_tstminus(divisor)!=0);
422: if (dvdsign) mp_neg(dividend);
423: if (dsign) mp_neg(divisor);
424: status = mp_udiv(remainder,quotient,dividend,divisor);
425: if (dvdsign) mp_neg(dividend); /* repair caller's dividend */
426: if (dsign) mp_neg(divisor); /* repair caller's divisor */
427: if (status<0) return(status); /* divide error? */
428: if (dvdsign) mp_neg(remainder);
429: if (dvdsign ^ dsign) mp_neg(quotient);
430: return(status);
431: } /* mp_div */
432:
433:
434: word16 mp_shortdiv(register unitptr quotient,
435: register unitptr dividend,register word16 divisor)
436: /* This function does a fast divide and mod on a multiprecision dividend
437: using a short integer divisor returning a short integer remainder.
438: This is an unsigned divide. It treats both operands as positive.
439: It is used mainly for faster printing of large numbers in base 10.
440: */
441: { int bits;
442: short dprec;
443: register unit bitmask;
444: register word16 remainder;
445: if (!divisor) /* if divisor == 0 */
446: return(-1); /* zero divisor means divide error */
447: remainder=0;
448: mp_init0(quotient);
449: /* normalize and compute number of bits in dividend first */
450: init_bitsniffer(dividend,bitmask,dprec,bits);
451: /* rescale quotient to same precision (dprec) as dividend */
452: rescale(quotient,global_precision,dprec);
453: make_msbptr(quotient,dprec);
454:
455: while (bits--)
456: { remainder <<= 1;
457: if (sniff_bit(dividend,bitmask))
458: remainder++;
459: if (remainder >= divisor)
460: { remainder -= divisor;
461: stuff_bit(quotient,bitmask);
462: }
463: bump_2bitsniffers(dividend,quotient,bitmask);
464: }
465: return(remainder);
466: } /* mp_shortdiv */
467:
468:
469: int mp_mod(register unitptr remainder,
470: register unitptr dividend,register unitptr divisor)
471: /* Unsigned divide, treats both operands as positive. */
472: { int bits;
473: short dprec;
474: register unit bitmask;
475: if (testeq(divisor,0))
476: return(-1); /* zero divisor means divide error */
477: mp_init0(remainder);
478: /* normalize and compute number of bits in dividend first */
479: init_bitsniffer(dividend,bitmask,dprec,bits);
480:
481: while (bits--)
482: { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
483: msub(remainder,divisor);
484: bump_bitsniffer(dividend,bitmask);
485: }
486: return(0);
487: } /* mp_mod */
488:
489:
490: word16 mp_shortmod(register unitptr dividend,register word16 divisor)
491: /* This function does a fast mod operation on a multiprecision dividend
492: using a short integer modulus returning a short integer remainder.
493: This is an unsigned divide. It treats both operands as positive.
494: It is used mainly for fast sieve searches for large primes.
495: */
496: { int bits;
497: short dprec;
498: register unit bitmask;
499: register word16 remainder;
500: if (!divisor) /* if divisor == 0 */
501: return(-1); /* zero divisor means divide error */
502: remainder=0;
503: /* normalize and compute number of bits in dividend first */
504: init_bitsniffer(dividend,bitmask,dprec,bits);
505:
506: while (bits--)
507: { remainder <<= 1;
508: if (sniff_bit(dividend,bitmask))
509: remainder++;
510: if (remainder >= divisor) remainder -= divisor;
511: bump_bitsniffer(dividend,bitmask);
512: }
513: return(remainder);
514: } /* mp_shortmod */
515:
516:
517:
518: #ifdef COMB_MULT /* use faster "comb" multiply algorithm */
519: /* We are skipping this code because it has a bug... */
520:
521: int mp_mult(register unitptr prod,
522: register unitptr multiplicand, register unitptr multiplier)
523: /* Computes multiprecision prod = multiplicand * multiplier */
524: { /* Uses interleaved comb multiply algorithm.
525: This improved multiply more than twice as fast as a Russian
526: peasant multiply, because it does a lot fewer shifts.
527: Must have global_precision set to the size of the multiplicand
528: plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum
529: of the lengths of the multiplier and multiplicand.
530:
531: BUG ALERT: Unfortunately, this code has a bug. It fails for
532: some numbers. One such example:
533: x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
534: x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
535: 52C8 CDC7 AFB3 61C8 243C 741B
536: --which is obviously wrong. The answer should be:
537: x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
538: C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
539: We'll have to fix this some day. In the meantime, we'll
540: just have the compiler skip over this code.
1.1.1.4 ! root 541:
! 542: BUG NOTE: Possibly fixed. Needs testing.
1.1.1.2 root 543: */
544: int bits;
545: register unit bitmask;
546: unitptr product, mplier, temp;
547: short mprec,mprec2;
548: unit mplicand[MAX_UNIT_PRECISION];
1.1.1.4 ! root 549:
! 550: /* better clear full width--double precision */
! 551: mp_init(prod+tohigher(global_precision),0);
! 552:
1.1.1.2 root 553: if (testeq(multiplicand,0))
554: return(0); /* zero multiplicand means zero product */
555:
556: mp_move(mplicand,multiplicand); /* save it from damage */
557:
558: normalize(multiplier,mprec);
559: if (!mprec)
560: return(0);
561:
562: make_lsbptr(multiplier,mprec);
563: bitmask = 1; /* start scan at LSB of multiplier */
564:
565: do /* UNITSIZE times */
566: { /* do for bits 0-15 */
567: product = prod;
568: mplier = multiplier;
569: mprec2 = mprec;
570: while (mprec2--) /* do for each word in multiplier */
571: {
572: if (sniff_bit(mplier,bitmask))
573: { if (mp_addc(product,multiplicand,0)) /* ripple carry */
574: { /* After 1st time thru, this is rarely encountered. */
575: temp = msbptr(product,global_precision);
576: pre_higherunit(temp);
577: /* temp now points to LSU of carry region. */
578: unmake_lsbptr(temp,global_precision);
579: mp_inc(temp);
580: } /* ripple carry */
581: }
582: pre_higherunit(mplier);
583: pre_higherunit(product);
584: }
585: if (!(bitmask <<= 1))
586: break;
587: mp_shift_left(multiplicand);
588:
589: } while (TRUE);
590:
591: mp_move(multiplicand,mplicand); /* recover */
592:
593: return(0); /* normal return */
594: } /* mp_mult */
595:
596: #endif /* COMB_MULT */
597:
598:
599: /* Because the "comb" multiply has a bug, we will use the slower
600: Russian peasant multiply instead. Fortunately, the mp_mult
601: function is not called from any time-critical code.
602: */
603:
604: int mp_mult(register unitptr prod,
605: register unitptr multiplicand,register unitptr multiplier)
606: /* Computes multiprecision prod = multiplicand * multiplier */
607: { /* Uses "Russian peasant" multiply algorithm. */
608: int bits;
609: register unit bitmask;
610: short mprec;
611: mp_init(prod,0);
612: if (testeq(multiplicand,0))
613: return(0); /* zero multiplicand means zero product */
614: /* normalize and compute number of bits in multiplier first */
615: init_bitsniffer(multiplier,bitmask,mprec,bits);
616:
617: while (bits--)
618: { mp_shift_left(prod);
619: if (sniff_bit(multiplier,bitmask))
620: mp_add(prod,multiplicand);
621: bump_bitsniffer(multiplier,bitmask);
622: }
623: return(0);
624: } /* mp_mult */
625:
626:
627:
628: /* mp_modmult computes a multiprecision multiply combined with a
629: modulo operation. This is the most time-critical function in
630: this multiprecision arithmetic library for performing modulo
631: exponentiation. We experimented with different versions of modmult,
632: depending on the machine architecture and performance requirements.
633: We will either use a Russian Peasant modmult (peasant_modmult),
634: Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
635: modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
636: On machines with a hardware atomic multiply instruction,
637: Smith's modmult is fastest. It can utilize assembly subroutines to
638: speed up the hardware multiply logic and trial quotient calculation.
639: If the machine lacks a fast hardware multiply, Merritt's modmult
640: is preferred, which doesn't call any assembly multiply routine.
641: We use the alias names mp_modmult, stage_modulus, and modmult_burn
642: for the corresponding true names, which depend on what flavor of
643: modmult we are using.
644:
645: Before making the first call to mp_modmult, you must set up the
646: modulus-dependant precomputated tables by calling stage_modulus.
647: After making all the calls to mp_modmult, you call modmult_burn to
648: erase the tables created by stage_modulus that were left in memory.
649: */
650:
651: #ifdef COUNTMULTS
652: /* "number of modmults" counters, used for performance studies. */
653: static unsigned int tally_modmults = 0;
654: static unsigned int tally_modsquares = 0;
655: #endif /* COUNTMULTS */
656:
657:
658: #ifdef PEASANT
659: /* Conventional Russian peasant multiply with modulo algorithm. */
660:
661: static unitptr pmodulus = 0; /* used only by mp_modmult */
662:
663: int stage_peasant_modulus(unitptr n)
664: /* Must pass modulus to stage_modulus before calling modmult.
665: Assumes that global_precision has already been adjusted to the
666: size of the modulus, plus SLOP_BITS.
667: */
668: { /* For this simple version of modmult, just copy unit pointer. */
669: pmodulus = n;
670: return(0); /* normal return */
671: } /* stage_peasant_modulus */
672:
673:
674: int peasant_modmult(register unitptr prod,
675: unitptr multiplicand,register unitptr multiplier)
676: { /* "Russian peasant" multiply algorithm, combined with a modulo
677: operation. This is a simple naive replacement for Merritt's
678: faster modmult algorithm. References global unitptr "modulus".
679: Computes: prod = (multiplicand*multiplier) mod modulus
680: WARNING: All the arguments must be less than the modulus!
681: */
682: int bits;
683: register unit bitmask;
684: short mprec;
685: mp_init(prod,0);
686: /* if (testeq(multiplicand,0))
687: return(0); */ /* zero multiplicand means zero product */
688: /* normalize and compute number of bits in multiplier first */
689: init_bitsniffer(multiplier,bitmask,mprec,bits);
690:
691: while (bits--)
692: { mp_shift_left(prod);
693: msub(prod,pmodulus); /* turns mult into modmult */
694: if (sniff_bit(multiplier,bitmask))
695: { mp_add(prod,multiplicand);
696: msub(prod,pmodulus); /* turns mult into modmult */
697: }
698: bump_bitsniffer(multiplier,bitmask);
699: }
700: return(0);
701: } /* peasant_modmult */
702:
703:
704: /* If we are using a version of mp_modmult that uses internal tables
705: in memory, we have to call modmult_burn() at the end of mp_modexp.
706: This is so that no sensitive data is left in memory after the program
707: exits. The Russian peasant method doesn't use any such tables.
708: */
709: void peasant_burn(void)
710: /* Alias for modmult_burn, called only from mp_modexp(). Destroys
711: internal modmult tables. This version does nothing because no
712: tables are used by the Russian peasant modmult. */
713: { } /* peasant_burn */
714:
715: #endif /* PEASANT */
716:
717:
718: #ifdef MERRITT
719: /*=========================================================================*/
720: /*
721: This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
722: Also refined by Zhahai Stewart to reduce the number of subtracts.
723: Modified by Raymond Brand to reduce the number of SLOP_BITS by 1.
724: It performs a multiply combined with a modulo operation, without
725: going into "double precision". It is faster than the Russian peasant
726: method, and still works well on machines that lack a fast hardware
727: multiply instruction.
728: */
729:
730: /* The following support functions, macros, and data structures
731: are used only by Merritt's modmult algorithm... */
732:
733: static void mp_lshift_unit(register unitptr r1)
734: /* Shift r1 1 whole unit to the left. Used only by modmult function. */
735: { register short precision;
736: register unitptr r2;
737: precision = global_precision;
738: make_msbptr(r1,precision);
739: r2 = r1;
740: while (--precision)
741: *post_lowerunit(r1) = *pre_lowerunit(r2);
742: *r1 = 0;
743: } /* mp_lshift_unit */
744:
745:
746: /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
747: static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
748: static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */
749: { 0
750: ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0],
751: &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0]
752: #ifndef UNIT8
753: ,&moduli_buf[ 7][0]
754: ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0],
755: &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0]
756: #ifndef UNIT16 /* and not UNIT8 */
757: ,&moduli_buf[15][0]
758: ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0],
759: &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0],
760: &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0],
761: &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0]
762: #endif /* UNIT16 and UNIT8 not defined */
763: #endif /* UNIT8 not defined */
764: };
765:
766: /* To optimize msubs, need following 2 unit arrays, each filled
767: with the most significant unit and next-to-most significant unit
768: of the preshifted images of the modulus. */
769: static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */
770: static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */
771:
772: /* mpdbuf contains preshifted images of the multiplicand, mod n.
773: It is used only by mp_modmult. It could be staticly declared
774: inside of mp_modmult, but we put it outside mp_modmult so that
775: it can be wiped clean by modmult_burn(), which is called at the
776: end of mp_modexp. This is so that no sensitive data is left in
777: memory after the program exits.
778: */
779: static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
780:
781:
782: static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
783: /* Computes UNITSIZE images of r, each shifted left 1 more bit.
784: Used only by modmult function.
785: */
786: { short int i;
787: images[0] = r; /* no need to move the first image, just copy ptr */
788: for (i=1; i<UNITSIZE; i++)
789: { mp_move(images[i],images[i-1]);
790: mp_shift_left(images[i]);
791: msub(images[i],moduli[0]);
792: }
793: } /* stage_mp_images */
794:
795:
796: int stage_merritt_modulus(unitptr n)
797: /* Computes UNITSIZE images of modulus, each shifted left 1 more bit.
798: Before calling mp_modmult, you must first stage the moduli images by
799: calling stage_modulus. n is the pointer to the modulus.
800: Assumes that global_precision has already been adjusted to the
801: size of the modulus, plus SLOP_BITS.
802: */
803: { short int i;
804: unitptr msu; /* ptr to most significant unit, for faster msubs */
805: moduli[0] = n; /* no need to move the first image, just copy ptr */
806:
807: /* used by optimized msubs macro... */
808: msu = msbptr(n,global_precision); /* needed by msubs */
809: msu_moduli[0] = *post_lowerunit(msu);
810: nmsu_moduli[0] = *msu;
811:
812: for (i=1; i<UNITSIZE; i++)
813: { mp_move(moduli[i],moduli[i-1]);
814: mp_shift_left(moduli[i]);
815:
816: /* used by optimized msubs macro... */
817: msu = msbptr(moduli[i],global_precision); /* needed by msubs */
818: msu_moduli[i] = *post_lowerunit(msu); /* for faster msubs */
819: nmsu_moduli[i] = *msu;
820: }
821: return(0); /* normal return */
822: } /* stage_merritt_modulus */
823:
824:
825: /* The following macros, sniffadd and msubs, are used by modmult... */
826: #define sniffadd(i) if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i])
827:
828: /* Unoptimized msubs macro (msubs0) follows... */
829: /* #define msubs0(i) msub(prod,moduli[i])
830: */
831:
832: /* To optimize msubs, compare the most significant units of the
833: product and the shifted modulus before deciding to call the full
834: mp_compare routine. Better still, compare the upper two units
835: before deciding to call mp_compare.
836: Optimization of msubs relies heavily on standard C left-to-right
837: parsimonious evaluation of logical expressions.
838: */
839:
840: /* Partially-optimized msubs macro (msubs1) follows... */
841: /* #define msubs1(i) if ( \
842: ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
843: (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
844: ) mp_sub(prod,moduli[i])
845: */
846:
847: /* Fully-optimized msubs macro (msubs2) follows... */
848: #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
849: (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
850: (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
851: mp_sub(prod,moduli[i])
852:
853:
854: int merritt_modmult(register unitptr prod,
855: unitptr multiplicand,register unitptr multiplier)
856: /* Performs combined multiply/modulo operation.
857: Computes: prod = (multiplicand*multiplier) mod modulus
858: WARNING: All the arguments must be less than the modulus!
859: Assumes the modulus has been predefined by first calling
860: stage_modulus.
861: */
862: {
863: /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
864: register signedunit p_m;
865: register unitptr msu_prod; /* ptr to most significant unit of product */
866: register unitptr nmsu_prod; /* next-most signif. unit of product */
867: short mprec; /* precision of multiplier, in units */
868: /* Array mpd contains a list of pointers to preshifted images of
869: the multiplicand: */
870: static unitptr mpd[UNITSIZE] =
871: { 0, &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
872: &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
873: #ifndef UNIT8
874: ,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
875: &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
876: #ifndef UNIT16 /* and not UNIT8 */
877: ,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
878: &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
879: &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
880: &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
881: #endif /* UNIT16 and UNIT8 not defined */
882: #endif /* UNIT8 not defined */
883: };
884:
885: /* Compute preshifted images of multiplicand, mod n: */
886: stage_mp_images(mpd,multiplicand);
887:
888: /* To optimize msubs, set up msu_prod and nmsu_prod: */
889: msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
890: nmsu_prod = msu_prod;
891: post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
892:
893: /* To understand this algorithm, it would be helpful to first
894: study the conventional Russian peasant modmult algorithm.
895: This one does about the same thing as Russian peasant, but
896: is organized differently to save some steps. It loops
897: through the multiplier a word (unit) at a time, instead of
898: a bit at a time. It word-shifts the product instead of
899: bit-shifting it, so it should be faster. It also does about
900: half as many subtracts as Russian peasant.
901: */
902:
903: mp_init(prod,0); /* Initialize product to 0. */
904:
905: /* The way mp_modmult is actually used in cryptographic
906: applications, there will NEVER be a zero multiplier or
907: multiplicand. So there is no need to optimize for that
908: condition.
909: */
910: /* if (testeq(multiplicand,0))
911: return(0); */ /* zero multiplicand means zero product */
912: /* Normalize and compute number of units in multiplier first: */
913: normalize(multiplier,mprec);
914: if (mprec==0) /* if precision of multiplier is 0 */
915: return(0); /* zero multiplier means zero product */
916: make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
917:
918: while (mprec--) /* Loop for the number of units in the multiplier */
919: {
920: /* Shift the product one whole unit to the left.
921: This is part of the multiply phase of modmult.
922: */
923:
924: mp_lshift_unit(prod);
925:
926: /* The product may have grown by as many as UNITSIZE
927: bits. That's why we have global_precision set to the
928: size of the modulus plus UNITSIZE slop bits.
929: Now reduce the product back down by conditionally
930: subtracting the preshifted images of the modulus.
931: This is part of the modulo reduction phase of modmult.
932: The following loop is unrolled for speed, using macros...
933:
934: for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--)
935: if (mp_compare(prod,moduli[i]) >= 0)
936: mp_sub(prod,moduli[i]);
937: */
938:
939: #ifndef UNIT8
940: #ifndef UNIT16 /* and not UNIT8 */
941: msubs(31);
942: msubs(30);
943: msubs(29);
944: msubs(28);
945: msubs(27);
946: msubs(26);
947: msubs(25);
948: msubs(24);
949: msubs(23);
950: msubs(22);
951: msubs(21);
952: msubs(20);
953: msubs(19);
954: msubs(18);
955: msubs(17);
956: msubs(16);
957: #endif /* not UNIT16 and not UNIT8 */
958: msubs(15);
959: msubs(14);
960: msubs(13);
961: msubs(12);
962: msubs(11);
963: msubs(10);
964: msubs(9);
965: msubs(8);
966: #endif /* not UNIT8 */
967: msubs(7);
968: msubs(6);
969: msubs(5);
970: #ifndef UNIT32
971: msubs(4);
972: #ifndef UNIT16
973: msubs(3);
974: #endif
975: #endif
976:
977: /* Sniff each bit in the current unit of the multiplier,
978: and conditionally add the corresponding preshifted
979: image of the multiplicand to the product.
980: This is also part of the multiply phase of modmult.
981:
982: The following loop is unrolled for speed, using macros...
983:
984: for (i=UNITSIZE-1; i>=0; i--)
985: if (*multiplier & power_of_2(i))
986: mp_add(prod,mpd[i]);
987: */
988: #ifndef UNIT8
989: #ifndef UNIT16 /* and not UNIT8 */
990: sniffadd(31);
991: sniffadd(30);
992: sniffadd(29);
993: sniffadd(28);
994: sniffadd(27);
995: sniffadd(26);
996: sniffadd(25);
997: sniffadd(24);
998: sniffadd(23);
999: sniffadd(22);
1000: sniffadd(21);
1001: sniffadd(20);
1002: sniffadd(19);
1003: sniffadd(18);
1004: sniffadd(17);
1005: sniffadd(16);
1006: #endif /* not UNIT16 and not UNIT8 */
1007: sniffadd(15);
1008: sniffadd(14);
1009: sniffadd(13);
1010: sniffadd(12);
1011: sniffadd(11);
1012: sniffadd(10);
1013: sniffadd(9);
1014: sniffadd(8);
1015: #endif /* not UNIT8 */
1016: sniffadd(7);
1017: sniffadd(6);
1018: sniffadd(5);
1019: sniffadd(4);
1020: sniffadd(3);
1021: sniffadd(2);
1022: sniffadd(1);
1023: sniffadd(0);
1024:
1025: /* The product may have grown by as many as LOG_UNITSIZE+1
1026: bits.
1027: Now reduce the product back down by conditionally
1028: subtracting LOG_UNITSIZE+1 preshifted images of the
1029: modulus. This is the modulo reduction phase of modmult.
1030:
1031: The following loop is unrolled for speed, using macros...
1032:
1033: for (i=LOG_UNITSIZE; i>=0; i--)
1034: if (mp_compare(prod,moduli[i]) >= 0)
1035: mp_sub(prod,moduli[i]);
1036: */
1037:
1038: #ifndef UNIT8
1039: #ifndef UNIT16
1040: msubs(5);
1041: #endif
1042: msubs(4);
1043: #endif
1044: msubs(3);
1045: msubs(2);
1046: msubs(1);
1047: msubs(0);
1048:
1049: /* Bump pointer to next lower unit of multiplier: */
1050: post_lowerunit(multiplier);
1051:
1052: } /* Loop for the number of units in the multiplier */
1053:
1054: return(0); /* normal return */
1055:
1056: } /* merritt_modmult */
1057:
1058:
1059: #undef msubs
1060: #undef sniffadd
1061:
1062:
1063: /* Merritt's mp_modmult function leaves some internal tables in memory,
1064: so we have to call modmult_burn() at the end of mp_modexp.
1065: This is so that no cryptographically sensitive data is left in memory
1066: after the program exits.
1067: */
1068: void merritt_burn(void)
1069: /* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
1070: { unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
1071: unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
1072: unitfill0(msu_moduli,UNITSIZE);
1073: unitfill0(nmsu_moduli,UNITSIZE);
1074: } /* merritt_burn() */
1075:
1076: /******* end of Merritt's MODMULT stuff. *******/
1077: /*=========================================================================*/
1078: #endif /* MERRITT */
1079:
1080:
1081: #ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's modmult algorithms */
1082:
1083: /* Jimmy Upton's multiprecision modmult algorithm in C.
1084: Performs a multiply combined with a modulo operation.
1085:
1086: The following support functions and data structures
1087: are used only by Upton's modmult algorithm...
1088: */
1089:
1090: short munit_prec; /* global_precision expressed in MULTUNITs */
1091:
1092: /* Note that since the SPARC CPU has no hardware integer multiply
1093: instruction, there is not that much advantage in having an
1094: assembly version of mp_smul on that machine. It might be faster
1095: to use Merritt's modmult instead of Upton's modmult on the SPARC.
1096: */
1097:
1098: /*
1099: Multiply the single-word multiplier times the multiprecision integer
1100: in multiplicand, accumulating result in prod. The resulting
1101: multiprecision prod will be 1 word longer than the multiplicand.
1102: multiplicand is munit_prec words long. We add into prod, so caller
1103: should zero it out first. For best results, this time-critical
1104: function should be implemented in assembly.
1105: NOTE: Unlike other functions in the multiprecision arithmetic
1106: library, both multiplicand and prod are pointing at the LSB,
1107: regardless of byte order of the machine. On an 80x86, this makes
1108: no difference. But if this assembly function is implemented
1109: on a 680x0, it becomes important.
1110: */
1111: /* Note that this has been modified from the previous version to allow
1112: better support for Smith's modmult:
1113: The final carry bit is added to the existing product
1114: array, rather than simply stored.
1115: */
1116:
1117: #ifndef mp_smul
1118: void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
1119: {
1120: short i;
1121: unsigned long p, carry;
1122:
1123: carry = 0;
1124: for (i=0; i<munit_prec; ++i)
1125: { p = (unsigned long)multiplier * *post_higherunit(multiplicand);
1126: p += *prod + carry;
1127: *post_higherunit(prod) = (MULTUNIT)p;
1128: carry = p >> MULTUNITSIZE;
1129: }
1130: /* Add carry to the next higher word of product / dividend */
1131: *prod += (MULTUNIT)carry;
1132: } /* mp_smul */
1133: #endif
1134:
1135: /* mp_dmul is a double-precision multiply multiplicand times multiplier,
1136: result into prod. prod must be pointing at a "double precision"
1137: buffer. E.g. If global_precision is 10 words, prod must be
1138: pointing at a 20-word buffer.
1139: */
1140: #ifndef mp_dmul
1141: void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
1142: {
1143: register int i;
1144: register MULTUNIT *p_multiplicand, *p_multiplier;
1145: register MULTUNIT *prodp;
1146:
1147:
1148: unitfill0(prod,global_precision*2); /* Pre-zero prod */
1149: /* Calculate precision in units of MULTUNIT */
1150: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
1151: p_multiplicand = (MULTUNIT *)multiplicand;
1152: p_multiplier = (MULTUNIT *)multiplier;
1153: prodp = (MULTUNIT *)prod;
1154: make_lsbptr(p_multiplicand,munit_prec);
1155: make_lsbptr(p_multiplier,munit_prec);
1156: make_lsbptr(prodp,munit_prec*2);
1157: /* Multiply multiplicand by each word in multiplier, accumulating prod: */
1158: for (i=0; i<munit_prec; ++i)
1159: mp_smul (post_higherunit(prodp),
1160: p_multiplicand, *post_higherunit(p_multiplier));
1161: } /* mp_dmul */
1162: #endif /* mp_dmul */
1163:
1164: static unit ALIGN modulus[MAX_UNIT_PRECISION];
1165: static short nbits; /* number of modulus significant bits */
1166: #endif /* UPTON_OR_SMITH */
1167:
1168: #ifdef UPTON
1169:
1170: /* These scratchpad arrays are used only by upton_modmult (mp_modmult).
1171: Some of them could be staticly declared inside of mp_modmult, but we
1172: put them outside mp_modmult so that they can be wiped clean by
1173: modmult_burn(), which is called at the end of mp_modexp. This is
1174: so that no sensitive data is left in memory after the program exits.
1175: */
1176:
1177: static unit ALIGN reciprocal[MAX_UNIT_PRECISION];
1178: static unit ALIGN dhi[MAX_UNIT_PRECISION];
1179: static unit ALIGN d_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1180: static unit ALIGN e_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1181: static unit ALIGN f_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
1182:
1183: static short nbitsDivUNITSIZE;
1184: static short nbitsModUNITSIZE;
1185:
1186: /* stage_upton_modulus() is aliased to stage_modulus().
1187: Prepare for an Upton modmult. Calculate the reciprocal of modulus,
1188: and save both. Note that reciprocal will have one more bit than
1189: modulus.
1190: Assumes that global_precision has already been adjusted to the
1191: size of the modulus, plus SLOP_BITS.
1192: */
1193: int stage_upton_modulus(unitptr n)
1194: {
1195: mp_move(modulus, n);
1196: mp_recip(reciprocal, modulus);
1197: nbits = countbits(modulus);
1198: nbitsDivUNITSIZE = nbits / UNITSIZE;
1199: nbitsModUNITSIZE = nbits % UNITSIZE;
1200: return(0); /* normal return */
1201: } /* stage_upton_modulus */
1202:
1203:
1204: /* Upton's algorithm performs a multiply combined with a modulo operation.
1205: Computes: prod = (multiplicand*multiplier) mod modulus
1206: WARNING: All the arguments must be less than the modulus!
1207: References global unitptr modulus and reciprocal.
1208: The reciprocal of modulus is 1 bit longer than the modulus.
1209: upton_modmult() is aliased to mp_modmult().
1210: */
1211: int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
1212: {
1213: unitptr d = d_data;
1214: unitptr d1 = d_data;
1215: unitptr e = e_data;
1216: unitptr f = f_data;
1217: short orig_precision;
1218:
1219: orig_precision = global_precision;
1220: mp_dmul (d,multiplicand,multiplier);
1221:
1222: /* Throw off low nbits of d */
1223: #ifndef HIGHFIRST
1224: d1 = d + nbitsDivUNITSIZE;
1225: #else
1226: d1 = d + orig_precision - nbitsDivUNITSIZE;
1227: #endif
1228: mp_move(dhi, d1); /* Don't screw up d, we need it later */
1229: mp_shift_right_bits(dhi, nbitsModUNITSIZE);
1230:
1231: mp_dmul (e,dhi,reciprocal); /* Note - reciprocal has nbits+1 bits */
1232:
1233: #ifndef HIGHFIRST
1234: e += nbitsDivUNITSIZE;
1235: #else
1236: e += orig_precision - nbitsDivUNITSIZE;
1237: #endif
1238: mp_shift_right_bits(e, nbitsModUNITSIZE);
1239:
1240: mp_dmul (f,e,modulus);
1241:
1242: /* Now for the only double-precision call to mpilib: */
1243: set_precision (orig_precision * 2);
1244: mp_sub (d,f);
1245:
1246: /* d's precision should be <= orig_precision */
1247: rescale (d, orig_precision*2, orig_precision);
1248: set_precision (orig_precision);
1249:
1250: /* Should never have to do this final subtract more than twice: */
1251: while (mp_compare(d,modulus) > 0)
1252: mp_sub (d,modulus);
1253:
1254: mp_move(prod,d);
1255: return(0); /* normal return */
1256: } /* upton_modmult */
1257:
1258:
1259: /* Upton's mp_modmult function leaves some internal arrays in memory,
1260: so we have to call modmult_burn() at the end of mp_modexp.
1261: This is so that no cryptographically sensitive data is left in memory
1262: after the program exits.
1263: upton_burn() is aliased to modmult_burn().
1264: */
1265: void upton_burn(void)
1266: {
1267: unitfill0(modulus,MAX_UNIT_PRECISION);
1268: unitfill0(reciprocal,MAX_UNIT_PRECISION);
1269: unitfill0(dhi,MAX_UNIT_PRECISION);
1270: unitfill0(d_data,MAX_UNIT_PRECISION*2);
1271: unitfill0(e_data,MAX_UNIT_PRECISION*2);
1272: unitfill0(f_data,MAX_UNIT_PRECISION*2);
1273: nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
1274: } /* upton_burn */
1275:
1276: /******* end of Upton's MODMULT stuff. *******/
1277: /*=========================================================================*/
1278: #endif /* UPTON */
1279:
1280: #ifdef SMITH /* using Thad Smith's modmult algorithm */
1281:
1282: /* Thad Smith's implementation of multiprecision modmult algorithm in C.
1283: Performs a multiply combined with a modulo operation.
1284: The multiplication is done with mp_dmul, the same as for Upton's
1285: modmult. The modulus reduction is done by long division, in
1286: which a trial quotient "digit" is determined, then the product of
1287: that digit and the divisor are subtracted from the dividend.
1288:
1289: In this case, the digit is MULTUNIT in size and the subtraction
1290: is done by adding the product to the one's complement of the
1291: dividend, which allows use of the existing mp_smul routine.
1292:
1293: The following support functions and data structures
1294: are used only by Smith's modmult algorithm...
1295: */
1296:
1297: /* These scratchpad arrays are used only by smith_modmult (mp_modmult).
1298: Some of them could be statically declared inside of mp_modmult, but we
1299: put them outside mp_modmult so that they can be wiped clean by
1300: modmult_burn(), which is called at the end of mp_modexp. This is
1301: so that no sensitive data is left in memory after the program exits.
1302: */
1303:
1304: static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2];
1305:
1306: static unit mod_quotient [4];
1307: static unit mod_divisor [4]; /* 2 most signif. MULTUNITs of modulus */
1308:
1309: static MULTUNIT *modmpl; /* ptr to modulus least significant
1310: ** MULTUNIT */
1311: static int mshift; /* number of bits for
1312: ** recip scaling */
1313: static MULTUNIT reciph; /* MSunit of scaled recip */
1314: static MULTUNIT recipl; /* LSunit of scaled recip */
1315:
1316: static short modlenMULTUNITS; /* length of modulus in MULTUNITs */
1317: static MULTUNIT mutemp; /* temporary */
1318:
1319: /* The routines mp_smul and mp_dmul are the same as for UPTON and
1320: should be coded in assembly. Note, however, that the previous
1321: Upton's mp_smul version has been modified to compatible with
1322: Smith's modmult. The new version also still works for Upton's
1323: modmult.
1324: */
1325:
1326: #ifndef mp_set_recip
1327: #define mp_set_recip(rh,rl,m) /* null */
1328: #else
1329: /* setup routine for external mp_quo_digit */
1330: void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m);
1331: #endif
1332: MULTUNIT mp_quo_digit (MULTUNIT *dividend);
1333:
1334: #ifdef MULTUNIT_SIZE_SAME
1335: #define mp_musubb mp_subb /* use existing routine */
1336: #else /* ! MULTUNIT_SIZE_SAME */
1337:
1338: /* This performs the same function as mp_subb, but with MULTUNITs.
1339: Note: Processors without alignment requirements may be able to use
1340: mp_subb, even though MULTUNITs are smaller than units. In that case,
1341: use mp_subb, since it would be faster if coded in assembly. Note that
1342: this implementation won't work for MULTUNITs which are long -- use
1343: mp_subb in that case.
1344: */
1345: #ifndef mp_musubb
1346: boolean mp_musubb
1347: (register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow)
1348: /* multiprecision subtract of MULTUNITs with borrow, r2 from r1,
1349: ** result in r1 */
1350: /* borrow is incoming borrow flag-- value should be 0 or 1 */
1351: { register ulint x; /* won't work if sizeof(MULTUNIT)==
1352: sizeof(long) */
1353: short precision; /* number of MULTUNITs to subtract */
1354: precision = global_precision * MULTUNITs_perunit;
1355: make_lsbptr(r1,precision);
1356: make_lsbptr(r2,precision);
1357: while (precision--)
1358: { x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
1359: *post_higherunit(r1) = x;
1360: borrow = (((1L << MULTUNITSIZE) & x) != 0L);
1361: }
1362: return (borrow);
1363: } /* mp_musubb */
1364: #endif /* mp_musubb */
1365: #endif /* MULTUNIT_SIZE_SAME */
1366:
1367: /* The function mp_quo_digit is the heart of Smith's modulo reduction,
1368: which uses a form of long division. It computes a trial quotient
1369: "digit" (MULTUNIT-sized digit) by multiplying the three most
1370: significant MULTUNITs of the dividend by the two most significant
1371: MULTUNITs of the reciprocal of the modulus. Note that this function
1372: requires that MULTUNITSIZE * 2 <= sizeof(unsigned long).
1373:
1374: An important part of this technique is that the quotient never be
1375: too small, although it may occasionally be too large. This was
1376: done to eliminate the need to check and correct for a remainder
1377: exceeding the divisor. It is easier to check for a negative
1378: remainder. The following technique rarely needs correction for
1379: MULTUNITs of at least 16 bits.
1380:
1381: The following routine has two implementations:
1382:
1383: ASM_PROTOTYPE defined: written to be an executable prototype for
1384: an efficient assembly language implementation. Note that several
1385: of the following masks and shifts can be done by directly
1386: manipulating single precision registers on some architectures.
1387:
1388: ASM_PROTOTYPE undefined: a slightly more efficient implementation
1389: in C. Although this version returns a result larger than the
1390: optimum (which is corrected in smith_modmult) more often than the
1391: prototype version, the faster execution in C more than makes up
1392: for the difference.
1393:
1394: Parameter: dividend - points to the most significant MULTUNIT
1395: of the dividend. Note that dividend actually contains the
1396: one's complement of the actual dividend value (see comments for
1397: smith_modmult).
1398:
1399: Return: the trial quotient digit resulting from dividing the first
1400: three MULTUNITs at dividend by the upper two MULTUNITs of the
1401: modulus.
1402: */
1403:
1404: /* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */
1405:
1406: #ifndef mp_quo_digit
1407: MULTUNIT mp_quo_digit (MULTUNIT *dividend) {
1408: unsigned long q, q0, q1, q2;
1409: unsigned short lsb_factor;
1410:
1411: /* Compute the least significant product group.
1412: The last terms of q1 and q2 perform upward rounding, which is
1413: needed to guarantee that the result not be too small.
1414: */
1415: q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph
1416: + reciph;
1417: q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl
1418: + (1L << MULTUNITSIZE) ;
1419: #ifdef ASM_PROTOTYPE
1420: lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE);
1421: q = q1 + q2;
1422:
1423: /* The following statement is equivalent to shifting the sum right
1424: one bit while shifting in the carry bit.
1425: */
1426: q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
1427: #else /* optimized C version */
1428: q0 = (q1>>1) + (q2>>1) + 1;
1429: #endif
1430:
1431: /* Compute the middle significant product group. */
1432:
1433: q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph;
1434: q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl;
1435: #ifdef ASM_PROTOTYPE
1436: q = q1 + q2;
1437: q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
1438:
1439: /* Add in the most significant word of the first group.
1440: The last term takes care of the carry from adding the lsb's
1441: that were shifted out prior to addition.
1442: */
1443: q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2));
1444: #else /* optimized C version */
1445: q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1;
1446: #endif
1447:
1448: /* Compute the most significant term and add in the others */
1449:
1450: q = (q >> (MULTUNITSIZE-2)) +
1451: (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1);
1452: q >>= mshift;
1453:
1454: /* Prevent overflow and then wipe out the intermediate results. */
1455:
1456: mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1);
1457: q= q0 = q1 = q2 = 0; lsb_factor = 0; (void)lsb_factor;
1458: return mutemp;
1459: }
1460: #endif /* mp_quo_digit */
1461:
1462: /* stage_smith_modulus() - Prepare for a Smith modmult.
1463:
1464: Calculate the reciprocal of modulus with a precision of two MULTUNITs.
1465: Assumes that global_precision has already been adjusted to the
1466: size of the modulus, plus SLOP_BITS.
1467:
1468: Note: This routine was designed to work with large values and
1469: doesn't have the necessary testing or handling to work with a
1470: modulus having less than three significant units. For such cases,
1471: the separate multiply and modulus routines can be used.
1472:
1473: stage_smith_modulus() is aliased to stage_modulus().
1474: */
1475:
1476: int stage_smith_modulus(unitptr n_modulus)
1477: {
1478: int original_precision;
1479: int sigmod; /* significant units in modulus */
1480: unitptr mp; /* modulus most significant pointer */
1481: MULTUNIT *mpm; /* reciprocal pointer */
1482: int prec; /* precision of reciprocal calc in units */
1483:
1484: mp_move(modulus, n_modulus);
1485: modmpl = (MULTUNIT*) modulus;
1486: modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit);
1487: nbits = countbits(modulus);
1488: modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE;
1489:
1490: original_precision = global_precision;
1491:
1492: /* The following code copies the three most significant units of
1493: * modulus to mod_divisor.
1494: */
1495: mp = modulus;
1496: sigmod = significance (modulus);
1497: rescale (mp, original_precision, sigmod);
1498: /* prec is the unit precision required for 3 MULTUNITs */
1499: prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit;
1500: set_precision (prec);
1501:
1502: /* set mp = ptr to most significant units of modulus, then move
1503: * the most significant units to mp_divisor
1504: */
1505: mp = msbptr(mp,sigmod) -tohigher(prec-1);
1506: unmake_lsbptr (mp, prec);
1507: mp_move (mod_divisor, mp);
1508:
1509: /* Keep 2*MULTUNITSIZE bits in mod_divisor.
1510: * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits.
1511: */
1512: mshift = countbits (mod_divisor) - 2*MULTUNITSIZE;
1513: mp_shift_right_bits (mod_divisor, mshift);
1514: mp_recip(mod_quotient,mod_divisor);
1515: mp_shift_right_bits (mod_quotient,1);
1516:
1517: /* Reduce to: 0 < mshift <= MULTUNITSIZE */
1518: mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1;
1519: /* round up, rescaling if necessary */
1520: mp_inc (mod_quotient);
1521: if (countbits (mod_quotient) > 2*MULTUNITSIZE) {
1522: mp_shift_right_bits (mod_quotient,1);
1523: mshift--; /* now 0 <= mshift <= MULTUNITSIZE */
1524: }
1525: mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit);
1526: recipl = *post_higherunit (mpm);
1527: reciph = *mpm;
1528: mp_set_recip (reciph, recipl, mshift);
1529: set_precision (original_precision);
1530: return(0); /* normal return */
1531: } /* stage_smith_modulus */
1532:
1533: /* Smith's algorithm performs a multiply combined with a modulo operation.
1534: Computes: prod = (multiplicand*multiplier) mod modulus
1535: The modulus must first be set by stage_smith_modulus().
1536: smith_modmult() is aliased to mp_modmult().
1537: */
1538:
1539: int
1540: smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
1541: {
1542: unitptr d; /* ptr to product */
1543: MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first)
1544: * aligned for subtraction */
1545: /* Note that dmph points one MULTUNIT higher than indicated by
1546: global precision. This allows us to zero out a word one higher than
1547: the normal precision.
1548: */
1549: short orig_precision;
1550: short nqd; /* number of quotient digits remaining to
1551: * be generated */
1552: short dmi; /* number of significant MULTUNITs in product */
1553:
1554: d = ds_data + 1; /* room for leading MSB if HIGHFIRST */
1555: orig_precision = global_precision;
1556: mp_dmul(d, multiplicand, multiplier);
1557:
1558: rescale(d, orig_precision * 2, orig_precision * 2 + 1);
1559: set_precision(orig_precision * 2 + 1);
1560: *msbptr(d, global_precision) = 0; /* leading 0 unit */
1561:
1562: /* We now start working with MULTUNITs.
1563: Determine the most significant MULTUNIT of the product so we don't
1564: have to process leading zeros in our divide loop.
1565: */
1566: dmi = significance(d) * MULTUNITs_perunit;
1567: if (dmi >= modlenMULTUNITS)
1568: { /* Make dividend negative. This allows the use of mp_smul to
1569: * "subtract" the product of the modulus and the trial divisor
1570: * by actually adding to a negative dividend.
1571: * The one's complement of the dividend is used, since it causes
1572: * a zero value to be represented as all ones. This facilitates
1573: * testing the result for possible overflow, since a sign bit
1574: * indicates that no adjustment is necessary, and we should not
1575: * attempt to adjust if the result of the addition is zero.
1576: */
1577: mp_inc(d);
1578: mp_neg(d);
1579: set_precision(orig_precision);
1580: munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
1581:
1582: /* Set msb, lsb, and normal ptrs of dividend */
1583: dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) *
1584: MULTUNITs_perunit) + tohigher(dmi);
1585: nqd = dmi + 1 - modlenMULTUNITS;
1586: dmpl = dmph - tohigher(modlenMULTUNITS);
1587:
1588: /* Divide loop.
1589: Each iteration computes the next quotient MULTUNIT digit, then
1590: multiplies the divisor (modulus) by the quotient digit and adds
1591: it to the one's complement of the dividend (equivalent to
1592: subtracting). If the product was greater than the remaining dividend,
1593: we get a non-negative result, in which case we subtract off the
1594: modulus to get the proper negative remainder.
1595: */
1596: for (; nqd; nqd--)
1597: { MULTUNIT q; /* quotient trial digit */
1598:
1599: q = mp_quo_digit(dmph);
1600: if (q > 0)
1601: { mp_smul(dmpl, modmpl, q);
1602:
1603: /* Perform correction if q too large.
1604: * This rarely occurs.
1605: */
1606: if (!(*dmph & MULTUNIT_msb))
1607: { dmp = dmpl;
1608: unmake_lsbptr(dmp, orig_precision *
1609: MULTUNITs_perunit);
1610: if (mp_musubb(dmp,
1611: (MULTUNIT *) modulus, 0))
1612: (*dmph) --;
1613: }
1614: }
1615: pre_lowerunit(dmph);
1616: pre_lowerunit(dmpl);
1617: }
1618: /* d contains the one's complement of the remainder. */
1619: rescale(d, orig_precision * 2 + 1, orig_precision);
1620: set_precision(orig_precision);
1621: mp_neg(d);
1622: mp_dec(d);
1623: } else
1624: { /* Product was less than modulus. Return it. */
1625: rescale(d, orig_precision * 2 + 1, orig_precision);
1626: set_precision(orig_precision);
1627: }
1628: mp_move(prod, d);
1629: return (0); /* normal return */
1630: } /* smith_modmult */
1631:
1632:
1633: /* Smith's mp_modmult function leaves some internal arrays in memory,
1634: so we have to call modmult_burn() at the end of mp_modexp.
1635: This is so that no cryptographically sensitive data is left in memory
1636: after the program exits.
1637: smith_burn() is aliased to modmult_burn().
1638: */
1639: void smith_burn(void)
1640: {
1641: empty_array (modulus);
1642: empty_array (ds_data);
1643: empty_array (mod_quotient);
1644: empty_array (mod_divisor);
1645: modmpl = 0;
1646: mshift =nbits = 0;
1647: reciph = recipl = 0;
1648: modlenMULTUNITS = mutemp = 0;
1649: mp_set_recip (0,0,0);
1650: } /* smith_burn */
1651:
1652: /* End of Thad Smith's implementation of modmult. */
1653:
1654: #endif /* SMITH */
1655:
1656:
1657: int countbits(unitptr r)
1658: /* Returns number of significant bits in r */
1659: { int bits;
1660: short prec;
1661: register unit bitmask;
1662: init_bitsniffer(r,bitmask,prec,bits);
1663: return(bits);
1664: } /* countbits */
1665:
1666:
1667: char *copyright_notice(void)
1668: /* force linker to include copyright notice in the executable object image. */
1669: { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */
1670:
1671:
1672: int mp_modexp(register unitptr expout,register unitptr expin,
1673: register unitptr exponent,register unitptr modulus)
1674: { /* Russian peasant combined exponentiation/modulo algorithm.
1675: Calls modmult instead of mult.
1676: Computes: expout = (expin**exponent) mod modulus
1677: WARNING: All the arguments must be less than the modulus!
1678: */
1679: int bits;
1680: short oldprecision;
1681: register unit bitmask;
1682: unit product[MAX_UNIT_PRECISION];
1683: short eprec;
1684:
1685: #ifdef COUNTMULTS
1686: tally_modmults = 0; /* clear "number of modmults" counter */
1687: tally_modsquares = 0; /* clear "number of modsquares" counter */
1688: #endif /* COUNTMULTS */
1689: mp_init(expout,1);
1690: if (testeq(exponent,0))
1691: { if (testeq(expin,0))
1692: return(-1); /* 0 to the 0th power means return error */
1693: return(0); /* otherwise, zero exponent means expout is 1 */
1694: }
1695: if (testeq(modulus,0))
1696: return(-2); /* zero modulus means error */
1697: #if SLOP_BITS > 0 /* if there's room for sign bits */
1698: if (mp_tstminus(modulus))
1699: return(-2); /* negative modulus means error */
1700: #endif /* SLOP_BITS > 0 */
1701: if (mp_compare(expin,modulus) >= 0)
1702: return(-3); /* if expin >= modulus, return error */
1703: if (mp_compare(exponent,modulus) >= 0)
1704: return(-4); /* if exponent >= modulus, return error */
1705:
1706: oldprecision = global_precision; /* save global_precision */
1707: /* set smallest optimum precision for this modulus */
1708: set_precision(bits2units(countbits(modulus)+SLOP_BITS));
1709: /* rescale all these registers to global_precision we just defined */
1710: rescale(modulus,oldprecision,global_precision);
1711: rescale(expin,oldprecision,global_precision);
1712: rescale(exponent,oldprecision,global_precision);
1713: rescale(expout,oldprecision,global_precision);
1714:
1715: if (stage_modulus(modulus))
1716: { set_precision(oldprecision); /* restore original precision */
1717: return(-5); /* unstageable modulus (STEWART algorithm) */
1718: }
1719:
1720: /* normalize and compute number of bits in exponent first */
1721: init_bitsniffer(exponent,bitmask,eprec,bits);
1722:
1723: /* We can "optimize out" the first modsquare and modmult: */
1724: bits--; /* We know for sure at this point that bits>0 */
1725: mp_move(expout,expin); /* expout = (1*1)*expin; */
1726: bump_bitsniffer(exponent,bitmask);
1727:
1728: while (bits--)
1729: {
1730: poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
1731: #ifdef COUNTMULTS
1732: tally_modsquares++; /* bump "number of modsquares" counter */
1733: #endif /* COUNTMULTS */
1734: mp_modsquare(product,expout);
1735: if (sniff_bit(exponent,bitmask))
1736: { mp_modmult(expout,product,expin);
1737: #ifdef COUNTMULTS
1738: tally_modmults++; /* bump "number of modmults" counter */
1739: #endif /* COUNTMULTS */
1740: } else
1741: {
1742: mp_move(expout,product);
1743: }
1744: bump_bitsniffer(exponent,bitmask);
1745: } /* while bits-- */
1746: mp_burn(product); /* burn the evidence on the stack */
1747: modmult_burn(); /* ask mp_modmult to also burn its own evidence */
1748:
1749: #ifdef COUNTMULTS /* diagnostic analysis */
1750: { long atomic_mults;
1751: unsigned int unitcount,totalmults;
1752: unitcount = bits2units(countbits(modulus));
1753: /* calculation assumes modsquare takes as long as a modmult: */
1754: atomic_mults = (long) tally_modmults * (unitcount * unitcount);
1755: atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
1756: printf("%ld atomic mults for ",atomic_mults);
1757: printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
1758: tally_modsquares,tally_modmults,
1759: tally_modsquares+tally_modmults,
1760: countbits(modulus), unitcount);
1761: }
1762: #endif /* COUNTMULTS */
1763:
1764: set_precision(oldprecision); /* restore original precision */
1765:
1766: /* Do an explicit reference to the copyright notice so that the linker
1767: will be forced to include it in the executable object image... */
1768: copyright_notice(); /* has no real effect at run time */
1769:
1770: return(0); /* normal return */
1771: } /* mp_modexp */
1772:
1.1.1.4 ! root 1773: int mp_modexp_crt(unitptr expout, unitptr expin,
! 1774: unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u)
! 1775: /* This is a faster modexp for moduli with a known
! 1776: factorisation into two relatively prime factors p and q,
! 1777: and an input relatively prime to the modulus,
! 1778: the Chinese Remainder Theorem to do the computation
! 1779: mod p and mod q, and then combine the results. This
! 1780: relies on a number of precomputed values, but does not
! 1781: actually require the modulus n or the exponent e.
! 1782:
! 1783: expout = expin ^ e mod (p*q).
! 1784: We form this by evaluating
! 1785: p2 = (expin ^ e) mod p and
! 1786: q2 = (expin ^ e) mod q
! 1787: and then combining the two by the CRT.
! 1788:
! 1789: Two optimisations of this are possible. First, we can
! 1790: reduce expin modulo p and q before starting.
! 1791:
! 1792: Second, since we know the factorisation of p and q
! 1793: (trivially derived from the factorisation of n = p*q),
! 1794: and expin is relatively prime to both p and q,
! 1795: we can use Euler's theorem, expin^phi(m) = 1 (mod m),
! 1796: to throw away multiples of phi(p) or phi(q) in e.
! 1797: Letting ep = e mod phi(p) and
! 1798: eq = e mod phi(q)
! 1799: then combining these two speedups, we only need to evaluate
! 1800: p2 = ((expin mod p) ^ ep) mod p and
! 1801: q2 = ((expin mod q) ^ eq) mod q.
! 1802:
! 1803: Now we need to apply the CRT. Starting with
! 1804: expout = p2 (mod p) and
! 1805: expout = q2 (mod q)
! 1806: we can say that expout = p2 + p * k, and if we assume that
! 1807: 0 <= p2 < p, then 0 <= expout < p*q for some 0 <= k < q.
! 1808: Since we want expout = q2 (mod q), then
! 1809: p*k = q2-p2 (mod q). Since p and q are relatively
! 1810: prime, p has a multiplicative inverse u mod q. In other
! 1811: words,
! 1812: u = 1/p (mod q).
! 1813: Multiplying by u on both sides gives k = u*(q2-p2) (mod q).
! 1814: Since we want 0 <= k < q, we can thus find k as
! 1815: k = (u * (q2-p2)) mod q.
! 1816:
! 1817: Once we have k, evaluating p2 + p * k is easy, and
! 1818: that gives us the result.
! 1819:
! 1820: In the detailed implementation, there is a temporary, temp,
! 1821: used to hold intermediate results, p2 is held in expout,
! 1822: and q2 is used as a temporary in the final step when it is
! 1823: no longer needed. With that, you should be able to
! 1824: understand the code below.
1.1.1.2 root 1825: */
1826: {
1827: unit q2[MAX_UNIT_PRECISION];
1.1.1.4 ! root 1828: unit temp[MAX_UNIT_PRECISION];
1.1.1.2 root 1829: int status;
1830:
1.1.1.4 ! root 1831: /* First, compiute p2 (physically held in M) */
1.1.1.2 root 1832:
1.1.1.4 ! root 1833: /* p2 = [ (expin mod p) ^ ep ] mod p */
! 1834: mp_mod(temp,expin,p); /* temp = expin mod p */
! 1835: status = mp_modexp(expout,temp,ep,p);
1.1.1.2 root 1836: if (status < 0) /* mp_modexp returned an error. */
1.1.1.4 ! root 1837: { mp_init(expout,1);
1.1.1.2 root 1838: return(status); /* error return */
1.1.1.4 ! root 1839: }
1.1.1.2 root 1840:
1.1.1.4 ! root 1841: /* And the same thing for q2 */
1.1.1.2 root 1842:
1.1.1.4 ! root 1843: /* q2 = [ (expin mod q) ^ eq ] mod q */
! 1844: mp_mod(temp,expin,q); /* temp = expin mod q */
! 1845: status = mp_modexp(q2,temp,eq,q);
1.1.1.2 root 1846: if (status < 0) /* mp_modexp returned an error. */
1.1.1.4 ! root 1847: { mp_init(expout,1);
1.1.1.2 root 1848: return(status); /* error return */
1.1.1.4 ! root 1849: }
1.1.1.2 root 1850:
1851: /* Now use the multiplicative inverse u to glue together the
1.1.1.4 ! root 1852: two halves.
! 1853: */
! 1854: #if 0
! 1855: /* This optimisation is useful if you commonly get small results,
! 1856: but for random results and large q, the odds of (1/q) of it
! 1857: being useful do not warrant the test.
! 1858: */
! 1859: if (mp_compare(expout,q2) != 0)
! 1860: {
! 1861: #endif
! 1862: /* Find q2-p2 mod q */
! 1863: if (mp_sub(q2,expout)) /* if the result went negative */
! 1864: mp_add(q2,q); /* add q to q2 */
! 1865:
! 1866: /* expout = p2 + ( p * [(q2*u) mod q] ) */
! 1867: mp_mult(temp,q2,u); /* q2*u */
! 1868: mp_mod(q2,temp,q); /* (q2*u) mod q */
! 1869: mp_mult(temp,p,q2); /* p * [(q2*u) mod q] */
! 1870: mp_add(expout,temp); /* expout = p2 + p * [...] */
! 1871: #if 0
1.1.1.2 root 1872: }
1.1.1.4 ! root 1873: #endif
1.1.1.2 root 1874:
1.1.1.4 ! root 1875: mp_burn(q2); /* burn the evidence on the stack...*/
! 1876: mp_burn(temp);
1.1.1.2 root 1877: return(0); /* normal return */
1.1.1.4 ! root 1878: } /* mp_modexp_crt */
1.1.1.2 root 1879:
1880:
1881: /****************** end of MPI library ****************************/
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.