--- pgp/src/mpilib.c	2018/04/24 16:37:52	1.1
+++ pgp/src/mpilib.c	2018/04/24 16:40:49	1.1.1.5
@@ -7,22 +7,23 @@
 	Boulder, CO 80304
 	(303) 541-0140
 
-	(c) Copyright 1986-92 by Philip Zimmermann.  All rights reserved.
-	The author assumes no liability for damages resulting from the use 
-	of this software, even if the damage results from defects in this 
-	software.  No warranty is expressed or implied.  The use of this 
-	cryptographic software for developing weapon systems is expressly 
+	(c) Copyright 1986-1994 by Philip Zimmermann.  All rights reserved.
+	The author assumes no liability for damages resulting from the use
+	of this software, even if the damage results from defects in this
+	software.  No warranty is expressed or implied.  The use of this
+	cryptographic software for developing weapon systems is expressly
 	forbidden.
 
-	These routines implement all of the multiprecision arithmetic 
-	necessary for number-theoretic cryptographic algorithms such as 
-	ElGamal, Diffie-Hellman, Rabin, or factoring studies for large 
-	composite numbers, as well as Rivest-Shamir-Adleman (RSA) public 
+
+	These routines implement all of the multiprecision arithmetic
+	necessary for number-theoretic cryptographic algorithms such as
+	ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
+	composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
 	key cryptography.
 
-	Although originally developed in Microsoft C for the IBM PC, this code 
-	contains few machine dependancies.  It assumes 2's complement 
-	arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines, 
+	Although originally developed in Microsoft C for the IBM PC, this code
+	contains few machine dependencies.  It assumes 2's complement
+	arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines,
 	lowbyte-highbyte order or highbyte-lowbyte order.  This version
 	has been converted to ANSI C.
 
@@ -47,13 +48,22 @@
 	Modified 8 Apr 92 - HAJK
 	Implement new VAX/VMS primitive support.
 
+	Modified 30 Sep 92 -Castor Fu <castor@drizzle.stanford.edu>
+	Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long)
+
+	Modified 28 Nov 92 - Thad Smith
+	Added Smith modmult, generalized non-portable support.
 */
 
 /* #define COUNTMULTS */ /* count modmults for performance studies */
 
 #ifdef DEBUG
 #ifdef MSDOS
+#ifdef __GO32__ /* DJGPP */
+#include <pc.h>
+#else
 #include <conio.h>
+#endif  /* __GO32__ */
 #define poll_for_break() {while (kbhit()) getch();}
 #endif	/* MSDOS */
 #endif	/* DEBUG */
@@ -64,6 +74,52 @@
 
 #include "mpilib.h"
 
+#ifdef mp_smula
+#ifdef mp_smul
+	Error: Both mp_smula and mp_smul cannot be defined.
+#else
+#define mp_smul	mp_smula
+#endif
+#endif
+
+/* set macros for MULTUNIT */
+#ifdef MUNIT8
+#define MULTUNITSIZE   8
+typedef unsigned char MULTUNIT;
+#ifdef UNIT8
+#define MULTUNIT_SIZE_SAME
+#endif
+#else	/* not MUNIT8 */
+#ifdef MUNIT32
+#define MULTUNITSIZE   32
+typedef unsigned long MULTUNIT;
+#ifdef UNIT32
+#define MULTUNIT_SIZE_SAME
+#else
+/* #error is not portable, this has the same effect */
+#include "UNITSIZE cannot be smaller than MULTUNITSIZE"
+#endif
+#else   /* assume MUNIT16 */
+#define MULTUNITSIZE   16
+typedef unsigned short MULTUNIT;
+#ifdef UNIT16
+#define MULTUNIT_SIZE_SAME
+#endif	/* UNIT16 */
+#ifdef UNIT8
+#include "UNITSIZE cannot be smaller than MULTUNITSIZE"
+#endif	/* UNIT8 */
+#endif	/* MUNIT32 */
+#endif	/* MUNIT8 */
+
+#define MULTUNIT_msb    ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */
+#define DMULTUNIT_msb   (1L << (2*MULTUNITSIZE-1))
+#define MULTUNIT_mask   ((MULTUNIT)((1L << MULTUNITSIZE)-1))
+#define MULTUNITs_perunit   (UNITSIZE/MULTUNITSIZE)
+
+
+void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier);
+void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
+
 short global_precision = 0; /* units of precision for all routines */
 /*	global_precision is the unit precision last set by set_precision.
 	Initially, set_precision() should be called to define global_precision
@@ -71,297 +127,279 @@ short global_precision = 0; /* units of
 	i.e.:   set_precision(MAX_UNIT_PRECISION);
 */
 
-#ifdef PORTABLE
 /*************** multiprecision library primitives ****************/
 /*	The following portable C primitives should be recoded in assembly.
-	The equivalent assembly primitives are externally defined under
-	different names, unless PORTABLE is defined.  See the header file
-	"mpilib.h" for further details.
-
-	The carry bit mechanism we use for these portable primitives
-	will only work if the size of unit is smaller than the size of
-	a long integer.  In most cases, this means UNITSIZE must be
-	less than 32 bits -- if you use the C portable primitives.
+	The entry point name should be defined, in "mpilib.h" to the external
+	entry point name.  If undefined, the C version will be used.
 */
 
 typedef unsigned long int ulint;
-#define carrybit ((ulint) 1 << UNITSIZE)
-/* ...assumes sizeof(unit) < sizeof(unsigned long) */
 
+#ifndef mp_addc
 boolean mp_addc
 	(register unitptr r1,register unitptr r2,register boolean carry)
 	/* multiprecision add with carry r2 to r1, result in r1 */
 	/* carry is incoming carry flag-- value should be 0 or 1 */
-{	register ulint x;	/* won't work if sizeof(unit)==sizeof(long) */
+{
+	register unit x;
 	short precision;	/* number of units to add */
 	precision = global_precision;
 	make_lsbptr(r1,precision);
 	make_lsbptr(r2,precision);
-	while (precision--)	
-	{	x = (ulint) *r1 + (ulint) *post_higherunit(r2) + (ulint) carry;
+	while (precision--) {
+		if (carry) {
+			x = *r1 + *r2 + 1;
+			carry = (*r2  >= (unit)(~ *r1));
+		} else {
+			x = *r1 + *r2;
+			carry = (x < *r1) ;
+		}
+		post_higherunit(r2);
 		*post_higherunit(r1) = x;
-		carry = ((x & carrybit) != 0L);
 	}
-	return(carry);		/* return the final carry flag bit */
-}	/* mp_addc */
-
+	return carry;		/* return the final carry flag bit */
+} /* mp_addc */
+#endif  /* mp_addc */
 
+#ifndef mp_subb
 boolean mp_subb
 	(register unitptr r1,register unitptr r2,register boolean borrow)
 	/* multiprecision subtract with borrow, r2 from r1, result in r1 */
 	/* borrow is incoming borrow flag-- value should be 0 or 1 */
-{	register ulint x;	/* won't work if sizeof(unit)==sizeof(long) */
+{
+	register unit x;
 	short precision;	/* number of units to subtract */
 	precision = global_precision;
 	make_lsbptr(r1,precision);
 	make_lsbptr(r2,precision);
-	while (precision--)	
-	{	x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
+	while (precision--) {
+		if (borrow) {
+			x = *r1 - *r2 - 1;
+			borrow = (*r1 <= *r2);
+		} else {
+			x = *r1 - *r2;
+			borrow = (*r1 < *r2);
+		}
+		post_higherunit(r2);
 		*post_higherunit(r1) = x;
-		borrow = ((x & carrybit) != 0L);
 	}
-	return(borrow);		/* return the final carry/borrow flag bit */
-}	/* mp_subb */
-
-#undef carrybit
-
+	return borrow;	/* return the final carry/borrow flag bit */
+} /* mp_subb */
+#endif  /* mp_subb */
 
+#ifndef mp_rotate_left
 boolean mp_rotate_left(register unitptr r1,register boolean carry)
 	/* multiprecision rotate left 1 bit with carry, result in r1. */
 	/* carry is incoming carry flag-- value should be 0 or 1 */
-{	register short precision;	/* number of units to rotate */
-	register boolean nextcarry;
+{
+	register int precision;	/* number of units to rotate */
+	unsigned int mcarry = carry, nextcarry; /* int is supposed to be
+						 * the efficient size for ops*/
 	precision = global_precision;
 	make_lsbptr(r1,precision);
-	while (precision--)	
-	{	
+	while (precision--) {
 		nextcarry = (((signedunit) *r1) < 0);
-		/* nextcarry = ((*r1 & uppermostbit) != 0); */
-		*r1 <<= 1 ;
-		if (carry) *r1 |= 1;
-		carry = nextcarry;
+		*r1 = (*r1 << 1) | mcarry;
+		mcarry = nextcarry;
 		pre_higherunit(r1);
 	}
-	return(nextcarry);	/* return the final carry flag bit */
-}	/* mp_rotate_left */
+	return nextcarry;	/* return the final carry flag bit */
+} /* mp_rotate_left */
+#endif  /* mp_rotate_left */
 
 /************** end of primitives that should be in assembly *************/
-#endif	/* PORTABLE */
 
 
-/* 	The mp_shift_right_bits function is not called in any time-critical
-	situations in public-key cryptographic functions, so it doesn't 
+/*	The mp_shift_right_bits function is not called in any time-critical
+	situations in public-key cryptographic functions, so it doesn't
 	need to be coded in assembly language.
 */
 void mp_shift_right_bits(register unitptr r1,register short bits)
-	/*	multiprecision shift right bits, result in r1. 
-		bits is how many bits to shift, must be < UNITSIZE.
+	/*	multiprecision shift right bits, result in r1.
+		bits is how many bits to shift, must be <= UNITSIZE.
 	*/
-{	register short precision;	/* number of units to shift */
+{
+	register short precision;	/* number of units to shift */
 	register unit carry,nextcarry,bitmask;
 	register short unbits;
 	if (bits==0) return;	/* shift zero bits is a no-op */
 	carry = 0;
 	bitmask = power_of_2(bits)-1;
-	unbits = UNITSIZE-bits;		/* shift bits must be < UNITSIZE */
+	unbits = UNITSIZE-bits;		/* shift bits must be <= UNITSIZE */
 	precision = global_precision;
 	make_msbptr(r1,precision);
-	while (precision--)	
-	{	nextcarry = *r1 & bitmask;
-		*r1 >>= bits;
-		*r1 |= carry << unbits;
-		carry = nextcarry;
-		pre_lowerunit(r1);
+	if (bits == UNITSIZE) {
+		while (precision--)  {
+			nextcarry = *r1;
+			*r1 = carry;
+			carry = nextcarry;
+			pre_lowerunit(r1);
+		}
+	} else {
+		while (precision--) {
+			nextcarry = *r1 & bitmask;
+			*r1 >>= bits;
+			*r1 |= carry << unbits;
+			carry = nextcarry;
+			pre_lowerunit(r1);
+		}
 	}
-}	/* mp_shift_right_bits */
+} /* mp_shift_right_bits */
 
 
-#ifndef VMS
-
+#ifndef mp_compare
 short mp_compare(register unitptr r1,register unitptr r2)
-/*	Compares multiprecision integers *r1, *r2, and returns:	
-	-1 iff *r1 < *r2
-	 0 iff *r1 == *r2
-	+1 iff *r1 > *r2
-*/
-{	register short precision;	/* number of units to compare */
+/*
+ * Compares multiprecision integers *r1, *r2, and returns:
+ * -1 iff *r1 < *r2
+ *  0 iff *r1 == *r2
+ * +1 iff *r1 > *r2
+ */
+{
+	register short precision;	/* number of units to compare */
 
 	precision = global_precision;
 	make_msbptr(r1,precision);
 	make_msbptr(r2,precision);
-	do	
-	{	if (*r1 < *r2) 
-			return(-1);
-		if (*post_lowerunit(r1) > *post_lowerunit(r2)) 
-			return(1);
-	} while (--precision);	
-	return(0);	/*  *r1 == *r2  */
-}	/* mp_compare */
+	do {
+		if (*r1 < *r2)
+			return -1;
+		if (*post_lowerunit(r1) > *post_lowerunit(r2))
+			return 1;
+	} while (--precision);
+	return 0;	/*  *r1 == *r2  */
+} /* mp_compare */
+#endif /* mp_compare */
 
-#endif /* VMS */
 
 boolean mp_inc(register unitptr r)
-	/* Increment multiprecision integer r. */
-{	register short precision;
+/* Increment multiprecision integer r. */
+{
+	register short precision;
 	precision = global_precision;
 	make_lsbptr(r,precision);
-	do	
-	{	if ( ++(*r) ) return(0);	/* no carry */
-		post_higherunit(r);	
+	do {
+		if ( ++(*r) ) return 0;	/* no carry */
+		post_higherunit(r);
 	} while (--precision);
-	return(1);		/* return carry set */
-}	/* mp_inc */
+	return 1;		/* return carry set */
+} /* mp_inc */
 
 
 boolean mp_dec(register unitptr r)
-	/* Decrement multiprecision integer r. */
-{	register short precision;
+/* Decrement multiprecision integer r. */
+{
+	register short precision;
 	precision = global_precision;
 	make_lsbptr(r,precision);
-	do	
-	{	if ( (signedunit) (--(*r)) != (signedunit) -1 )
-			return(0);	/* no borrow */
-		post_higherunit(r);	
+	do {
+		if ( (signedunit) (--(*r)) != (signedunit) -1 )
+			return 0;	/* no borrow */
+		post_higherunit(r);
 	} while (--precision);
-	return(1);		/* return borrow set */
-}	/* mp_dec */
+	return 1;		/* return borrow set */
+} /* mp_dec */
 
 
 void mp_neg(register unitptr r)
 	/* Compute 2's complement, the arithmetic negative, of r */
-{	register short precision;	/* number of units to negate */
+{
+	register short precision;	/* number of units to negate */
 	precision = global_precision;
 	mp_dec(r);	/* 2's complement is 1's complement plus 1 */
-	do	/* now do 1's complement */
-	{	*r = ~(*r);
-		r++;	
+	do {	/* now do 1's complement */
+		*r = ~(*r);
+		r++;
 	} while (--precision);
-}	/* mp_neg */
-
-#ifndef VAXC /* Replaced by a macro under VAX C */
+} /* mp_neg */
 
+#ifndef mp_move
 void mp_move(register unitptr dst,register unitptr src)
-{	register short precision;	/* number of units to move */
+{
+	register short precision;	/* number of units to move */
 	precision = global_precision;
 	do { *dst++ = *src++; } while (--precision);
-}	/* mp_move */
+} /* mp_move */
+#endif /* mp_move */
 
-#endif /* VAXC */
 void mp_init(register unitptr r, word16 value)
 	/* Init multiprecision register r with short value. */
 {	/* Note that mp_init doesn't extend sign bit for >32767 */
-	register short precision;	/* number of units to init */
-	precision = global_precision;
-	make_lsbptr(r,precision);
-	*post_higherunit(r) = value; precision--;
+
+	unitfill0( r, global_precision);
+	make_lsbptr(r,global_precision);
+	*post_higherunit(r) = value;
 #ifdef UNIT8
-	*post_higherunit(r) = value >> UNITSIZE; precision--;
+	*post_higherunit(r) = value >> UNITSIZE;
 #endif
-#ifdef VAXC
-
-	unitfill0( r, precision); /* Use character insts. on a VAX */
-
-#else /* VAXC */
-
-	while (precision--)
-		*post_higherunit(r) = 0;
-
-#endif /* VAXC */
-
-}	/* mp_init */
+} /* mp_init */
 
 
 short significance(register unitptr r)
 	/* Returns number of significant units in r */
-{	register short precision;
+{
+	register short precision;
 	precision = global_precision;
 	make_msbptr(r,precision);
-	do	
-	{	if (*post_lowerunit(r)) 
-			return(precision);
+	do {
+		if (*post_lowerunit(r))
+			return precision;
 	} while (--precision);
-	return(precision);
-}	/* significance */
-
+	return precision;
+} /* significance */
 
-#ifndef VAXC	/* Replaced by a macro under VAX C */
 
+#ifndef unitfill0
 void unitfill0(unitptr r,word16 unitcount)
 	/* Zero-fill the unit buffer r. */
-{	while (unitcount--) *r++ = 0;
-}	/* unitfill0 */
-
-#endif /* VAXC */
-
-/* The macro normalize(r,precision) "normalizes" a multiprecision integer 
-   by adjusting r and precision to new values.  For Motorola-style processors 
-   (MSB-first), r is a pointer to the MSB of the register, and must
-   be adjusted to point to the first nonzero unit.  For Intel/VAX-style
-   (LSB-first) processors, r is a  pointer to the LSB of the register,
-   and must be left unchanged.  The precision counter is always adjusted,
-   regardless of processor type.  In the case of precision = 0,
-   r becomes undefined.
-*/
-
-
-/* The macro rescale(r,current_precision,new_precision) rescales
-   a multiprecision integer by adjusting r and its precision to new values.  
-   It can be used to reverse the effects of the normalize 
-   routine given above.  See the comments in normalize concerning 
-   Intel vs. Motorola LSB/MSB conventions.
-   WARNING:  You can only safely call rescale on registers that 
-   you have previously normalized with the above normalize routine, 
-   or are known to be big enough for the new precision.  You may
-   specify a new precision that is smaller than the current precision.
-   You must be careful not to specify a new_precision value that is 
-   too big, or which adjusts the r pointer out of range.
-*/
-
-
-/*	The "bit sniffer" macros all begin sniffing at the MSB.
-	They are used internally by all the various multiply, divide, 
-	modulo, exponentiation, and square root functions.
-*/
+{
+	while (unitcount--) *r++ = 0;
+} /* unitfill0 */
+#endif  /* unitfill0 */
 
 
 int mp_udiv(register unitptr remainder,register unitptr quotient,
 	register unitptr dividend,register unitptr divisor)
 	/* Unsigned divide, treats both operands as positive. */
-{	int bits;
+{
+	int bits;
 	short dprec;
 	register unit bitmask;
+
 	if (testeq(divisor,0))
-		return(-1);	/* zero divisor means divide error */
+		return -1;	/* zero divisor means divide error */
 	mp_init0(remainder);
 	mp_init0(quotient);
 	/* normalize and compute number of bits in dividend first */
 	init_bitsniffer(dividend,bitmask,dprec,bits);
 	/* rescale quotient to same precision (dprec) as dividend */
 	rescale(quotient,global_precision,dprec);
-	make_msbptr(quotient,dprec); 
+	make_msbptr(quotient,dprec);
 
-	while (bits--)
-	{	mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
-		if (mp_compare(remainder,divisor) >= 0)
-		{	mp_sub(remainder,divisor);
+	while (bits--) {
+		mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
+		if (mp_compare(remainder,divisor) >= 0) {
+			mp_sub(remainder,divisor);
 			stuff_bit(quotient,bitmask);
 		}
-		bump_2bitsniffers(dividend,quotient,bitmask); 
+		bump_2bitsniffers(dividend,quotient,bitmask);
 	}
-	return(0);
+	return 0;
 } /* mp_udiv */
 
 
+#ifdef UPTON_OR_SMITH
 #define RECIPMARGIN 0	/* extra margin bits used by mp_recip() */
 
 int mp_recip(register unitptr quotient,register unitptr divisor)
 	/* Compute reciprocal (quotient) as 1/divisor.  Used by faster modmult. */
-{	int bits;
+{
+	int bits;
 	short qprec;
 	register unit bitmask;
 	unit remainder[MAX_UNIT_PRECISION];
 	if (testeq(divisor,0))
-		return(-1);	/* zero divisor means divide error */
+		return -1;	/* zero divisor means divide error */
 	mp_init0(remainder);
 	mp_init0(quotient);
 
@@ -372,120 +410,126 @@ int mp_recip(register unitptr quotient,r
 	mp_setbit(remainder,(bits-RECIPMARGIN)-1);
 	/* rescale quotient to precision of divisor + RECIPMARGIN bits */
 	rescale(quotient,global_precision,qprec);
-	make_msbptr(quotient,qprec); 
+	make_msbptr(quotient,qprec);
 
-	while (bits--)
-	{	mp_shift_left(remainder);
-		if (mp_compare(remainder,divisor) >= 0)
-		{	mp_sub(remainder,divisor);
+	while (bits--) {
+		mp_shift_left(remainder);
+		if (mp_compare(remainder,divisor) >= 0) {
+			mp_sub(remainder,divisor);
 			stuff_bit(quotient,bitmask);
 		}
 		bump_bitsniffer(quotient,bitmask);
 	}
 	mp_init0(remainder);	/* burn sensitive data left on stack */
-	return(0);
+	return 0;
 } /* mp_recip */
+#endif	/* UPTON_OR_SMITH */
 
 
 int mp_div(register unitptr remainder,register unitptr quotient,
 	register unitptr dividend,register unitptr divisor)
 	/* Signed divide, either or both operands may be negative. */
-{	boolean dvdsign,dsign;
+{
+	boolean dvdsign,dsign;
 	int status;
-	dvdsign = (mp_tstminus(dividend)!=0);
-	dsign = (mp_tstminus(divisor)!=0);
+	dvdsign = (boolean)(mp_tstminus(dividend)!=0);
+	dsign = (boolean)(mp_tstminus(divisor)!=0);
 	if (dvdsign) mp_neg(dividend);
 	if (dsign) mp_neg(divisor);
 	status = mp_udiv(remainder,quotient,dividend,divisor);
 	if (dvdsign) mp_neg(dividend);		/* repair caller's dividend */
 	if (dsign) mp_neg(divisor);		/* repair caller's divisor */
-	if (status<0) return(status);		/* divide error? */
+	if (status<0) return status;		/* divide error? */
 	if (dvdsign) mp_neg(remainder);
 	if (dvdsign ^ dsign) mp_neg(quotient);
-	return(status);
+	return status;
 } /* mp_div */
 
 
 word16 mp_shortdiv(register unitptr quotient,
 	register unitptr dividend,register word16 divisor)
-/*	This function does a fast divide and mod on a multiprecision dividend
-	using a short integer divisor returning a short integer remainder.
-	This is an unsigned divide.  It treats both operands as positive.
-	It is used mainly for faster printing of large numbers in base 10. 
-*/
-{	int bits;
+/*
+ * This function does a fast divide and mod on a multiprecision dividend
+ * using a short integer divisor returning a short integer remainder.
+ * This is an unsigned divide.  It treats both operands as positive.
+ * It is used mainly for faster printing of large numbers in base 10.
+ */
+{
+	int bits;
 	short dprec;
 	register unit bitmask;
 	register word16 remainder;
 	if (!divisor)	/* if divisor == 0 */
-		return(-1);	/* zero divisor means divide error */
+		return -1;	/* zero divisor means divide error */
 	remainder=0;
 	mp_init0(quotient);
 	/* normalize and compute number of bits in dividend first */
 	init_bitsniffer(dividend,bitmask,dprec,bits);
 	/* rescale quotient to same precision (dprec) as dividend */
 	rescale(quotient,global_precision,dprec);
-	make_msbptr(quotient,dprec); 
+	make_msbptr(quotient,dprec);
 
-	while (bits--)
-	{	remainder <<= 1;
+	while (bits--) {
+		remainder <<= 1;
 		if (sniff_bit(dividend,bitmask))
 			remainder++;
-		if (remainder >= divisor)
-		{	remainder -= divisor;
+		if (remainder >= divisor) {
+			remainder -= divisor;
 			stuff_bit(quotient,bitmask);
 		}
-		bump_2bitsniffers(dividend,quotient,bitmask); 
-	}	
-	return(remainder);
+		bump_2bitsniffers(dividend,quotient,bitmask);
+	}
+	return remainder;
 } /* mp_shortdiv */
 
 
 int mp_mod(register unitptr remainder,
 	register unitptr dividend,register unitptr divisor)
-	/* Unsigned divide, treats both operands as positive. */
-{	int bits;
+/* Unsigned divide, treats both operands as positive. */
+{
+	int bits;
 	short dprec;
 	register unit bitmask;
 	if (testeq(divisor,0))
-		return(-1);	/* zero divisor means divide error */
+		return -1;	/* zero divisor means divide error */
 	mp_init0(remainder);
 	/* normalize and compute number of bits in dividend first */
 	init_bitsniffer(dividend,bitmask,dprec,bits);
 
-	while (bits--)
-	{	mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
+	while (bits--) {
+		mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
 		msub(remainder,divisor);
 		bump_bitsniffer(dividend,bitmask);
-	}	
-	return(0);
+	}
+	return 0;
 } /* mp_mod */
 
 
 word16 mp_shortmod(register unitptr dividend,register word16 divisor)
-/*	This function does a fast mod operation on a multprecision dividend
-	using a short integer modulus returning a short integer remainder.
-	This is an unsigned divide.  It treats both operands as positive.
-	It is used mainly for fast sieve searches for large primes. 
-*/
+/*
+ * This function does a fast mod operation on a multiprecision dividend
+ * using a short integer modulus returning a short integer remainder.
+ * This is an unsigned divide.  It treats both operands as positive.
+ * It is used mainly for fast sieve searches for large primes.
+ */
 {	int bits;
 	short dprec;
 	register unit bitmask;
 	register word16 remainder;
 	if (!divisor)	/* if divisor == 0 */
-		return(-1);	/* zero divisor means divide error */
+		return -1;	/* zero divisor means divide error */
 	remainder=0;
 	/* normalize and compute number of bits in dividend first */
 	init_bitsniffer(dividend,bitmask,dprec,bits);
 
-	while (bits--)
-	{	remainder <<= 1;
+	while (bits--) {
+		remainder <<= 1;
 		if (sniff_bit(dividend,bitmask))
 			remainder++;
 		if (remainder >= divisor) remainder -= divisor;
 		bump_bitsniffer(dividend,bitmask);
-	}	
-	return(remainder);
+	}
+	return remainder;
 } /* mp_shortmod */
 
 
@@ -495,59 +539,67 @@ word16 mp_shortmod(register unitptr divi
 
 int mp_mult(register unitptr prod,
 	register unitptr multiplicand, register unitptr multiplier)
-	/*	Computes multiprecision prod = multiplicand * multiplier */
-{	/*	Uses interleaved comb multiply algorithm.
-		This improved multiply more than twice as fast as a Russian 
-		peasant multiply, because it does a lot fewer shifts. 
-		Must have global_precision set to the size of the multiplicand 
-		plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum 
-		of the lengths of the multiplier and multiplicand.
-
-		BUG ALERT:  Unfortunately, this code has a bug.  It fails for 
-		some numbers.  One such example:
-		x=   59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5 
-		x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D 
-             52C8 CDC7 AFB3 61C8 243C 741B
-		--which is obviously wrong.  The answer should be:
-		x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A 
-             C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9 
-		We'll have to fix this some day.  In the meantime, we'll 
-		just have the compiler skip over this code. 
-	*/
+/*
+ * Computes multiprecision prod = multiplicand * multiplier
+ *
+ * Uses interleaved comb multiply algorithm.
+ * This improved multiply more than twice as fast as a Russian
+ * peasant multiply, because it does a lot fewer shifts.
+ * Must have global_precision set to the size of the multiplicand
+ * plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum
+ * of the lengths of the multiplier and multiplicand.
+ *
+ * BUG ALERT:  Unfortunately, this code has a bug.  It fails for
+ * some numbers.  One such example:
+ * x=   59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
+ * x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
+ *      52C8 CDC7 AFB3 61C8 243C 741B
+ * --which is obviously wrong.  The answer should be:
+ * x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
+ *      C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
+ * We'll have to fix this some day.  In the meantime, we'll
+ * just have the compiler skip over this code.
+ *
+ * BUG NOTE: Possibly fixed.  Needs testing.
+ */
+{
 	int bits;
 	register unit bitmask;
 	unitptr product, mplier, temp;
 	short mprec,mprec2;
 	unit mplicand[MAX_UNIT_PRECISION];
-	mp_init(prod,0);		/* better clear full width--double precision */
+
+	/* better clear full width--double precision */
+	mp_init(prod+tohigher(global_precision),0);
+
 	if (testeq(multiplicand,0))
-		return(0);	/* zero multiplicand means zero product */
+		return 0;	/* zero multiplicand means zero product */
 
 	mp_move(mplicand,multiplicand);	/* save it from damage */
 
 	normalize(multiplier,mprec);
 	if (!mprec)
-		return(0);
+		return 0;
 
 	make_lsbptr(multiplier,mprec);
 	bitmask = 1;	/* start scan at LSB of multiplier */
 
-	do	/* UNITSIZE times */
-	{	/* do for bits 0-15 */
+	do {	/* UNITSIZE times */
+		/* do for bits 0-15 */
 		product = prod;
 		mplier = multiplier;
 		mprec2 = mprec;
-		while (mprec2--)	/* do for each word in multiplier */
-		{
-			if (sniff_bit(mplier,bitmask))
-			{	if (mp_addc(product,multiplicand,0))	/* ripple carry */
+		while (mprec2--) {	/* do for each word in multiplier */
+		
+			if (sniff_bit(mplier,bitmask)) {
+				if (mp_addc(product,multiplicand,0))	/* ripple carry */
 				{	/* After 1st time thru, this is rarely encountered. */
 					temp = msbptr(product,global_precision);
 					pre_higherunit(temp);
 					/* temp now points to LSU of carry region. */
 					unmake_lsbptr(temp,global_precision);
 					mp_inc(temp);
-				}	/* ripple carry */
+				} /* ripple carry */
 			}
 			pre_higherunit(mplier);
 			pre_higherunit(product);
@@ -560,62 +612,67 @@ int mp_mult(register unitptr prod,
 
 	mp_move(multiplicand,mplicand);	/* recover */
 
-	return(0);	/* normal return */	
-}	/* mp_mult */
+	return 0;	/* normal return */
+} /* mp_mult */
 
 #endif	/* COMB_MULT */
 
 
 /*	Because the "comb" multiply has a bug, we will use the slower
-	Russian peasant multiply instead.  Fortunately, the mp_mult 
+	Russian peasant multiply instead.  Fortunately, the mp_mult
 	function is not called from any time-critical code.
 */
 
 int mp_mult(register unitptr prod,
 	register unitptr multiplicand,register unitptr multiplier)
-	/* Computes multiprecision prod = multiplicand * multiplier */
-{	/* Uses "Russian peasant" multiply algorithm. */
+/*
+ * Computes multiprecision prod = multiplicand * multiplier
+ * Uses "Russian peasant" multiply algorithm.
+ */
+{
 	int bits;
 	register unit bitmask;
 	short mprec;
 	mp_init(prod,0);
 	if (testeq(multiplicand,0))
-		return(0);	/* zero multiplicand means zero product */
+		return 0;	/* zero multiplicand means zero product */
 	/* normalize and compute number of bits in multiplier first */
 	init_bitsniffer(multiplier,bitmask,mprec,bits);
 
-	while (bits--)
-	{	mp_shift_left(prod);
+	while (bits--) {
+		mp_shift_left(prod);
 		if (sniff_bit(multiplier,bitmask))
 			mp_add(prod,multiplicand);
 		bump_bitsniffer(multiplier,bitmask);
 	}
-	return(0);	
-}	/* mp_mult */
+	return 0;
+} /* mp_mult */
 
 
 
-/*	mp_modmult computes a multiprecision multiply combined with a 
-	modulo operation.  This is the most time-critical function in
-	this multiprecision arithmetic library for performing modulo
-	exponentiation.  We experimented with different versions of modmult,
-	depending on the machine architecture and performance requirements.
-	We will either use a Russian Peasant modmult (peasant_modmult), 
-	Charlie Merritt's modmult (merritt_modmult) or Jimmy Upton's
-	modmult (upton_modmult).  On machines with a hardware atomic 
-	multiply instruction, Upton's modmult is fastest, which depends
-	on an assembly subroutine to speed up the hardware multiply logic.
-	If the machine lacks a fast hardware multiply, Merritt's modmult
-	is preferred, which doesn't call any assembly multiply routine.
-	We use the alias names mp_modmult, stage_modulus, and modmult_burn 
-	for the corresponding true names, which depend on what flavor of 
-	modmult we are using.
-
-	Before making the first call to mp_modmult, you must set up the 
-	modulus-dependant precomputated tables by calling stage_modulus.
-	After making all the calls to mp_modmult, you call modmult_burn to 
-	erase the tables created by stage_modulus that were left in memory.
-*/
+/*
+ * mp_modmult computes a multiprecision multiply combined with a
+ * modulo operation.  This is the most time-critical function in
+ * this multiprecision arithmetic library for performing modulo
+ * exponentiation.  We experimented with different versions of modmult,
+ * depending on the machine architecture and performance requirements.
+ * We will either use a Russian Peasant modmult (peasant_modmult), 
+ * Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
+ * modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
+ * On machines with a hardware atomic multiply instruction,
+ * Smith's modmult is fastest.  It can utilize assembly subroutines to
+ * speed up the hardware multiply logic and trial quotient calculation.
+ * If the machine lacks a fast hardware multiply, Merritt's modmult
+ * is preferred, which doesn't call any assembly multiply routine.
+ * We use the alias names mp_modmult, stage_modulus, and modmult_burn
+ * for the corresponding true names, which depend on what flavor of
+ * modmult we are using.
+ * 
+ * Before making the first call to mp_modmult, you must set up the
+ * modulus-dependant precomputated tables by calling stage_modulus.
+ * After making all the calls to mp_modmult, you call modmult_burn to
+ * erase the tables created by stage_modulus that were left in memory.
+ */
 
 #ifdef COUNTMULTS
 /* "number of modmults" counters, used for performance studies. */
@@ -627,59 +684,64 @@ static unsigned int tally_modsquares = 0
 #ifdef PEASANT
 /* Conventional Russian peasant multiply with modulo algorithm. */
 
-static unitptr modulus = 0;	/* used only by mp_modmult */
+static unitptr pmodulus = 0;	/* used only by mp_modmult */
 
 int stage_peasant_modulus(unitptr n)
-/*	Must pass modulus to stage_modulus before calling modmult.
-	Assumes that global_precision has already been adjusted to the
-	size of the modulus, plus SLOP_BITS.
-*/
+/*
+ * Must pass modulus to stage_modulus before calling modmult.
+ * Assumes that global_precision has already been adjusted to the
+ * size of the modulus, plus SLOP_BITS.
+ */
 {	/* For this simple version of modmult, just copy unit pointer. */
-	modulus = n;
-	return(0);	/* normal return */
-}	/* stage_peasant_modulus */
+	pmodulus = n;
+	return 0;	/* normal return */
+} /* stage_peasant_modulus */
 
 
 int peasant_modmult(register unitptr prod,
 	unitptr multiplicand,register unitptr multiplier)
-{	/*	"Russian peasant" multiply algorithm, combined with a modulo 
-		operation.  This is a simple naive replacement for Merritt's 
-		faster modmult algorithm.  References global unitptr "modulus".
-		Computes:  prod = (multiplicand*multiplier) mod modulus
-		WARNING: All the arguments must be less than the modulus!
-	*/
+/* "Russian peasant" multiply algorithm, combined with a modulo
+ * operation.  This is a simple naive replacement for Merritt's
+ * faster modmult algorithm.  References global unitptr "modulus".
+ * Computes:  prod = (multiplicand*multiplier) mod modulus
+ * WARNING: All the arguments must be less than the modulus!
+ */
+{
 	int bits;
 	register unit bitmask;
 	short mprec;
 	mp_init(prod,0);
 /*	if (testeq(multiplicand,0))
-		return(0); */	/* zero multiplicand means zero product */
+		return 0; */	/* zero multiplicand means zero product */
 	/* normalize and compute number of bits in multiplier first */
 	init_bitsniffer(multiplier,bitmask,mprec,bits);
 
-	while (bits--)
-	{	mp_shift_left(prod);
-		msub(prod,modulus);	/* turns mult into modmult */
-		if (sniff_bit(multiplier,bitmask))
-		{	mp_add(prod,multiplicand);
-			msub(prod,modulus);	/* turns mult into modmult */
+	while (bits--) {
+		mp_shift_left(prod);
+		msub(prod,pmodulus);	/* turns mult into modmult */
+		if (sniff_bit(multiplier,bitmask)) {
+			mp_add(prod,multiplicand);
+			msub(prod,pmodulus);	/* turns mult into modmult */
 		}
 		bump_bitsniffer(multiplier,bitmask);
 	}
-	return(0);
-}	/* peasant_modmult */
+	return 0;
+} /* peasant_modmult */
 
 
-/*	If we are using a version of mp_modmult that uses internal tables 
-	in memory, we have to call modmult_burn() at the end of mp_modexp.
-	This is so that no sensitive data is left in memory after the program 
-	exits.  The Russian peasant method doesn't use any such tables.
-*/
-static void peasant_burn(void)
-/*	Alias for modmult_burn, called only from mp_modexp().  Destroys
-	internal modmult tables.  This version does nothing because no 
-	tables are used by the Russian peasant modmult. */
-{ }	/* peasant_burn */
+/*
+ * If we are using a version of mp_modmult that uses internal tables
+ * in memory, we have to call modmult_burn() at the end of mp_modexp.
+ * This is so that no sensitive data is left in memory after the program
+ * exits.  The Russian peasant method doesn't use any such tables.
+ */
+void peasant_burn(void)
+/*
+ * Alias for modmult_burn, called only from mp_modexp().  Destroys
+ * internal modmult tables.  This version does nothing because no
+ * tables are used by the Russian peasant modmult.
+ */
+{ } /* peasant_burn */
 
 #endif	/* PEASANT */
 
@@ -689,10 +751,11 @@ static void peasant_burn(void)
 /*
 	This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
 	Also refined by Zhahai Stewart to reduce the number of subtracts.
-	It performs a multiply combined with a modulo operation, without 
+    Modified by Raymond Brand to reduce the number of SLOP_BITS by 1.
+	It performs a multiply combined with a modulo operation, without
 	going into "double precision".  It is faster than the Russian peasant
-	method,	and still works well on machines that lack a fast hardware 
-	multiply instruction.  
+	method, and still works well on machines that lack a fast hardware
+	multiply instruction.
 */
 
 /*	The following support functions, macros, and data structures
@@ -700,7 +763,8 @@ static void peasant_burn(void)
 
 static void mp_lshift_unit(register unitptr r1)
 /*	Shift r1 1 whole unit to the left.  Used only by modmult function. */
-{	register short precision;
+{
+	register short precision;
 	register unitptr r2;
 	precision = global_precision;
 	make_msbptr(r1,precision);
@@ -708,65 +772,74 @@ static void mp_lshift_unit(register unit
 	while (--precision)
 		*post_lowerunit(r1) = *pre_lowerunit(r2);
 	*r1 = 0;
-}	/* mp_lshift_unit */
+} /* mp_lshift_unit */
 
 
 /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
-static unit moduli_buf[UNITSIZE][MAX_UNIT_PRECISION] = {0};
-static unitptr moduli[UNITSIZE+1] = /* contains pointers into moduli_buf */
-{	0
-	,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0], 
-	 &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0], &moduli_buf[ 7][0]
+static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
+static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */
+{	 0,              moduli_buf[ 0], moduli_buf[ 1], moduli_buf[ 2],
+	 moduli_buf[ 3], moduli_buf[ 4], moduli_buf[ 5], moduli_buf[ 6]
 #ifndef UNIT8
-	,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0], 
-	 &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0], &moduli_buf[15][0] 
+	,moduli_buf[ 7] ,moduli_buf[ 8], moduli_buf[ 9], moduli_buf[10],
+	 moduli_buf[11], moduli_buf[12], moduli_buf[13], moduli_buf[14]
 #ifndef UNIT16	/* and not UNIT8 */
-	,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0], 
-	 &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0], 
-	 &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0], 
-	 &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0], &moduli_buf[31][0]
+	,moduli_buf[15] ,moduli_buf[16], moduli_buf[17], moduli_buf[18],
+	 moduli_buf[19], moduli_buf[20], moduli_buf[21], moduli_buf[22],
+	 moduli_buf[23], moduli_buf[24], moduli_buf[25], moduli_buf[26],
+	 moduli_buf[27], moduli_buf[28], moduli_buf[29], moduli_buf[30]
 #endif	/* UNIT16 and UNIT8 not defined */
 #endif	/* UNIT8 not defined */
 };
 
-/*	To optimize msubs, need following 2 unit arrays, each filled
-	with the most significant unit and next-to-most significant unit
-	of the preshifted images of the modulus. */
-static unit msu_moduli[UNITSIZE+1] = {0}; /* most signif. unit */
-static unit nmsu_moduli[UNITSIZE+1] = {0}; /* next-most signif. unit */
-
-/*	mpdbuf contains preshifted images of the multiplicand, mod n.
- 	It is used only by mp_modmult.  It could be staticly declared
-	inside of mp_modmult, but we put it outside mp_modmult so that 
-	it can be wiped clean by modmult_burn(), which is called at the
-	end of mp_modexp.  This is so that no sensitive data is left in 
-	memory after the program exits.
-*/
+/*
+ * To optimize msubs, need following 2 unit arrays, each filled
+ * with the most significant unit and next-to-most significant unit
+ * of the preshifted images of the modulus.
+ */
+static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */
+static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */
+
+/*
+ * mpdbuf contains preshifted images of the multiplicand, mod n.
+ * It is used only by mp_modmult.  It could be staticly declared
+ * inside of mp_modmult, but we put it outside mp_modmult so that
+ * it can be wiped clean by modmult_burn(), which is called at the
+ * end of mp_modexp.  This is so that no sensitive data is left in
+ * memory after the program exits.
+ */
 static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
 
 
 static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
-/*	Computes UNITSIZE images of r, each shifted left 1 more bit.
-	Used only by modmult function. 
-*/
-{	short int i;
+/*
+ * Computes UNITSIZE images of r, each shifted left 1 more bit.
+ * Used only by modmult function.
+ */
+{
+	short int i;
+
 	images[0] = r;	/* no need to move the first image, just copy ptr */
-	for (i=1; i<UNITSIZE; i++)
-	{	mp_move(images[i],images[i-1]);
+	for (i=1; i<UNITSIZE; i++) {
+		mp_move(images[i],images[i-1]);
 		mp_shift_left(images[i]);
+		msub(images[i],moduli[0]);
 	}
-}	/* stage_mp_images */
+} /* stage_mp_images */
 
 
 int stage_merritt_modulus(unitptr n)
-/*	Computes UNITSIZE+1 images of modulus, each shifted left 1 more bit.
-	Before calling mp_modmult, you must first stage the moduli images by
-	calling stage_modulus.  n is the pointer to the modulus.
-	Assumes that global_precision has already been adjusted to the
-	size of the modulus, plus SLOP_BITS.
-*/
-{	short int i;
+/*
+ * Computes UNITSIZE images of modulus, each shifted left 1 more bit.
+ * Before calling mp_modmult, you must first stage the moduli images by
+ * calling stage_modulus.  n is the pointer to the modulus.
+ * Assumes that global_precision has already been adjusted to the
+ * size of the modulus, plus SLOP_BITS.
+ */
+{
+	short int i;
 	unitptr msu;	/* ptr to most significant unit, for faster msubs */
+
 	moduli[0] = n;	/* no need to move the first image, just copy ptr */
 
 	/* used by optimized msubs macro... */
@@ -774,8 +847,8 @@ int stage_merritt_modulus(unitptr n)
 	msu_moduli[0] = *post_lowerunit(msu);
 	nmsu_moduli[0] = *msu;
 
-	for (i=1; i<UNITSIZE+1; i++)
-	{	mp_move(moduli[i],moduli[i-1]);
+	for (i=1; i<UNITSIZE; i++) {
+		mp_move(moduli[i],moduli[i-1]);
 		mp_shift_left(moduli[i]);
 
 		/* used by optimized msubs macro... */
@@ -783,24 +856,25 @@ int stage_merritt_modulus(unitptr n)
 		msu_moduli[i] = *post_lowerunit(msu);	/* for faster msubs */
 		nmsu_moduli[i] = *msu;
 	}
-	return(0);	/* normal return */
-}	/* stage_merritt_modulus */
+	return 0;	/* normal return */
+} /* stage_merritt_modulus */
 
 
 /* The following macros, sniffadd and msubs, are used by modmult... */
 #define sniffadd(i) if (*multiplier & power_of_2(i))  mp_add(prod,mpd[i])
 
 /* Unoptimized msubs macro (msubs0) follows... */
-/* #define msubs0(i) msub(prod,moduli[i]) 
+/* #define msubs0(i) msub(prod,moduli[i])
 */
 
-/*	To optimize msubs, compare the most significant units of the 
-	product and the shifted modulus before deciding to call the full 
-	mp_compare routine.  Better still, compare the upper two units
-	before deciding to call mp_compare.
-	Optimization of msubs relies heavily on standard C left-to-right 
-	parsimonious evaluation of logical expressions. 
-*/
+/*
+ * To optimize msubs, compare the most significant units of the
+ * product and the shifted modulus before deciding to call the full
+ * mp_compare routine.  Better still, compare the upper two units
+ * before deciding to call mp_compare.
+ * Optimization of msubs relies heavily on standard C left-to-right
+ * parsimonious evaluation of logical expressions.
+ */
 
 /* Partially-optimized msubs macro (msubs1) follows... */
 /* #define msubs1(i) if ( \
@@ -818,31 +892,32 @@ int stage_merritt_modulus(unitptr n)
 
 int merritt_modmult(register unitptr prod,
 	unitptr multiplicand,register unitptr multiplier)
-	/*	Performs combined multiply/modulo operation.  
-		Computes:  prod = (multiplicand*multiplier) mod modulus
-		WARNING: All the arguments must be less than the modulus!
-		Assumes the modulus has been predefined by first calling 
-		stage_modulus.
-	*/
+/*
+ * Performs combined multiply/modulo operation.
+ * Computes:  prod = (multiplicand*multiplier) mod modulus
+ * WARNING: All the arguments must be less than the modulus!
+ * Assumes the modulus has been predefined by first calling
+ * stage_modulus.
+ */
 {
 	/* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
 	register signedunit p_m;
 	register unitptr msu_prod;	/* ptr to most significant unit of product */
 	register unitptr nmsu_prod;	/* next-most signif. unit of product */
 	short mprec;		/* precision of multiplier, in units */
-	/*	Array mpd contains a list of pointers to preshifted images of 
+	/*	Array mpd contains a list of pointers to preshifted images of
 		the multiplicand: */
-	static unitptr mpd[UNITSIZE] =
-	{	 0,              &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
-		 &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
+	static unitptr mpd[UNITSIZE] = {
+		 0,          mpdbuf[ 0], mpdbuf[ 1], mpdbuf[ 2],
+		 mpdbuf[ 3], mpdbuf[ 4], mpdbuf[ 5], mpdbuf[ 6]
 #ifndef UNIT8
-		,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
-		 &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
+		,mpdbuf[ 7], mpdbuf[ 8], mpdbuf[ 9], mpdbuf[10],
+		 mpdbuf[11], mpdbuf[12], mpdbuf[13], mpdbuf[14]
 #ifndef UNIT16	/* and not UNIT8 */
-		,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
-		 &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
-		 &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
-		 &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
+		,mpdbuf[15], mpdbuf[16], mpdbuf[17], mpdbuf[18],
+		 mpdbuf[19], mpdbuf[20], mpdbuf[21], mpdbuf[22],
+		 mpdbuf[23], mpdbuf[24], mpdbuf[25], mpdbuf[26],
+		 mpdbuf[27], mpdbuf[28], mpdbuf[29], mpdbuf[30]
 #endif	/* UNIT16 and UNIT8 not defined */
 #endif	/* UNIT8 not defined */
 	};
@@ -855,133 +930,162 @@ int merritt_modmult(register unitptr pro
 	nmsu_prod = msu_prod;
 	post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
 
-	/*	To understand this algorithm, it would be helpful to first 
-		study the conventional Russian peasant modmult algorithm.
-		This one does about the same thing as Russian peasant, but
-		is organized differently to save some steps.  It loops 
-		through the multiplier a word (unit) at a time, instead of 
-		a bit at a time.  It word-shifts the product instead of 
-		bit-shifting it, so it should be faster.  It also does about 
-		half as many subtracts as Russian peasant.
-	*/
+	/*
+	 * To understand this algorithm, it would be helpful to first
+	 * study the conventional Russian peasant modmult algorithm.
+	 * This one does about the same thing as Russian peasant, but
+	 * is organized differently to save some steps.  It loops
+	 * through the multiplier a word (unit) at a time, instead of
+	 * a bit at a time.  It word-shifts the product instead of
+	 * bit-shifting it, so it should be faster.  It also does about
+	 * half as many subtracts as Russian peasant.
+	 */
 
 	mp_init(prod,0);	/* Initialize product to 0. */
 
-	/*	The way mp_modmult is actually used in cryptographic 
-		applications, there will NEVER be a zero multiplier or 
-		multiplicand.  So there is no need to optimize for that 
-		condition.
-	*/
+	/*
+	 * The way mp_modmult is actually used in cryptographic
+	 * applications, there will NEVER be a zero multiplier or
+	 * multiplicand.  So there is no need to optimize for that
+	 * condition.
+	 */
 /*	if (testeq(multiplicand,0))
-		return(0); */	/* zero multiplicand means zero product */
+		return 0; */	/* zero multiplicand means zero product */
 	/* Normalize and compute number of units in multiplier first: */
 	normalize(multiplier,mprec);
 	if (mprec==0)	/* if precision of multiplier is 0 */
-		return(0);	/* zero multiplier means zero product */
+		return 0;	/* zero multiplier means zero product */
 	make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
 
-	while (mprec--)	/* Loop for the number of units in the multiplier */
-	{
-		/*	Shift the product one whole unit to the left.
-			This is part of the multiply phase of modmult. 
-		*/
+	while (mprec--) { /* Loop for the number of units in the multiplier */
+		/*
+		 * Shift the product one whole unit to the left.
+		 * This is part of the multiply phase of modmult.
+		 */
 
 		mp_lshift_unit(prod);
 
-		/*	Sniff each bit in the current unit of the multiplier, 
-			and conditionally add the the corresponding preshifted 
-			image of the multiplicand to the product.
-			This is also part of the multiply phase of modmult.
+		/* 
+		 * The product may have grown by as many as UNITSIZE
+		 * bits.  That's why we have global_precision set to the
+		 * size of the modulus plus UNITSIZE slop bits.
+		 * Now reduce the product back down by conditionally
+		 * subtracting the preshifted images of the modulus.
+		 * This is part of the modulo reduction phase of modmult. 
+		 * The following loop is unrolled for speed, using macros...
+
+		for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--)
+			if (mp_compare(prod,moduli[i]) >= 0)
+				mp_sub(prod,moduli[i]);
+		 */
 
-			The following loop is unrolled for speed, using macros...
+#ifndef UNIT8
+#ifndef UNIT16  /* and not UNIT8 */
+		msubs(31);
+		msubs(30);
+		msubs(29);
+		msubs(28);
+		msubs(27);
+		msubs(26);
+		msubs(25);
+		msubs(24);
+		msubs(23);
+		msubs(22);
+		msubs(21);
+		msubs(20);
+		msubs(19);
+		msubs(18);
+		msubs(17);
+		msubs(16);
+#endif  /* not UNIT16 and not UNIT8 */
+		msubs(15);
+		msubs(14);
+		msubs(13);
+		msubs(12);
+		msubs(11);
+		msubs(10);
+		msubs(9);
+		msubs(8);
+#endif  /* not UNIT8 */
+		msubs(7);
+		msubs(6);
+		msubs(5);
+#ifndef UNIT32
+		msubs(4);
+#ifndef UNIT16
+		msubs(3);
+#endif
+#endif
+
+		/*
+		 * Sniff each bit in the current unit of the multiplier, 
+		 * and conditionally add the corresponding preshifted 
+		 * image of the multiplicand to the product.
+		 * This is also part of the multiply phase of modmult.
+		 *
+		 * The following loop is unrolled for speed, using macros...
 
 		for (i=UNITSIZE-1; i>=0; i--)
-		   if (*multiplier & power_of_2(i)) 
+		   if (*multiplier & power_of_2(i))
 				mp_add(prod,mpd[i]);
-		*/
+		 */
 #ifndef UNIT8
 #ifndef UNIT16	/* and not UNIT8 */
-		sniffadd(31); 
-		sniffadd(30); 
-		sniffadd(29); 
+		sniffadd(31);
+		sniffadd(30);
+		sniffadd(29);
 		sniffadd(28);
-		sniffadd(27); 
-		sniffadd(26); 
-		sniffadd(25); 
+		sniffadd(27);
+		sniffadd(26);
+		sniffadd(25);
 		sniffadd(24);
-		sniffadd(23); 
-		sniffadd(22); 
-		sniffadd(21); 
+		sniffadd(23);
+		sniffadd(22);
+		sniffadd(21);
 		sniffadd(20);
-		sniffadd(19); 
-		sniffadd(18); 
-		sniffadd(17); 
+		sniffadd(19);
+		sniffadd(18);
+		sniffadd(17);
 		sniffadd(16);
 #endif	/* not UNIT16 and not UNIT8 */
-		sniffadd(15); 
-		sniffadd(14); 
-		sniffadd(13); 
+		sniffadd(15);
+		sniffadd(14);
+		sniffadd(13);
 		sniffadd(12);
-		sniffadd(11); 
-		sniffadd(10); 
-		sniffadd(9); 
+		sniffadd(11);
+		sniffadd(10);
+		sniffadd(9);
 		sniffadd(8);
 #endif	/* not UNIT8 */
-		sniffadd(7); 
-		sniffadd(6); 
-		sniffadd(5); 
+		sniffadd(7);
+		sniffadd(6);
+		sniffadd(5);
 		sniffadd(4);
-		sniffadd(3); 
-		sniffadd(2); 
-		sniffadd(1); 
+		sniffadd(3);
+		sniffadd(2);
+		sniffadd(1);
 		sniffadd(0);
 
-		/*	The product may have grown by as many as UNITSIZE+1 
-			bits.  That's why we have global_precision set to the
-			size of the modulus plus UNITSIZE+1 slop bits.  
-			Now reduce the product back down by conditionally 
-			subtracting	the UNITSIZE+1 preshifted images of the 
-			modulus.  This is the modulo reduction phase of modmult.
-
-			The following loop is unrolled for speed, using macros...
+		/*
+		 * The product may have grown by as many as LOG_UNITSIZE+1
+		 * bits.
+		 * Now reduce the product back down by conditionally 
+		 * subtracting LOG_UNITSIZE+1 preshifted images of the 
+		 * modulus.  This is the modulo reduction phase of modmult.
+		 *
+		 * The following loop is unrolled for speed, using macros...
 
-		for (i=UNITSIZE; i>=0; i--)
+		for (i=LOG_UNITSIZE; i>=0; i--)
 			if (mp_compare(prod,moduli[i]) >= 0) 
 				mp_sub(prod,moduli[i]); 
-		*/
+		 */
+
 #ifndef UNIT8
-#ifndef UNIT16	/* and not UNIT8 */
-		msubs(32); 
-		msubs(31); 
-		msubs(30); 
-		msubs(29); 
-		msubs(28);
-		msubs(27); 
-		msubs(26); 
-		msubs(25); 
-		msubs(24);
-		msubs(23); 
-		msubs(22); 
-		msubs(21); 
-		msubs(20);
-		msubs(19); 
-		msubs(18); 
-		msubs(17); 
-#endif	/* not UNIT16 and not UNIT8 */
-		msubs(16);
-		msubs(15); 
-		msubs(14); 
-		msubs(13); 
-		msubs(12);
-		msubs(11); 
-		msubs(10); 
-		msubs(9); 
-#endif	/* not UNIT8 */
-		msubs(8);
-		msubs(7); 
-		msubs(6); 
+#ifndef UNIT16
 		msubs(5); 
+#endif
 		msubs(4);
+#endif
 		msubs(3); 
 		msubs(2); 
 		msubs(1); 
@@ -990,28 +1094,30 @@ int merritt_modmult(register unitptr pro
 		/* Bump pointer to next lower unit of multiplier: */
 		post_lowerunit(multiplier);
 
-	}	/* Loop for the number of units in the multiplier */
+	} /* Loop for the number of units in the multiplier */
 
-	return(0);	/* normal return */
+	return 0;	/* normal return */
 
-}	/* merritt_modmult */
+} /* merritt_modmult */
 
 
 #undef msubs
 #undef sniffadd
 
 
-/*	Merritt's mp_modmult function leaves some internal tables in memory,
-	so we have to call modmult_burn() at the end of mp_modexp.  
-	This is so that no cryptographically sensitive data is left in memory 
-	after the program exits.
-*/
-static void merritt_burn(void)
-/*	Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
-{	unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
-	unitfill0(&(moduli_buf[0][0]),(UNITSIZE)*MAX_UNIT_PRECISION);
-	unitfill0(msu_moduli,UNITSIZE+1);
-	unitfill0(nmsu_moduli,UNITSIZE+1);
+/*
+ * Merritt's mp_modmult function leaves some internal tables in memory,
+ * so we have to call modmult_burn() at the end of mp_modexp.
+ * This is so that no cryptographically sensitive data is left in memory
+ * after the program exits.
+ */
+void merritt_burn(void)
+/* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
+{
+	unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
+	unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
+	unitfill0(msu_moduli,UNITSIZE);
+	unitfill0(nmsu_moduli,UNITSIZE);
 } /* merritt_burn() */
 
 /******* end of Merritt's MODMULT stuff. *******/
@@ -1019,153 +1125,125 @@ static void merritt_burn(void)
 #endif	/* MERRITT */
 
 
-#ifdef UPTON	/* using Jimmy Upton's modmult algorithm */
+#ifdef UPTON_OR_SMITH	/* Used by Upton's and Smith's modmult algorithms */
 
-/*	Jimmy Upton's multiprecision modmult algorithm in C.
-	Performs a multiply combined with a modulo operation.  
+/*
+ * Jimmy Upton's multiprecision modmult algorithm in C.
+ * Performs a multiply combined with a modulo operation.
+ *
+ * The following support functions and data structures
+ * are used only by Upton's modmult algorithm...
+ */
 
-	The following support functions and data structures
-	are used only by Upton's modmult algorithm...
-*/
+short munit_prec;	/* global_precision expressed in MULTUNITs */
 
-/*	The MULTUNIT word is the biggest word size for the native atomic 
-	multiply instruction.  It may or may not be smaller than UNITSIZE. 
-	Many CPU's have 16-by-16-bit multiplies, yielding a 32-bit product.
-	In that case, MULTUINIT should be a 16-bit word, even if the rest of
-	the multiprecision arithmetic functions assume a 32-bit UNIT word.
-*/
-typedef unsigned short MULTUNIT;
-#define	MULTUNITSIZE	(8*sizeof(MULTUNIT))	/* size in bits */
-
-static short unit_prec;	/* global_precision expressed in MULTUNITs */
+/*
+ * Note that since the SPARC CPU has no hardware integer multiply
+ * instruction, there is not that much advantage in having an
+ * assembly version of mp_smul on that machine.  It might be faster
+ * to use Merritt's modmult instead of Upton's modmult on the SPARC.
+ */
 
+/*
+ * Multiply the single-word multiplier times the multiprecision integer
+ * in multiplicand, accumulating result in prod.  The resulting
+ * multiprecision prod will be 1 word longer than the multiplicand.
+ * multiplicand is munit_prec words long.  We add into prod, so caller
+ * should zero it out first.  For best results, this time-critical
+ * function should be implemented in assembly.
+ * NOTE:  Unlike other functions in the multiprecision arithmetic
+ * library, both multiplicand and prod are pointing at the LSB,
+ * regardless of byte order of the machine.  On an 80x86, this makes
+ * no difference.  But if this assembly function is implemented
+ * on a 680x0, it becomes important.
+ */
 
-/*	Define MPORTABLE if there is no assembly version of the mp_smul
-	function available.  An assembly version is much faster.
-	Note that since the SPARC CPU has no hardware integer multiply 
-	instruction, there is not that much advantage in having an 
-	assembly version of mp_smul on that machine.  It might be faster
-	to use Merritt's modmult instead of Upton's modmult on the SPARC.
+/*
+ * Note that this has been modified from the previous version to allow
+ * better support for Smith's modmult:
+ * 	The final carry bit is added to the existing product
+ * 	array, rather than simply stored.
 */
-#ifdef MSDOS	/* we do have an assembly version available on the 8086 */
-#undef MPORTABLE
-#endif
 
-#ifdef MPORTABLE  /* use slow portable C version instead of assembly */
-
-/* 
-	Multiply the single-word multiplier times the multiprecision integer 
-	in multiplicand, accumulating result in prod.  The resulting 
-	multiprecision prod will be 1 word longer than the multiplicand.   
-	multiplicand is unit_prec words long.  We add into prod, so caller 
-	should zero it out first.  For best results, this time-critical 
-	function should be implemented in assembly.
-	NOTE:  Unlike other functions in the multiprecision arithmetic 
-	library, both multiplicand and prod are pointing at the LSB, 
-	regardless of byte order of the machine.  On an 80x86, this makes 
-	no difference.  But if this assembly function is implemented
-	on a 680x0, it becomes important.
-*/
+#ifndef mp_smul
 void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
 {
 	short i;
 	unsigned long p, carry;
 
 	carry = 0;
-	for (i=0; i<unit_prec; ++i)
-	{	p = (unsigned long)multiplier * *post_higherunit(multiplicand);
+	for (i=0; i<munit_prec; ++i) {
+		p = (unsigned long)multiplier * *post_higherunit(multiplicand);
 		p += *prod + carry;
 		*post_higherunit(prod) = (MULTUNIT)p;
 		carry = p >> MULTUNITSIZE;
 	}
-	/* We know that the high-order word of prod will always be 0 */
-	*prod = (MULTUNIT)carry;	/* copy carry to empty high word of prod */
-}	/* mp_smul */
-
-#else	/* not MPORTABLE-- use assembly routine */
-
-#define mp_smul P_SMUL	/* use external assembly routine */
-
-#endif /* MPORTABLE */
-
-#ifdef VMS
-
-#define mp_dmul p_dmul /* use external assembly routine */
-
-void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
-
-#else /* VMS */
+	/* Add carry to the next higher word of product / dividend */
+	*prod += (MULTUNIT)carry;
+} /* mp_smul */
+#endif
 
-/*	mp_dmul is a double-precision multiply multiplicand times multiplier, 
-	result into prod.  prod must be pointing at a "double precision" 
-	buffer.  E.g. If global_precision is 10 words, prod must be 
-	pointing at a 20-word buffer.
-*/
-static void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
+/*
+ * mp_dmul is a double-precision multiply multiplicand times multiplier,
+ * result into prod.  prod must be pointing at a "double precision"
+ * buffer.  E.g. If global_precision is 10 words, prod must be
+ * pointing at a 20-word buffer.
+ */
+#ifndef mp_dmul
+void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
 {
 	register	int i;
 	register	MULTUNIT *p_multiplicand, *p_multiplier;
 	register	MULTUNIT *prodp;
 
+
 	unitfill0(prod,global_precision*2);	/* Pre-zero prod */
 	/* Calculate precision in units of MULTUNIT */
-	unit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
+	munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
 	p_multiplicand = (MULTUNIT *)multiplicand;
 	p_multiplier = (MULTUNIT *)multiplier;
 	prodp = (MULTUNIT *)prod;
-	make_lsbptr(p_multiplicand,unit_prec);
-	make_lsbptr(p_multiplier,unit_prec);
-	make_lsbptr(prodp,unit_prec*2);
+	make_lsbptr(p_multiplicand,munit_prec);
+	make_lsbptr(p_multiplier,munit_prec);
+	make_lsbptr(prodp,munit_prec*2);
 	/* Multiply multiplicand by each word in multiplier, accumulating prod: */
-	for (i=0; i<unit_prec; ++i)
-		mp_smul (post_higherunit(prodp), 
+	for (i=0; i<munit_prec; ++i)
+		mp_smul (post_higherunit(prodp),
 			p_multiplicand, *post_higherunit(p_multiplier));
-}	/* mp_dmul */
+} /* mp_dmul */
+#endif  /* mp_dmul */
 
-#endif /* VMS */
+static unit ALIGN modulus[MAX_UNIT_PRECISION];
+static short nbits;			/* number of modulus significant bits */
+#endif /* UPTON_OR_SMITH */
 
-/*	These scratchpad arrays are used only by upton_modmult (mp_modmult).
- 	Some of them could be staticly declared inside of mp_modmult, but we 
-	put them outside mp_modmult so that they can be wiped clean by 
-	modmult_burn(), which is called at the end of mp_modexp.  This is 
-	so that no sensitive data is left in memory after the program exits.
-*/
+#ifdef UPTON
 
-#ifdef VAXC
 /*
- * Operations on arrays are sped up on a VAX if the data is quadword (64-bit)
- * aligned. This is because many VAXen use a minimum of 64-bit data paths to
- * cache and memory. This technique is VAX C private, hence the conditional.
- */
-static unit _align( quadword ) modulus[MAX_UNIT_PRECISION];
-static unit _align( quadword ) reciprocal[MAX_UNIT_PRECISION];
-static unit _align( quadword ) dhi[MAX_UNIT_PRECISION];
-static unit _align( quadword ) d_data[MAX_UNIT_PRECISION*2];	 /* double-wide register */
-static unit _align( quadword ) e_data[MAX_UNIT_PRECISION*2];	 /* double-wide register */
-static unit _align( quadword ) f_data[MAX_UNIT_PRECISION*2];	 /* double-wide register */
-
-#else /* VAXC */
-
-static unit modulus[MAX_UNIT_PRECISION];
-static unit reciprocal[MAX_UNIT_PRECISION];
-static unit dhi[MAX_UNIT_PRECISION];
-static unit d_data[MAX_UNIT_PRECISION*2];	/* double-wide register */
-static unit e_data[MAX_UNIT_PRECISION*2];	/* double-wide register */
-static unit f_data[MAX_UNIT_PRECISION*2];	/* double-wide register */
+ * These scratchpad arrays are used only by upton_modmult (mp_modmult).
+ * Some of them could be staticly declared inside of mp_modmult, but we
+ * put them outside mp_modmult so that they can be wiped clean by
+ * modmult_burn(), which is called at the end of mp_modexp.  This is
+ * so that no sensitive data is left in memory after the program exits.
+ */
 
-#endif /* VAXC */
+static unit ALIGN reciprocal[MAX_UNIT_PRECISION];
+static unit ALIGN dhi[MAX_UNIT_PRECISION];
+static unit ALIGN d_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
+static unit ALIGN e_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
+static unit ALIGN f_data[MAX_UNIT_PRECISION*2];     /* double-wide register */
 
-static short nbits;
 static short nbitsDivUNITSIZE;
 static short nbitsModUNITSIZE;
 
-/*	stage_upton_modulus() is aliased to stage_modulus().
-	Prepare for an Upton modmult.  Calculate the reciprocal of modulus,
-	and save both.  Note that reciprocal will have one more bit than 
-	modulus.
-	Assumes that global_precision has already been adjusted to the
-	size of the modulus, plus SLOP_BITS.
-*/
+/*
+ * stage_upton_modulus() is aliased to stage_modulus().
+ * Prepare for an Upton modmult.  Calculate the reciprocal of modulus,
+ * and save both.  Note that reciprocal will have one more bit than
+ * modulus.
+ * Assumes that global_precision has already been adjusted to the
+ * size of the modulus, plus SLOP_BITS.
+ */
 int stage_upton_modulus(unitptr n)
 {
 	mp_move(modulus, n);
@@ -1173,29 +1251,29 @@ int stage_upton_modulus(unitptr n)
 	nbits = countbits(modulus);
 	nbitsDivUNITSIZE = nbits / UNITSIZE;
 	nbitsModUNITSIZE = nbits % UNITSIZE;
-	return(0);	/* normal return */
-}	/* stage_upton_modulus */
+	return 0;	/* normal return */
+} /* stage_upton_modulus */
 
 
-/* 	Upton's algorithm performs a multiply combined with a modulo operation.  
-	Computes:  prod = (multiplicand*multiplier) mod modulus
-	WARNING: All the arguments must be less than the modulus!
-	References global unitptr modulus and reciprocal.
-	The reciprocal of modulus is 1 bit longer than the modulus.  
-	upton_modmult() is aliased to mp_modmult().
-*/
+/*
+ * Upton's algorithm performs a multiply combined with a modulo operation.
+ * Computes:  prod = (multiplicand*multiplier) mod modulus
+ * WARNING: All the arguments must be less than the modulus!
+ * References global unitptr modulus and reciprocal.
+ * The reciprocal of modulus is 1 bit longer than the modulus.
+ * upton_modmult() is aliased to mp_modmult().
+ */
 int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
 {
 	unitptr d = d_data;
 	unitptr d1 = d_data;
 	unitptr e = e_data;
 	unitptr f = f_data;
-	short orig_precision, dprec;
-	short i;
+	short orig_precision;
 
 	orig_precision = global_precision;
 	mp_dmul (d,multiplicand,multiplier);
-	
+
 	/* Throw off low nbits of d */
 #ifndef HIGHFIRST
 	d1 = d + nbitsDivUNITSIZE;
@@ -1229,13 +1307,13 @@ int upton_modmult (unitptr prod, unitptr
 		mp_sub (d,modulus);
 
 	mp_move(prod,d);
-	return(0);	/* normal return */
-}	/* upton_modmult */
+	return 0;  /* normal return */
+} /* upton_modmult */
 
 
 /*	Upton's mp_modmult function leaves some internal arrays in memory,
-	so we have to call modmult_burn() at the end of mp_modexp.  
-	This is so that no cryptographically sensitive data is left in memory 
+	so we have to call modmult_burn() at the end of mp_modexp.
+	This is so that no cryptographically sensitive data is left in memory
 	after the program exits.
 	upton_burn() is aliased to modmult_burn().
 */
@@ -1248,18 +1326,407 @@ void upton_burn(void)
 	unitfill0(e_data,MAX_UNIT_PRECISION*2);
 	unitfill0(f_data,MAX_UNIT_PRECISION*2);
 	nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
-}	/* upton_burn */
+} /* upton_burn */
+
+/******* end of Upton's MODMULT stuff. *******/
+/*=========================================================================*/
+#endif  /* UPTON */
+
+#ifdef SMITH	/* using Thad Smith's modmult algorithm */
+
+/*
+ * Thad Smith's implementation of multiprecision modmult algorithm in C.
+ * Performs a multiply combined with a modulo operation.
+ * The multiplication is done with mp_dmul, the same as for Upton's
+ * modmult.  The modulus reduction is done by long division, in
+ * which a trial quotient "digit" is determined, then the product of
+ * that digit and the divisor are subtracted from the dividend.
+ *
+ * In this case, the digit is MULTUNIT in size and the subtraction
+ * is done by adding the product to the one's complement of the
+ * dividend, which allows use of the existing mp_smul routine.
+ *
+ * The following support functions and data structures
+ * are used only by Smith's modmult algorithm...
+ */
+
+/*
+ * These scratchpad arrays are used only by smith_modmult (mp_modmult).
+ * Some of them could be statically declared inside of mp_modmult, but we
+ * put them outside mp_modmult so that they can be wiped clean by
+ * modmult_burn(), which is called at the end of mp_modexp.  This is
+ * so that no sensitive data is left in memory after the program exits.
+ */
+
+static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2];
+
+static unit mod_quotient  [4];
+static unit mod_divisor   [4];	/* 2 most signif. MULTUNITs of modulus */
+
+static MULTUNIT *modmpl;		/* ptr to modulus least significant
+					** MULTUNIT			    */
+static int  mshift;			/* number of bits for
+					** recip scaling	  */
+static MULTUNIT reciph; 		/* MSunit of scaled recip */
+static MULTUNIT recipl; 		/* LSunit of scaled recip */
+
+static short modlenMULTUNITS;		/* length of modulus in MULTUNITs */
+static MULTUNIT mutemp; 		/* temporary */
+
+/*
+ * The routines mp_smul and mp_dmul are the same as for UPTON and
+ * should be coded in assembly.  Note, however, that the previous
+ * Upton's mp_smul version has been modified to compatible with
+ * Smith's modmult.  The new version also still works for Upton's
+ * modmult.
+ */
+
+#ifndef mp_set_recip
+#define mp_set_recip(rh,rl,m)     /* null */
+#else
+/* setup routine for external mp_quo_digit */
+void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m);
+#endif
+MULTUNIT mp_quo_digit (MULTUNIT *dividend);
+
+#ifdef	MULTUNIT_SIZE_SAME
+#define mp_musubb mp_subb	/* use existing routine */
+#else  /* ! MULTUNIT_SIZE_SAME */
+
+/*
+ * This performs the same function as mp_subb, but with MULTUNITs.
+ * Note: Processors without alignment requirements may be able to use
+ * mp_subb, even though MULTUNITs are smaller than units.  In that case,
+ * use mp_subb, since it would be faster if coded in assembly.  Note that
+ * this implementation won't work for MULTUNITs which are long -- use
+ * mp_subb in that case.
+ */
+#ifndef mp_musubb
+boolean mp_musubb
+	(register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow)
+/*
+ * multiprecision subtract of MULTUNITs with borrow, r2 from r1, result in r1
+ * borrow is incoming borrow flag-- value should be 0 or 1
+ */
+{
+	register ulint x;	/* won't work if sizeof(MULTUNIT)==
+						 sizeof(long)	    */
+	short precision;	/* number of MULTUNITs to subtract */
+	precision = global_precision * MULTUNITs_perunit;
+	make_lsbptr(r1,precision);
+	make_lsbptr(r2,precision);
+	while (precision--) {
+		x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
+		*post_higherunit(r1) = x;
+		borrow = (((1L << MULTUNITSIZE) & x) != 0L);
+	}
+	return (borrow);
+} /* mp_musubb */
+#endif	/* mp_musubb */
+#endif	/* MULTUNIT_SIZE_SAME */
+
+/*
+ * The function mp_quo_digit is the heart of Smith's modulo reduction,
+ * which uses a form of long division.  It computes a trial quotient
+ * "digit" (MULTUNIT-sized digit) by multiplying the three most
+ * significant MULTUNITs of the dividend by the two most significant
+ * MULTUNITs of the reciprocal of the modulus.  Note that this function
+ * requires that MULTUNITSIZE * 2 <= sizeof(unsigned long).
+ *
+ * An important part of this technique is that the quotient never be
+ * too small, although it may occasionally be too large.  This was
+ * done to eliminate the need to check and correct for a remainder
+ * exceeding the divisor.	It is easier to check for a negative
+ * remainder.  The following technique rarely needs correction for
+ * MULTUNITs of at least 16 bits.
+ *
+ * The following routine has two implementations:
+ *
+ * ASM_PROTOTYPE defined: written to be an executable prototype for
+ * an efficient assembly language implementation.  Note that several
+ * of the following masks and shifts can be done by directly
+ * manipulating single precision registers on some architectures.
+ *
+ * ASM_PROTOTYPE undefined: a slightly more efficient implementation
+ * in C.  Although this version returns a result larger than the
+ * optimum (which is corrected in smith_modmult) more often than the
+ * prototype version, the faster execution in C more than makes up
+ * for the difference.
+ * 
+ * Parameter: dividend - points to the most significant MULTUNIT
+ * 	of the dividend.  Note that dividend actually contains the 
+ * 	one's complement of the actual dividend value (see comments for 
+ * 	smith_modmult).
+ * 
+ *  Return: the trial quotient digit resulting from dividing the first
+ * 	three MULTUNITs at dividend by the upper two MULTUNITs of the 
+ * 	modulus.
+ */
+
+/* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */
+
+#ifndef mp_quo_digit
+MULTUNIT mp_quo_digit (MULTUNIT *dividend) {
+	unsigned long q, q0, q1, q2;
+	unsigned short lsb_factor;
+
+/*
+ * Compute the least significant product group.
+ *The last terms of q1 and q2 perform upward rounding, which is
+ *needed to guarantee that the result not be too small.
+ */
+	q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph
+	     + reciph;
+	q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl
+	     + (1L << MULTUNITSIZE) ;
+#ifdef ASM_PROTOTYPE
+	lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE);
+	q = q1 + q2;
+
+	/* The following statement is equivalent to shifting the sum right
+	   one bit while shifting in the carry bit.
+	*/
+	q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
+#else 	/* optimized C version */
+	q0 = (q1>>1) + (q2>>1) + 1;
+#endif
+
+/*	Compute the middle significant product group.	*/
+
+	q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph;
+	q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl;
+#ifdef ASM_PROTOTYPE
+	q = q1 + q2;
+	q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
+
+/*	Add in the most significant word of the first group.
+	The last term takes care of the carry from adding the lsb's
+	that were shifted out prior to addition.
+*/
+	q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2));
+#else 	/* optimized C version */
+	q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1;
+#endif
+
+/*	Compute the most significant term and add in the others */
+
+	q = (q >> (MULTUNITSIZE-2)) +
+	    (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1);
+	q >>= mshift;
+
+/*	Prevent overflow and then wipe out the intermediate results. */
+
+	mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1);
+	q= q0 = q1 = q2 = 0;   lsb_factor = 0; (void)lsb_factor;
+	return mutemp;
+}
+#endif	/* mp_quo_digit */
+
+/*
+ * stage_smith_modulus() - Prepare for a Smith modmult.
+ * 
+ * Calculate the reciprocal of modulus with a precision of two MULTUNITs.
+ * Assumes that global_precision has already been adjusted to the
+ * size of the modulus, plus SLOP_BITS.
+ * 
+ * Note: This routine was designed to work with large values and
+ * doesn't have the necessary testing or handling to work with a
+ * modulus having less than three significant units.  For such cases,
+ * the separate multiply and modulus routines can be used.
+ * 
+ * stage_smith_modulus() is aliased to stage_modulus().
+ */
+
+int stage_smith_modulus(unitptr n_modulus)
+{
+	int   original_precision;
+	int   sigmod;		/* significant units in modulus */
+	unitptr  mp;		/* modulus most significant pointer */
+	MULTUNIT *mpm;		/* reciprocal pointer */
+	int   prec;		/* precision of reciprocal calc in units */
+
+	mp_move(modulus, n_modulus);
+	modmpl = (MULTUNIT*) modulus;
+	modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit);
+	nbits = countbits(modulus);
+	modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE;
+
+	original_precision = global_precision;
+
+	/* The following code copies the three most significant units of
+	 * modulus to mod_divisor.
+	*/
+	mp = modulus;
+	sigmod = significance (modulus);
+	rescale (mp, original_precision, sigmod);
+/* prec is the unit precision required for 3 MULTUNITs */
+	prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit;
+	set_precision (prec);
+ 
+	/* set mp = ptr to most significant units of modulus, then move
+	 * the most significant units to mp_divisor 
+	*/
+	mp = msbptr(mp,sigmod) -tohigher(prec-1);
+	unmake_lsbptr (mp, prec);
+	mp_move (mod_divisor, mp);
+ 
+	/* Keep 2*MULTUNITSIZE bits in mod_divisor.
+	 * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits.
+	*/
+	mshift = countbits (mod_divisor) - 2*MULTUNITSIZE;
+	mp_shift_right_bits (mod_divisor, mshift);
+	mp_recip(mod_quotient,mod_divisor);
+	mp_shift_right_bits (mod_quotient,1);
+
+	/* Reduce to:   0 < mshift <= MULTUNITSIZE */
+	mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1;
+	/* round up, rescaling if necessary */
+	mp_inc (mod_quotient);
+	if (countbits (mod_quotient) > 2*MULTUNITSIZE) {
+		mp_shift_right_bits (mod_quotient,1);
+		mshift--;	/* now  0 <= mshift <= MULTUNITSIZE */
+	}
+	mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit);
+	recipl = *post_higherunit (mpm);
+	reciph = *mpm;
+	mp_set_recip (reciph, recipl, mshift);
+	set_precision (original_precision);
+	return 0;	/* normal return */
+} /* stage_smith_modulus */
+
+/*	Smith's algorithm performs a multiply combined with a modulo operation.
+	Computes:  prod = (multiplicand*multiplier) mod modulus
+	The modulus must first be set by stage_smith_modulus().
+	smith_modmult() is aliased to mp_modmult().
+*/
+
+int
+smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
+{
+	unitptr    d;	/* ptr to product */ 
+	MULTUNIT   *dmph, *dmpl, *dmp;	/* ptrs to dividend (high, low, first)
+					 * aligned for subtraction	     */
+/*	Note that dmph points one MULTUNIT higher than indicated by
+	global precision.  This allows us to zero out a word one higher than
+	the normal precision.
+*/
+	short	    orig_precision;
+	short	    nqd;	/* number of quotient digits remaining to
+				 * be generated 			     */
+	short	    dmi;	/* number of significant MULTUNITs in product */
+
+	d = ds_data + 1;	/* room for leading MSB if HIGHFIRST */
+	orig_precision = global_precision;
+	mp_dmul(d, multiplicand, multiplier);
+
+	rescale(d, orig_precision * 2, orig_precision * 2 + 1);
+	set_precision(orig_precision * 2 + 1);
+	*msbptr(d, global_precision) = 0;	/* leading 0 unit */
+
+/*	We now start working with MULTUNITs.
+	Determine the most significant MULTUNIT of the product so we don't
+	have to process leading zeros in our divide loop.
+*/
+	dmi = significance(d) * MULTUNITs_perunit;
+	if (dmi >= modlenMULTUNITS) {
+		/* Make dividend negative.  This allows the use of mp_smul to
+		 * "subtract" the product of the modulus and the trial divisor
+		 * by actually adding to a negative dividend.
+		 * The one's complement of the dividend is used, since it causes
+		 * a zero value to be represented as all ones.  This facilitates
+		 * testing the result for possible overflow, since a sign bit
+		 * indicates that no adjustment is necessary, and we should not
+		 * attempt to adjust if the result of the addition is zero.
+		 */
+		mp_inc(d);
+		mp_neg(d);
+		set_precision(orig_precision);
+		munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
+
+		/* Set msb, lsb, and normal ptrs of dividend */
+		dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) *
+			    MULTUNITs_perunit) + tohigher(dmi);
+		nqd = dmi + 1 - modlenMULTUNITS;
+		dmpl = dmph - tohigher(modlenMULTUNITS);
+
+/*	
+ * Divide loop.
+ * Each iteration computes the next quotient MULTUNIT digit, then
+ * multiplies the divisor (modulus) by the quotient digit and adds
+ * it to the one's complement of the dividend (equivalent to
+ * subtracting).  If the product was greater than the remaining dividend,
+ * we get a non-negative result, in which case we subtract off the
+ * modulus to get the proper negative remainder.
+ */
+		for (; nqd; nqd--) {
+			MULTUNIT    q;	    /* quotient trial digit */
+
+			q = mp_quo_digit(dmph);
+			if (q > 0) {
+				mp_smul(dmpl, modmpl, q);
+
+				/* Perform correction if q too large.
+				*  This rarely occurs.
+				*/
+				if (!(*dmph & MULTUNIT_msb)) {
+					dmp = dmpl;
+					unmake_lsbptr(dmp, orig_precision * 
+						   MULTUNITs_perunit);
+					if (mp_musubb(dmp,
+						   (MULTUNIT *) modulus, 0))
+						(*dmph)  --;
+				}
+			}
+			pre_lowerunit(dmph);
+			pre_lowerunit(dmpl);
+		}
+		/* d contains the one's complement of the remainder. */
+		rescale(d, orig_precision * 2 + 1, orig_precision);
+		set_precision(orig_precision);
+		mp_neg(d);
+		mp_dec(d);
+	} else {
+		/* Product was less than modulus.  Return it. */
+		rescale(d, orig_precision * 2 + 1, orig_precision);
+		set_precision(orig_precision);
+	}
+	mp_move(prod, d);
+	return (0);		/* normal return */
+} /* smith_modmult */
+
+
+/*	Smith's mp_modmult function leaves some internal arrays in memory,
+	so we have to call modmult_burn() at the end of mp_modexp.
+	This is so that no cryptographically sensitive data is left in memory
+	after the program exits.
+	smith_burn() is aliased to modmult_burn().
+*/
+void smith_burn(void)
+{
+	empty_array (modulus);
+	empty_array (ds_data);
+	empty_array (mod_quotient);
+	empty_array (mod_divisor);
+	modmpl = 0;
+	mshift =nbits = 0;
+	reciph = recipl = 0;
+	modlenMULTUNITS = mutemp = 0;
+	mp_set_recip (0,0,0);
+} /* smith_burn */
 
-#endif	/* UPTON */
+/*	End of Thad Smith's implementation of modmult. */
+
+#endif	/* SMITH */
 
 
 int countbits(unitptr r)
 	/* Returns number of significant bits in r */
-{	int bits;
+{
+	int bits;
 	short prec;
 	register unit bitmask;
 	init_bitsniffer(r,bitmask,prec,bits);
-	return(bits);
+	return bits;
 } /* countbits */
 
 
@@ -1270,11 +1737,13 @@ char *copyright_notice(void)
 
 int mp_modexp(register unitptr expout,register unitptr expin,
 	register unitptr exponent,register unitptr modulus)
-{	/*	Russian peasant combined exponentiation/modulo algorithm.
-		Calls modmult instead of mult. 
-		Computes:  expout = (expin**exponent) mod modulus
-		WARNING: All the arguments must be less than the modulus!
-	*/
+/*
+ * Russian peasant combined exponentiation/modulo algorithm.
+ * Calls modmult instead of mult.
+ * Computes:  expout = (expin**exponent) mod modulus
+ * WARNING: All the arguments must be less than the modulus!
+ */
+{
 	int bits;
 	short oldprecision;
 	register unit bitmask;
@@ -1286,21 +1755,21 @@ int mp_modexp(register unitptr expout,re
 	tally_modsquares = 0;	/* clear "number of modsquares" counter */
 #endif	/* COUNTMULTS */
 	mp_init(expout,1);
-	if (testeq(exponent,0))
-	{	if (testeq(expin,0))
-			return(-1); /* 0 to the 0th power means return error */
-		return(0);	/* otherwise, zero exponent means expout is 1 */
+	if (testeq(exponent,0)) {
+		if (testeq(expin,0))
+			return -1; /* 0 to the 0th power means return error */
+		return 0;	/* otherwise, zero exponent means expout is 1 */
 	}
 	if (testeq(modulus,0))
-		return(-2);		/* zero modulus means error */
+		return -2;		/* zero modulus means error */
 #if SLOP_BITS > 0	/* if there's room for sign bits */
 	if (mp_tstminus(modulus))
-		return(-2);		/* negative modulus means error */
+		return -2;		/* negative modulus means error */
 #endif	/* SLOP_BITS > 0 */
 	if (mp_compare(expin,modulus) >= 0)
-		return(-3); /* if expin >= modulus, return error */
+		return -3; /* if expin >= modulus, return error */
 	if (mp_compare(exponent,modulus) >= 0)
-		return(-4); /* if exponent >= modulus, return error */
+		return -4; /* if exponent >= modulus, return error */
 
 	oldprecision = global_precision;	/* save global_precision */
 	/* set smallest optimum precision for this modulus */
@@ -1311,9 +1780,9 @@ int mp_modexp(register unitptr expout,re
 	rescale(exponent,oldprecision,global_precision);
 	rescale(expout,oldprecision,global_precision);
 
-	if (stage_modulus(modulus))
-	{	set_precision(oldprecision);	/* restore original precision */
-		return(-5);		/* unstageable modulus (STEWART algorithm) */
+	if (stage_modulus(modulus)) {
+		set_precision(oldprecision);	/* restore original precision */
+		return -5;		/* unstageable modulus (STEWART algorithm) */
 	}
 
 	/* normalize and compute number of bits in exponent first */
@@ -1323,29 +1792,29 @@ int mp_modexp(register unitptr expout,re
 	bits--;		/* We know for sure at this point that bits>0 */
 	mp_move(expout,expin);		/*  expout = (1*1)*expin; */
 	bump_bitsniffer(exponent,bitmask);
-	
-	while (bits--)
-	{
+
+	while (bits--) {
 		poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
 #ifdef COUNTMULTS
 		tally_modsquares++;	/* bump "number of modsquares" counter */
 #endif	/* COUNTMULTS */
 		mp_modsquare(product,expout);
-		mp_move(expout,product);
-		if (sniff_bit(exponent,bitmask))
-		{	mp_modmult(product,expout,expin);
-			mp_move(expout,product);
+		if (sniff_bit(exponent,bitmask)) {
+			mp_modmult(expout,product,expin);
 #ifdef COUNTMULTS
 			tally_modmults++;	/* bump "number of modmults" counter */
 #endif	/* COUNTMULTS */
+		} else {
+			mp_move(expout,product);
 		}
 		bump_bitsniffer(exponent,bitmask);
-	}	/* while bits-- */
+	} /* while bits-- */
 	mp_burn(product);	/* burn the evidence on the stack */
 	modmult_burn(); /* ask mp_modmult to also burn its own evidence */
 
 #ifdef COUNTMULTS	/* diagnostic analysis */
-	{	long atomic_mults;
+	{
+		long atomic_mults;
 		unsigned int unitcount,totalmults;
 		unitcount = bits2units(countbits(modulus));
 		/* calculation assumes modsquare takes as long as a modmult: */
@@ -1365,109 +1834,110 @@ int mp_modexp(register unitptr expout,re
 	   will be forced to include it in the executable object image... */
 	copyright_notice();	/* has no real effect at run time */
 
-	return(0);		/* normal return */
-}	/* mp_modexp */
-
+	return 0;		/* normal return */
+} /* mp_modexp */
 
-
-
-/*********************************************************************
-	RSA-specific routines follow.  These are the only functions that 
-	are specific to the RSA public key cryptosystem.  The other
-	multiprecision integer math functions may be used for non-RSA
-	applications.  
-
-	The RSA public key cryptosystem is patented by the Massachusetts
-	Institute of Technology (U.S. patent #4,405,829).  This patent does
-	not apply outside the USA.  Public Key Partners (PKP) holds the
-	exclusive commercial license to sell and sub-license the RSA public
-	key cryptosystem.  The author of this software implementation of the
-	RSA algorithm is providing this software for educational use only. 
-	Licensing this algorithm from PKP is the responsibility of you, the
-	user, not Philip Zimmermann, the author of this software.  The author
-	assumes no liability for any breach of patent law resulting from the
-	unlicensed use of this software by the user.
-*********************************************************************/
-
-
-int rsa_decrypt(unitptr M, unitptr C,
-	unitptr d, unitptr p, unitptr q, unitptr u)
-	/*	Uses Quisquater's Chinese Remainder Theorem shortcut 
-		for RSA decryption.
-		M is the output plaintext message.
-		C is the input ciphertext message.
-		d is the secret decryption exponent.
-		p and q are the secret prime factors of n.
-		u is the multiplicative inverse of p, mod q.
-		Note that u is precomputed on the assumption that p<q.
-		n, the common modulus, is not used here because of the
-		Chinese Remainder Theorem shortcut.
-	*/
+int mp_modexp_crt(unitptr expout, unitptr expin,
+	unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u)
+/*
+ * This is a faster modexp for moduli with a known factorisation into two
+ * relatively prime factors p and q, and an input relatively prime to the
+ * modulus, the Chinese Remainder Theorem to do the computation mod p and
+ * mod q, and then combine the results.  This relies on a number of
+ * precomputed values, but does not actually require the modulus n or the
+ * exponent e.
+ * 
+ * expout = expin ^ e mod (p*q).
+ * We form this by evaluating
+ * p2 = (expin ^ e) mod p and
+ * q2 = (expin ^ e) mod q
+ * and then combining the two by the CRT.
+ * 
+ * Two optimisations of this are possible.  First, we can reduce expin
+ * modulo p and q before starting.
+ * 
+ * Second, since we know the factorisation of p and q (trivially derived
+ * from the factorisation of n = p*q), and expin is relatively prime to
+ * both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m),
+ * to throw away multiples of phi(p) or phi(q) in e.
+ * Letting ep = e mod phi(p) and
+ *         eq = e mod phi(q)
+ * then combining these two speedups, we only need to evaluate
+ * p2 = ((expin mod p) ^ ep) mod p and
+ * q2 = ((expin mod q) ^ eq) mod q.
+ * 
+ * Now we need to apply the CRT.  Starting with
+ * expout = p2 (mod p) and
+ * expout = q2 (mod q)
+ * we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p,
+ * then 0 <= expout < p*q for some 0 <= k < q.  Since we want expout = q2
+ * (mod q), then p*k = q2-p2 (mod q).  Since p and q are relatively prime,
+ * p has a multiplicative inverse u mod q.  In other words, u = 1/p (mod q).
+ *
+ * Multiplying by u on both sides gives k = u*(q2-p2) (mod q).
+ * Since we want 0 <= k < q, we can thus find k as
+ * k = (u * (q2-p2)) mod q.
+ * 
+ * Once we have k, evaluating p2 + p * k is easy, and
+ * that gives us the result.
+ * 
+ * In the detailed implementation, there is a temporary, temp, used to
+ * hold intermediate results, p2 is held in expout, and q2 is used as a
+ * temporary in the final step when it is no longer needed.  With that,
+ * you should be able to understand the code below.
+ */
 {
-	unit p2[MAX_UNIT_PRECISION];
 	unit q2[MAX_UNIT_PRECISION];
-	unit temp1[MAX_UNIT_PRECISION];
-	unit temp2[MAX_UNIT_PRECISION];
+	unit temp[MAX_UNIT_PRECISION];
 	int status;
 
-	mp_init(M,1);	/* initialize result in case of error */
+/*	First, compiute p2 (physically held in M) */
 
-	if (mp_compare(p,q) >= 0)	/* ensure that p<q */
-	{	/* swap the pointers p and q */
-		unitptr t;
-		t = p;  p = q; q = t;
-	}
-
-/*	Rather than decrypting by computing modexp with full mod n
-	precision, compute a shorter modexp with mod p precision... */
-
-/*	p2 = [ (C mod p)**( d mod (p-1) ) ] mod p		*/
-	mp_move(temp1,p);
-	mp_dec(temp1);			/* temp1 = p-1 */
-	mp_mod(temp2,d,temp1);	/* temp2 = d mod (p-1) ) */
-	mp_mod(temp1,C,p);		/* temp1 = C mod p  */
-	status = mp_modexp(p2,temp1,temp2,p);
-	if (status < 0)	/* mp_modexp returned an error. */
-		return(status);	/* error return */
-
-/*	Now compute a short modexp with mod q precision... */
-
-/*	q2 = [ (C mod q)**( d mod (q-1) ) ] mod q		*/
-	mp_move(temp1,q);
-	mp_dec(temp1);			/* temp1 = q-1 */
-	mp_mod(temp2,d,temp1);	/* temp2 = d mod (q-1) */
-	mp_mod(temp1,C,q);		/* temp1 = C mod q  */
-	status = mp_modexp(q2,temp1,temp2,q);
-	if (status < 0)	/* mp_modexp returned an error. */
-		return(status);	/* error return */
+/*	p2 = [ (expin mod p) ^ ep ] mod p		*/
+	mp_mod(temp,expin,p);		/* temp = expin mod p  */
+	status = mp_modexp(expout,temp,ep,p);
+	if (status < 0) {	/* mp_modexp returned an error. */
+		mp_init(expout,1);
+		return status;	/* error return */
+	}
+
+/*	And the same thing for q2 */
+
+/*	q2 = [ (expin mod q) ^ eq ] mod q		*/
+	mp_mod(temp,expin,q);		/* temp = expin mod q  */
+	status = mp_modexp(q2,temp,eq,q);
+	if (status < 0) {	/* mp_modexp returned an error. */
+		mp_init(expout,1);
+		return status;	/* error return */
+	}
 
 /*	Now use the multiplicative inverse u to glue together the
-	two halves, saving a lot of time by avoiding a full mod n
-	exponentiation.  At key generation time, u was computed
-	with the secret key as the multiplicative inverse of
-	p, mod q such that:  (p*u) mod q = 1, assuming p<q.
+	two halves.
+*/
+#if 0
+/* This optimisation is useful if you commonly get small results,
+   but for random results and large q, the odds of (1/q) of it
+   being useful do not warrant the test.
 */
-	if (mp_compare(p2,q2) == 0)	/* only happens if C<p */
-		mp_move(M,p2);
-	else
-	{	/*	q2 = q2 - p2;  if q2<0 then q2 = q2 + q  */
-		if (mp_sub(q2,p2))	/* if the result went negative... */
-			mp_add(q2,q);		/* add q to q2 */
-
-		/*	M = p2 + ( p * [(q2*u) mod q] ) 	*/
-		mp_mult(temp1,q2,u);		/* temp1 = q2*u  */
-		mp_mod(temp2,temp1,q);		/* temp2 = temp1 mod q */
-		mp_mult(temp1,p,temp2);	/* temp1 = p * temp2 */
-		mp_add(temp1,p2);		/* temp1 = temp1 + p2 */
-		mp_move(M,temp1);		/* M = temp1 */
-	}
-
-	mp_burn(p2);	/* burn the evidence on the stack...*/
-	mp_burn(q2);
-	mp_burn(temp1);
-	mp_burn(temp2);
-	return(0);	/* normal return */
-}	/* rsa_decrypt */
+	if (mp_compare(expout,q2) != 0) {
+#endif
+	/*	Find q2-p2 mod q */
+	if (mp_sub(q2,expout))	/* if the result went negative */
+		mp_add(q2,q);	/* add q to q2 */
+
+	/*	expout = p2 + ( p * [(q2*u) mod q] )	*/
+	mp_mult(temp,q2,u);		/* q2*u  */
+	mp_mod(q2,temp,q);		/* (q2*u) mod q */
+	mp_mult(temp,p,q2);		/* p * [(q2*u) mod q] */
+	mp_add(expout,temp);		/* expout = p2 + p * [...] */
+#if 0
+	}
+#endif
+
+	mp_burn(q2);	/* burn the evidence on the stack...*/
+	mp_burn(temp);
+	return 0;	/* normal return */
+} /* mp_modexp_crt */
 
 
 /****************** end of MPI library ****************************/