File:  [PGP] / pgp / src / rsalib.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 16:37:52 2018 UTC (3 years, 3 months ago) by root
Branches: phill, MAIN
CVS tags: pgp10, HEAD
PGP 1.0

/*	C source code for RSA multiprecision arithmetic library routines.
	Implemented Nov 86 by Philip Zimmermann
	Last revised 10 May 91 by PRZ

	Boulder Software Engineering
	3021 Eleventh Street
	Boulder, CO 80304
	(303) 444-4541

	(c) Copyright 1986 by Philip Zimmermann.  All rights reserved.
	The author assumes no liability for damages resulting from the use 
	of this software, even if the damage results from defects in this 
	software.  No warranty is expressed or implied.  The use of this 
	cryptographic software for developing weapon systems is expressly 
	forbidden.

	The RSA public key cryptosystem is patented by the Massachusetts
	Institute of Technology (U.S. patent #4,405,829).  Public Key 
	Partners (PKP) holds the exclusive commercial license to sell and 
	sub-license the RSA public key cryptosystem.  The author of this 
	software implementation of the RSA algorithm is providing this 
	implementation for educational use only.  Licensing this algorithm 
	from PKP is the responsibility of you, the user, not Philip
	Zimmermann, the author of this implementation.  The author assumes
	no liability for any breach of patent law resulting from the
	unlicensed use of this software by the user.

	These routines implement all of the multiprecision arithmetic necessary
	for Rivest-Shamir-Adleman (RSA) public key cryptography.

	Although originally developed in Microsoft C for the IBM PC, this code 
	contains few machine dependancies.  It assumes 2's complement 
	arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines, 
	lowbyte-highbyte order or highbyte-lowbyte order.  This version
	has been converted to ANSI C.


	The internal representation for these extended precision integer
	"registers" is an array of "units".  A unit is a machine word, which
	is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
	unsigned integer, depending on the machine's word size.  For example,
	an IBM PC or AT uses a unit size of 16 bits.  To perform arithmetic
	on these huge precision integers, we pass pointers to these unit
	arrays to various subroutines.  A pointer to an array of units is of
	type unitptr.  This is a pointer to a huge integer "register".

	When calling a subroutine, we always pass a pointer to the BEGINNING
	of the array of units, regardless of the byte order of the machine.
	On a lowbyte-first machine, such as the Intel 80x86, this unitptr
	points to the LEAST significant unit, and the array of units increases
	significance to the right.  On a highbyte-first machine, such as the
	Motorola 680x0, this unitptr points to the MOST significant unit, and
	the array of units decreases significance to the right.
*/

/* #define COUNTMULTS */ /* count modmults for performance studies */

#ifdef DEBUG
#ifndef EMBEDDED	/* not EMBEDDED - not compiling for embedded target */
#include <stdio.h> 	/* for printf, etc. */
#include <conio.h> 	/* DEBUG mode:  for kbhit() */
#define poll_for_break() {while (kbhit()) getch();}
#endif	/* not EMBEDDED */
#else
#define poll_for_break() /* stub */
#endif	/* not DEBUG */

#define RSALIB
#include "rsalib.h"

short global_precision = 0; /* units of precision for all routines */
/*	global_precision is the unit precision last set by set_precision.
	Initially, set_precision() should be called to define global_precision
	before using any of these other RSA library routines.
	i.e.:   set_precision(MAX_UNIT_PRECISION);
*/

#ifdef PORTABLE
/*************** multiprecision library primitives ****************/
/*	The following portable C primitives should be recoded in assembly.
	The equivalent assembly primitives are externally defined under
	different names, unless PORTABLE is defined.  See the header file
	"rsalib.h" for further details.

	The carry bit mechanism we use for these portable primitives
	will only work if the size of unit is smaller than the size of
	a long integer.  In most cases, this means UNITSIZE must be
	less than 32 bits -- if you use the C portable primitives.
*/

typedef unsigned long int ulint;
#define carrybit ((ulint) 1 << UNITSIZE)
/* ...assumes sizeof(unit) < sizeof(unsigned long) */

boolean mp_addc
	(register unitptr r1,register unitptr r2,register boolean carry)
	/* multiprecision add with carry r2 to r1, result in r1 */
	/* carry is incoming carry flag-- value should be 0 or 1 */
{	register ulint x;	/* won't work if sizeof(unit)==sizeof(long) */
	short precision;	/* number of units to add */
	precision = global_precision;
	make_lsbptr(r1,precision);
	make_lsbptr(r2,precision);
	while (precision--)	
	{	x = (ulint) *r1 + (ulint) *post_higherunit(r2) + (ulint) carry;
		*post_higherunit(r1) = x;
		carry = ((x & carrybit) != 0L);
	}
	return(carry);		/* return the final carry flag bit */
}	/* mp_addc */


boolean mp_subb
	(register unitptr r1,register unitptr r2,register boolean borrow)
	/* multiprecision subtract with borrow, r2 from r1, result in r1 */
	/* borrow is incoming borrow flag-- value should be 0 or 1 */
{	register ulint x;	/* won't work if sizeof(unit)==sizeof(long) */
	short precision;	/* number of units to subtract */
	precision = global_precision;
	make_lsbptr(r1,precision);
	make_lsbptr(r2,precision);
	while (precision--)	
	{	x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
		*post_higherunit(r1) = x;
		borrow = ((x & carrybit) != 0L);
	}
	return(borrow);		/* return the final carry/borrow flag bit */
}	/* mp_subb */

#undef carrybit


boolean mp_rotate_left(register unitptr r1,register boolean carry)
	/* multiprecision rotate left 1 bit with carry, result in r1. */
	/* carry is incoming carry flag-- value should be 0 or 1 */
{	register short precision;	/* number of units to rotate */
	register boolean nextcarry;
	precision = global_precision;
	make_lsbptr(r1,precision);
	while (precision--)	
	{	
		nextcarry = (((signedunit) *r1) < 0);
		/* nextcarry = ((*r1 & uppermostbit) != 0); */
		*r1 <<= 1 ;
		if (carry) *r1 |= 1;
		carry = nextcarry;
		pre_higherunit(r1);
	}
	return(nextcarry);	/* return the final carry flag bit */
}	/* mp_rotate_left */

/************** end of primitives that should be in assembly *************/
#endif	/* PORTABLE */


/* 	The mp_rotate_right function is not called in any time-critical
	situations in RSA functions, so it doesn't need to be coded 
	in assembly language.
*/
boolean mp_rotate_right(register unitptr r1,register boolean carry)
	/* multiprecision rotate right 1 bit with carry, result in r1. */
	/* carry is incoming carry flag-- value should be 0 or 1 */
{	register short precision;	/* number of units to rotate */
	register boolean nextcarry;
	precision = global_precision;
	make_msbptr(r1,precision);
	while (precision--)	
	{	nextcarry = *r1 & 1;
		*r1 >>= 1 ;
		if (carry) *r1 |= uppermostbit;
		carry = nextcarry;
		pre_lowerunit(r1);
	}
	return(nextcarry);	/* return the final carry flag bit */
}	/* mp_rotate_right */


short mp_compare(register unitptr r1,register unitptr r2)
/*	Compares multiprecision integers *r1, *r2, and returns:	
	-1 iff *r1 < *r2
	 0 iff *r1 == *r2
	+1 iff *r1 > *r2
*/
{	register short precision;	/* number of units to compare */
	precision = global_precision;
	make_msbptr(r1,precision);
	make_msbptr(r2,precision);
	do	
	{	if (*r1 < *r2) 
			return(-1);
		if (*post_lowerunit(r1) > *post_lowerunit(r2)) 
			return(1);
	} while (--precision);	
	return(0);	/*  *r1 == *r2  */
}	/* mp_compare */


boolean mp_inc(register unitptr r)
	/* Increment multiprecision integer r. */
{	register short precision;
	precision = global_precision;
	make_lsbptr(r,precision);
	do	
	{	if ( ++(*r) ) return(0);	/* no carry */
		post_higherunit(r);	
	} while (--precision);
	return(1);		/* return carry set */
}	/* mp_inc */


boolean mp_dec(register unitptr r)
	/* Decrement multiprecision integer r. */
{	register short precision;
	precision = global_precision;
	make_lsbptr(r,precision);
	do	
	{	if ( (signedunit) (--(*r)) != (signedunit) -1 )
			return(0);	/* no borrow */
		post_higherunit(r);	
	} while (--precision);
	return(1);		/* return borrow set */
}	/* mp_dec */


void mp_neg(register unitptr r)
	/* Compute 2's complement, the arithmetic negative, of r */
{	register short precision;	/* number of units to negate */
	precision = global_precision;
	mp_dec(r);	/* 2's complement is 1's complement plus 1 */
	do	/* now do 1's complement */
	{	*r = ~(*r);
		r++;	
	} while (--precision);
}	/* mp_neg */


void mp_move(register unitptr dst,register unitptr src)
{	register short precision;	/* number of units to move */
	precision = global_precision;
	do { *dst++ = *src++; } while (--precision);
}	/* mp_move */


void mp_init(register unitptr r, word16 value)
	/* Init multiprecision register r with short value. */
{	/* Note that mp_init doesn't extend sign bit for >32767 */
	register short precision;	/* number of units to init */
	precision = global_precision;
	make_lsbptr(r,precision);
	*post_higherunit(r) = value; precision--;
#ifdef UNIT8
	*post_higherunit(r) = value >> UNITSIZE; precision--;
#endif
	while (precision--)
		*post_higherunit(r) = 0;
}	/* mp_init */


short significance(register unitptr r)
	/* Returns number of significant units in r */
{	register short precision;
	precision = global_precision;
	make_msbptr(r,precision);
	do	
	{	if (*post_lowerunit(r)) 
			return(precision);
	} while (--precision);
	return(precision);
}	/* significance */


void unitfill0(unitptr r,word16 unitcount)
	/* Zero-fill the unit buffer r. */
{	while (unitcount--) *r++ = 0;
}	/* unitfill0 */


/* The macro normalize(r,precision) "normalizes" a multiprecision integer 
   by adjusting r and precision to new values.  For Motorola-style processors 
   (MSB-first), r is a pointer to the MSB of the register, and must
   be adjusted to point to the first nonzero unit.  For Intel/VAX-style
   (LSB-first) processors, r is a  pointer to the LSB of the register,
   and must be left unchanged.  The precision counter is always adjusted,
   regardless of processor type.  In the case of precision = 0,
   r becomes undefined.
*/


/* The macro rescale(r,current_precision,new_precision) rescales
   a multiprecision integer by adjusting r and its precision to new values.  
   It can be used to reverse the effects of the normalize 
   routine given above.  See the comments in normalize concerning 
   Intel vs. Motorola LSB/MSB conventions.
   WARNING:  You can only safely call rescale on registers that 
   you have previously normalized with the above normalize routine, 
   or are known to be big enough for the new precision.  You may
   specify a new precision that is smaller than the current precision.
   You must be careful not to specify a new_precision value that is 
   too big, or which adjusts the r pointer out of range.
*/


/* These bit sniffer definitions begin sniffing at the MSB */
#define sniff_bit(bptr,bitmask)	(*(bptr) & bitmask)

#define init_bitsniffer(bptr,bitmask,prec,bits) \
{ normalize(bptr,prec); \
  if (!prec) \
    return(0); \
  bits = units2bits(prec); \
  make_msbptr(bptr,prec); bitmask = uppermostbit; \
  while (!sniff_bit(bptr,bitmask)) \
  { bitmask >>= 1; bits--; \
  } \
}

#define bump_bitsniffer(bptr,bitmask) \
{ if (!(bitmask >>= 1)) \
  { bitmask = uppermostbit; \
    post_lowerunit(bptr); \
  } \
}

/* the following two macros are used by mp_udiv */
#define bump_2bitsniffers(bptr,bptr2,bitmask) \
{ if (!(bitmask >>= 1)) \
  { bitmask = uppermostbit; \
    post_lowerunit(bptr); \
    post_lowerunit(bptr2); \
  } \
}

#define stuff_bit(bptr,bitmask)	*(bptr) |= bitmask


int mp_udiv(register unitptr remainder,register unitptr quotient,
	register unitptr dividend,register unitptr divisor)
	/* Unsigned divide, treats both operands as positive. */
{	int bits;
	short dprec;
	register unit bitmask;
	if (testeq(divisor,0))
		return(-1);	/* zero divisor means divide error */
	mp_init(remainder,0);
	mp_init(quotient,0);
	/* normalize and compute number of bits in dividend first */
	init_bitsniffer(dividend,bitmask,dprec,bits);
	/* rescale quotient to same precision (dprec) as dividend */
	rescale(quotient,global_precision,dprec);
	make_msbptr(quotient,dprec); 

	while (bits--)
	{	mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
		if (mp_compare(remainder,divisor) >= 0)
		{	mp_sub(remainder,divisor);
			stuff_bit(quotient,bitmask);
		}
		bump_2bitsniffers(dividend,quotient,bitmask); 
	}
	return(0);
} /* mp_udiv */


int mp_div(register unitptr remainder,register unitptr quotient,
	register unitptr dividend,register unitptr divisor)
	/* Signed divide, either or both operands may be negative. */
{	boolean dvdsign,dsign;
	int status;
	dvdsign = (mp_tstminus(dividend)!=0);
	dsign = (mp_tstminus(divisor)!=0);
	if (dvdsign) mp_neg(dividend);
	if (dsign) mp_neg(divisor);
	status = mp_udiv(remainder,quotient,dividend,divisor);
	if (dvdsign) mp_neg(dividend);		/* repair caller's dividend */
	if (dsign) mp_neg(divisor);		/* repair caller's divisor */
	if (status<0) return(status);		/* divide error? */
	if (dvdsign) mp_neg(remainder);
	if (dvdsign ^ dsign) mp_neg(quotient);
	return(status);
} /* mp_div */


word16 mp_shortdiv(register unitptr quotient,
	register unitptr dividend,register word16 divisor)
/*	This function does a fast divide and mod on a multiprecision dividend
	using a short integer divisor returning a short integer remainder.
	This is an unsigned divide.  It treats both operands as positive.
	It is used mainly for faster printing of large numbers in base 10. 
*/
{	int bits;
	short dprec;
	register unit bitmask;
	register word16 remainder;
	if (!divisor)	/* if divisor == 0 */
		return(-1);	/* zero divisor means divide error */
	remainder=0;
	mp_init(quotient,0);
	/* normalize and compute number of bits in dividend first */
	init_bitsniffer(dividend,bitmask,dprec,bits);
	/* rescale quotient to same precision (dprec) as dividend */
	rescale(quotient,global_precision,dprec);
	make_msbptr(quotient,dprec); 

	while (bits--)
	{	remainder <<= 1;
		if (sniff_bit(dividend,bitmask))
			remainder++;
		if (remainder >= divisor)
		{	remainder -= divisor;
			stuff_bit(quotient,bitmask);
		}
		bump_2bitsniffers(dividend,quotient,bitmask); 
	}	
	return(remainder);
} /* mp_shortdiv */


int mp_mod(register unitptr remainder,
	register unitptr dividend,register unitptr divisor)
	/* Unsigned divide, treats both operands as positive. */
{	int bits;
	short dprec;
	register unit bitmask;
	if (testeq(divisor,0))
		return(-1);	/* zero divisor means divide error */
	mp_init(remainder,0);
	/* normalize and compute number of bits in dividend first */
	init_bitsniffer(dividend,bitmask,dprec,bits);

	while (bits--)
	{	mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
		msub(remainder,divisor);
		bump_bitsniffer(dividend,bitmask);
	}	
	return(0);
} /* mp_mod */


word16 mp_shortmod(register unitptr dividend,register word16 divisor)
/*	This function does a fast mod operation on a multprecision dividend
	using a short integer modulus returning a short integer remainder.
	This is an unsigned divide.  It treats both operands as positive.
	It is used mainly for fast sieve searches for large primes. 
*/
{	int bits;
	short dprec;
	register unit bitmask;
	register word16 remainder;
	if (!divisor)	/* if divisor == 0 */
		return(-1);	/* zero divisor means divide error */
	remainder=0;
	/* normalize and compute number of bits in dividend first */
	init_bitsniffer(dividend,bitmask,dprec,bits);

	while (bits--)
	{	remainder <<= 1;
		if (sniff_bit(dividend,bitmask))
			remainder++;
		if (remainder >= divisor) remainder -= divisor;
		bump_bitsniffer(dividend,bitmask);
	}	
	return(remainder);
} /* mp_shortmod */



#ifndef COMB_MULT	/* if not "comb" multiply, use peasant multiply */

int mp_mult(register unitptr prod,
	register unitptr multiplicand,register unitptr multiplier)
	/* Computes multiprecision prod = multiplicand * multiplier */
{	/* Uses "Russian peasant" multiply algorithm. */
	int bits;
	register unit bitmask;
	short mprec;
	mp_init(prod,0);
	if (testeq(multiplicand,0))
		return(0);	/* zero multiplicand means zero product */
	/* normalize and compute number of bits in multiplier first */
	init_bitsniffer(multiplier,bitmask,mprec,bits);

	while (bits--)
	{	mp_shift_left(prod);
		if (sniff_bit(multiplier,bitmask))
		{	mp_add(prod,multiplicand);
		}
		bump_bitsniffer(multiplier,bitmask);
	}
	return(0);	
}	/* mp_mult */

#else	/* use "comb" multiply algorithm */

int mp_mult(register unitptr prod,
	register unitptr multiplicand, register unitptr multiplier)
	/*	Computes multiprecision prod = multiplicand * multiplier */
{	/*	Uses interleaved comb multiply algorithm.
		This improved multiply more than twice as fast as a Russian 
		peasant multiply, because it does a lot fewer shifts. 
		Must have global_precision set to the size of the multiplicand 
		plus UNITSIZE-1 SLOP_BITS.  Produces a product that is the sum 
		of the lengths of the multiplier and multiplicand.
	*/
	int bits;
	register unit bitmask;
	unitptr product, mplier, temp;
	short mprec,mprec2;
	unit mplicand[MAX_UNIT_PRECISION];
	mp_init(prod,0);		/* better clear full width--double precision */
	if (testeq(multiplicand,0))
		return(0);	/* zero multiplicand means zero product */

	mp_move(mplicand,multiplicand);	/* save it from damage */

	normalize(multiplier,mprec);
	if (!mprec)
		return(0);

	make_lsbptr(multiplier,mprec);
	bitmask = 1;	/* start scan at LSB of multiplier */

	do	/* UNITSIZE times */
	{	/* do for bits 0-15 */
		product = prod;
		mplier = multiplier;
		mprec2 = mprec;
		while (mprec2--)	/* do for each word in multiplier */
		{
			if (sniff_bit(mplier,bitmask))
			{	if (mp_addc(product,multiplicand,0))	/* ripple carry */
				{	/* After 1st time thru, this is rarely encountered. */
					temp = msbptr(product,global_precision);
					pre_higherunit(temp);
					/* temp now points to LSU of carry region. */
					unmake_lsbptr(temp,global_precision);
					mp_inc(temp);
				}	/* ripple carry */
			}
			pre_higherunit(mplier);
			pre_higherunit(product);
		}
		if (!(bitmask <<= 1))
			break;
		mp_shift_left(multiplicand);

	} while (TRUE);

	mp_move(multiplicand,mplicand);	/* recover */

	return(0);	/* normal return */	
}	/* mp_mult */

#endif	/* COMB_MULT */


/*	mp_modmult computes a multiply combined with a modulo operation. 
	Before calling mp_modmult, you must first call
	stage_modulus(the_modulus)
*/

#ifdef COUNTMULTS
/* "number of modmults" counters, used for performance studies. */
static unsigned int tally_modmults = 0;
static unsigned int tally_modsquares = 0;
#endif	/* COUNTMULTS */

#ifndef ASM_MODMULT	/* not assembly primitive modmult */

#ifdef PEASANT
/* Conventional Russian peasant multiply with modulo algorithm. */

static unitptr modulus = 0;	/* used only by mp_modmult */

int stage_peasant_modulus(unitptr n)
/*	Must pass modulus to stage_modulus before calling modmult.
	Assumes that global_precision has already been adjusted to the
	size of the modulus, plus SLOP_BITS.
*/
{	/* For this simple version of modmult, just copy unit pointer. */
	modulus = n;
	return(0);	/* normal return */
}	/* stage_peasant_modulus */


int peasant_modmult(register unitptr prod,
	unitptr multiplicand,register unitptr multiplier)
{	/*	"Russian peasant" multiply algorithm, combined with a modulo 
		operation.  This is a simple naive replacement for Merritt's 
		faster modmult algorithm.  References global unitptr "modulus".
		Computes:  prod = (multiplicand*multiplier) mod modulus
		WARNING: All the arguments must be less than the modulus!
	*/
	int bits;
	register unit bitmask;
	short mprec;
	mp_init(prod,0);
/*	if (testeq(multiplicand,0))
		return(0); */	/* zero multiplicand means zero product */
	/* normalize and compute number of bits in multiplier first */
	init_bitsniffer(multiplier,bitmask,mprec,bits);

	while (bits--)
	{	mp_shift_left(prod);
		msub(prod,modulus);	/* turns mult into modmult */
		if (sniff_bit(multiplier,bitmask))
		{	mp_add(prod,multiplicand);
			msub(prod,modulus);	/* turns mult into modmult */
		}
		bump_bitsniffer(multiplier,bitmask);
	}
	return(0);
}	/* peasant_modmult */


/*	If we are using a version of mp_modmult that uses internal tables 
	in memory, we have to call modmult_burn() at the end of mp_modexp.
	This is so that no sensitive data is left in memory after the program 
	exits.  The Russian peasant method doesn't use any such tables.
*/
static void peasant_burn(void)
/*	Alias for modmult_burn, called only from mp_modexp().  Destroys
	internal modmult tables.  This version does nothing because no 
	tables are used by the Russian peasant modmult. */
{ }	/* peasant_burn */

#endif	/* PEASANT */


#ifdef MERRITT
/*=========================================================================*/
/*
	This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
	Also refined by Zhahai Stewart to reduce the number of subtracts.
	It performs a multiply combined with a modulo operation, without 
	going into "double precision".  It is faster than the Russian peasant
	method,	and still works well on machines that lack a fast hardware 
	multiply instruction.  
*/

/*	The following support functions, macros, and data structures
	are used only by Merritt's modmult algorithm... */

static void mp_lshift_unit(register unitptr r1)
/*	Shift r1 1 whole unit to the left.  Used only by modmult function. */
{	register short precision;
	register unitptr r2;
	precision = global_precision;
	make_msbptr(r1,precision);
	r2 = r1;
	while (--precision)
		*post_lowerunit(r1) = *pre_lowerunit(r2);
	*r1 = 0;
}	/* mp_lshift_unit */


/* moduli_buf contains shifted images of the modulus, set by stage_modulus */
static unit moduli_buf[UNITSIZE][MAX_UNIT_PRECISION] = {0};
static unitptr moduli[UNITSIZE+1] = /* contains pointers into moduli_buf */
{	0
	,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0], 
	 &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0], &moduli_buf[ 7][0]
#ifndef UNIT8
	,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0], 
	 &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0], &moduli_buf[15][0] 
#ifndef UNIT16	/* and not UNIT8 */
	,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0], 
	 &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0], 
	 &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0], 
	 &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0], &moduli_buf[31][0]
#endif	/* UNIT16 and UNIT8 not defined */
#endif	/* UNIT8 not defined */
};

/*	To optimize msubs, need following 2 unit arrays, each filled
	with the most significant unit and next-to-most significant unit
	of the preshifted images of the RSA modulus. */
static unit msu_moduli[UNITSIZE+1] = {0}; /* most signif. unit */
static unit nmsu_moduli[UNITSIZE+1] = {0}; /* next-most signif. unit */

/*	mpdbuf contains preshifted images of the multiplicand, mod n.
 	It is used only by mp_modmult.  It could be staticly declared
	inside of mp_modmult, but we put it outside mp_modmult so that 
	it can be wiped clean by modmult_burn(), which is called at the
	end of mp_modexp.  This is so that no sensitive data is left in 
	memory after the program exits.
*/
static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};


static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
/*	Computes UNITSIZE images of r, each shifted left 1 more bit.
	Used only by modmult function. 
*/
{	short int i;
	images[0] = r;	/* no need to move the first image, just copy ptr */
	for (i=1; i<UNITSIZE; i++)
	{	mp_move(images[i],images[i-1]);
		mp_shift_left(images[i]);
	}
}	/* stage_mp_images */


int stage_merritt_modulus(unitptr n)
/*	Computes UNITSIZE+1 images of modulus, each shifted left 1 more bit.
	Before calling mp_modmult, you must first stage the moduli images by
	calling stage_modulus.  n is the pointer to the modulus.
	Assumes that global_precision has already been adjusted to the
	size of the modulus, plus SLOP_BITS.
*/
{	short int i;
	unitptr msu;	/* ptr to most significant unit, for faster msubs */
	moduli[0] = n;	/* no need to move the first image, just copy ptr */

	/* used by optimized msubs macro... */
	msu = msbptr(n,global_precision);	/* needed by msubs */
	msu_moduli[0] = *post_lowerunit(msu);
	nmsu_moduli[0] = *msu;

	for (i=1; i<UNITSIZE+1; i++)
	{	mp_move(moduli[i],moduli[i-1]);
		mp_shift_left(moduli[i]);

		/* used by optimized msubs macro... */
		msu = msbptr(moduli[i],global_precision);	/* needed by msubs */
		msu_moduli[i] = *post_lowerunit(msu);	/* for faster msubs */
		nmsu_moduli[i] = *msu;
	}
	return(0);	/* normal return */
}	/* stage_merritt_modulus */


/* The following macros, sniffadd and msubs, are used by modmult... */
#define sniffadd(i) if (*multiplier & power_of_2(i))  mp_add(prod,mpd[i])

/* Unoptimized msubs macro (msubs0) follows... */
/* #define msubs0(i) msub(prod,moduli[i]) 
*/

/*	To optimize msubs, compare the most significant units of the 
	product and the shifted modulus before deciding to call the full 
	mp_compare routine.  Better still, compare the upper two units
	before deciding to call mp_compare.
	Optimization of msubs relies heavily on standard C left-to-right 
	parsimonious evaluation of logical expressions. 
*/

/* Partially-optimized msubs macro (msubs1) follows... */
/* #define msubs1(i) if ( \
  ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
  (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
  ) mp_sub(prod,moduli[i])
*/

/* Fully-optimized msubs macro (msubs2) follows... */
#define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
 (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
 (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
 mp_sub(prod,moduli[i])


int merritt_modmult(register unitptr prod,
	unitptr multiplicand,register unitptr multiplier)
	/*	Performs combined multiply/modulo operation.  
		Computes:  prod = (multiplicand*multiplier) mod modulus
		WARNING: All the arguments must be less than the modulus!
		Assumes the modulus has been predefined by first calling 
		stage_modulus.
	*/
{
	/* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
	register signedunit p_m;
	register unitptr msu_prod;	/* ptr to most significant unit of product */
	register unitptr nmsu_prod;	/* next-most signif. unit of product */
	short mprec;		/* precision of multiplier, in units */
	/*	Array mpd contains a list of pointers to preshifted images of 
		the multiplicand: */
	static unitptr mpd[UNITSIZE] =
	{	 0,              &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
		 &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
#ifndef UNIT8
		,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
		 &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
#ifndef UNIT16	/* and not UNIT8 */
		,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
		 &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
		 &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
		 &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
#endif	/* UNIT16 and UNIT8 not defined */
#endif	/* UNIT8 not defined */
	};

	/* Compute preshifted images of multiplicand, mod n: */
	stage_mp_images(mpd,multiplicand);

	/* To optimize msubs, set up msu_prod and nmsu_prod: */
	msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
	nmsu_prod = msu_prod;
	post_lowerunit(nmsu_prod); /* Get next-MSU of prod */

	/*	To understand this algorithm, it would be helpful to first 
		study the conventional Russian peasant modmult algorithm.
		This one does about the same thing as Russian peasant, but
		is organized differently to save some steps.  It loops 
		through the multiplier a word (unit) at a time, instead of 
		a bit at a time.  It word-shifts the product instead of 
		bit-shifting it, so it should be faster.  It also does about 
		half as many subtracts as Russian peasant.
	*/

	mp_init(prod,0);	/* Initialize product to 0. */

	/*	The way mp_modmult is actually used in cryptographic 
		applications, there will NEVER be a zero multiplier or 
		multiplicand.  So there is no need to optimize for that 
		condition.
	*/
/*	if (testeq(multiplicand,0))
		return(0); */	/* zero multiplicand means zero product */
	/* Normalize and compute number of units in multiplier first: */
	normalize(multiplier,mprec);
	if (mprec==0)	/* if precision of multiplier is 0 */
		return(0);	/* zero multiplier means zero product */
	make_msbptr(multiplier,mprec); /* start at MSU of multiplier */

	while (mprec--)	/* Loop for the number of units in the multiplier */
	{
		/*	Shift the product one whole unit to the left.
			This is part of the multiply phase of modmult. 
		*/

		mp_lshift_unit(prod);

		/*	Sniff each bit in the current unit of the multiplier, 
			and conditionally add the the corresponding preshifted 
			image of the multiplicand to the product.
			This is also part of the multiply phase of modmult.

			The following loop is unrolled for speed, using macros...

		for (i=UNITSIZE-1; i>=0; i--)
		   if (*multiplier & power_of_2(i)) 
				mp_add(prod,mpd[i]);
		*/
#ifndef UNIT8
#ifndef UNIT16	/* and not UNIT8 */
		sniffadd(31); 
		sniffadd(30); 
		sniffadd(29); 
		sniffadd(28);
		sniffadd(27); 
		sniffadd(26); 
		sniffadd(25); 
		sniffadd(24);
		sniffadd(23); 
		sniffadd(22); 
		sniffadd(21); 
		sniffadd(20);
		sniffadd(19); 
		sniffadd(18); 
		sniffadd(17); 
		sniffadd(16);
#endif	/* not UNIT16 and not UNIT8 */
		sniffadd(15); 
		sniffadd(14); 
		sniffadd(13); 
		sniffadd(12);
		sniffadd(11); 
		sniffadd(10); 
		sniffadd(9); 
		sniffadd(8);
#endif	/* not UNIT8 */
		sniffadd(7); 
		sniffadd(6); 
		sniffadd(5); 
		sniffadd(4);
		sniffadd(3); 
		sniffadd(2); 
		sniffadd(1); 
		sniffadd(0);

		/*	The product may have grown by as many as UNITSIZE+1 
			bits.  That's why we have global_precision set to the
			size of the modulus plus UNITSIZE+1 slop bits.  
			Now reduce the product back down by conditionally 
			subtracting	the UNITSIZE+1 preshifted images of the 
			modulus.  This is the modulo reduction phase of modmult.

			The following loop is unrolled for speed, using macros...

		for (i=UNITSIZE; i>=0; i--)
			if (mp_compare(prod,moduli[i]) >= 0) 
				mp_sub(prod,moduli[i]); 
		*/
#ifndef UNIT8
#ifndef UNIT16	/* and not UNIT8 */
		msubs(32); 
		msubs(31); 
		msubs(30); 
		msubs(29); 
		msubs(28);
		msubs(27); 
		msubs(26); 
		msubs(25); 
		msubs(24);
		msubs(23); 
		msubs(22); 
		msubs(21); 
		msubs(20);
		msubs(19); 
		msubs(18); 
		msubs(17); 
#endif	/* not UNIT16 and not UNIT8 */
		msubs(16);
		msubs(15); 
		msubs(14); 
		msubs(13); 
		msubs(12);
		msubs(11); 
		msubs(10); 
		msubs(9); 
#endif	/* not UNIT8 */
		msubs(8);
		msubs(7); 
		msubs(6); 
		msubs(5); 
		msubs(4);
		msubs(3); 
		msubs(2); 
		msubs(1); 
		msubs(0);

		/* Bump pointer to next lower unit of multiplier: */
		post_lowerunit(multiplier);

	}	/* Loop for the number of units in the multiplier */

	return(0);	/* normal return */

}	/* merritt_modmult */


#undef msubs
#undef sniffadd


/*	Merritt's mp_modmult function leaves some internal tables in memory,
	so we have to call modmult_burn() at the end of mp_modexp.  
	This is so that no cryptographically sensitive data is left in memory 
	after the program exits.
*/
static void merritt_burn(void)
/*	Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
{	unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
	unitfill0(&(moduli_buf[0][0]),(UNITSIZE)*MAX_UNIT_PRECISION);
	unitfill0(msu_moduli,UNITSIZE+1);
	unitfill0(nmsu_moduli,UNITSIZE+1);
} /* merritt_burn() */

/******* end of Merritt's MODMULT stuff. *******/
/*=========================================================================*/
#endif	/* MERRITT */


#ifdef STEWART	/* using Zhahai Stewart's modmult algorithm */
#include "stewart.c"
#endif	/* STEWART */

#endif	/* not ASM_MODMULT */


int countbits(unitptr r)
	/* Returns number of significant bits in r */
{	int bits;
	short prec;
	register unit bitmask;
	init_bitsniffer(r,bitmask,prec,bits);
	return(bits);
} /* countbits */


int mp_modexp(register unitptr expout,register unitptr expin,
	register unitptr exponent,register unitptr modulus)
{	/*	Russian peasant combined exponentiation/modulo algorithm.
		Calls modmult instead of mult. 
		Computes:  expout = (expin**exponent) mod modulus
		WARNING: All the arguments must be less than the modulus!
	*/
	int bits;
	short oldprecision;
	register unit bitmask;
	unit product[MAX_UNIT_PRECISION];
	short eprec;

#ifdef COUNTMULTS
	tally_modmults = 0;	/* clear "number of modmults" counter */
	tally_modsquares = 0;	/* clear "number of modsquares" counter */
#endif	/* COUNTMULTS */
	mp_init(expout,1);
	if (testeq(exponent,0))
	{	if (testeq(expin,0))
			return(-1); /* 0 to the 0th power means return error */
		return(0);	/* otherwise, zero exponent means expout is 1 */
	}
	if (testeq(modulus,0))
		return(-2);		/* zero modulus means error */
#if SLOP_BITS > 0	/* if there's room for sign bits */
	if (mp_tstminus(modulus))
		return(-2);		/* negative modulus means error */
#endif	/* SLOP_BITS > 0 */
	if (mp_compare(expin,modulus) >= 0)
		return(-3); /* if expin >= modulus, return error */
	if (mp_compare(exponent,modulus) >= 0)
		return(-4); /* if exponent >= modulus, return error */

	oldprecision = global_precision;	/* save global_precision */
	/* set smallest optimum precision for this modulus */
	set_precision(bits2units(countbits(modulus)+SLOP_BITS));
	/* rescale all these registers to global_precision we just defined */
	rescale(modulus,oldprecision,global_precision);
	rescale(expin,oldprecision,global_precision);
	rescale(exponent,oldprecision,global_precision);
	rescale(expout,oldprecision,global_precision);

	if (stage_modulus(modulus))
	{	set_precision(oldprecision);	/* restore original precision */
		return(-5);		/* unstageable modulus (STEWART algorithm) */
	}

	/* normalize and compute number of bits in exponent first */
	init_bitsniffer(exponent,bitmask,eprec,bits);

	/* We can "optimize out" the first modsquare and modmult: */
	bits--;		/* We know for sure at this point that bits>0 */
	mp_move(expout,expin);		/*  expout = (1*1)*expin; */
	bump_bitsniffer(exponent,bitmask);
	
	while (bits--)
	{
		poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
#ifdef COUNTMULTS
		tally_modsquares++;	/* bump "number of modsquares" counter */
#endif	/* COUNTMULTS */
		mp_modsquare(product,expout);
		mp_move(expout,product);
		if (sniff_bit(exponent,bitmask))
		{	mp_modmult(product,expout,expin);
			mp_move(expout,product);
#ifdef COUNTMULTS
			tally_modmults++;	/* bump "number of modmults" counter */
#endif	/* COUNTMULTS */
		}
		bump_bitsniffer(exponent,bitmask);
	}	/* while bits-- */
	mp_burn(product);	/* burn the evidence on the stack */
	modmult_burn(); /* ask mp_modmult to also burn its own evidence */

#ifdef COUNTMULTS	/* diagnostic analysis */
	{	long atomic_mults;
		unsigned int unitcount,totalmults;
		unitcount = bits2units(countbits(modulus));
		/* calculation assumes modsquare takes as long as a modmult: */
		atomic_mults = (long) tally_modmults * (unitcount * unitcount);
		atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
		printf("%ld atomic mults for ",atomic_mults);
		printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
			tally_modsquares,tally_modmults,
			tally_modsquares+tally_modmults,
			countbits(modulus), unitcount);
	}
#endif	/* COUNTMULTS */

	set_precision(oldprecision);	/* restore original precision */
	return(0);		/* normal return */
}	/* mp_modexp */


char *copyright_notice(void)
/* force linker to include copyright notice in the executable object image. */
{ return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */


int rsa_decrypt(unitptr M, unitptr C,
	unitptr d, unitptr p, unitptr q, unitptr u)
	/*	Uses Chinese Remainder Theorem shortcut for RSA decryption.
		M is the output plaintext message.
		C is the input ciphertext message.
		d is the secret decryption exponent.
		p and q are the prime factors of n.
		u is the multiplicative inverse of p, mod q.
		Note that u is precomputed on the assumption that p<q.
		n, the common modulus, is not used here because of the
		Chinese Remainder Theorem shortcut.
	*/
{
	unit p2[MAX_UNIT_PRECISION];
	unit q2[MAX_UNIT_PRECISION];
	unit temp1[MAX_UNIT_PRECISION];
	unit temp2[MAX_UNIT_PRECISION];
	int status;

	mp_init(M,1);	/* initialize result in case of error */

	if (mp_compare(p,q) >= 0)	/* ensure that p<q */
	{	/* swap the pointers p and q */
		unitptr t;
		t = p;  p = q; q = t;
	}

/*	Rather than decrypting by computing modexp with full mod n
	precision, compute a shorter modexp with mod p precision... */

/*	p2 = [ (C mod p)**( d mod (p-1) ) ] mod p		*/
	mp_move(temp1,p);
	mp_dec(temp1);			/* temp1 = p-1 */
	mp_mod(temp2,d,temp1);	/* temp2 = d mod (p-1) ) */
	mp_mod(temp1,C,p);		/* temp1 = C mod p  */
	status = mp_modexp(p2,temp1,temp2,p);
	if (status < 0)	/* mp_modexp returned an error. */
		return(status);	/* error return */

/*	Now compute a short modexp with mod q precision... */

/*	q2 = [ (C mod q)**( d mod (q-1) ) ] mod q		*/
	mp_move(temp1,q);
	mp_dec(temp1);			/* temp1 = q-1 */
	mp_mod(temp2,d,temp1);	/* temp2 = d mod (q-1) */
	mp_mod(temp1,C,q);		/* temp1 = C mod q  */
	status = mp_modexp(q2,temp1,temp2,q);
	if (status < 0)	/* mp_modexp returned an error. */
		return(status);	/* error return */

/*	Now use the multiplicative inverse u to glue together the
	two halves, saving a lot of time by avoiding a full mod n
	exponentiation.  At key generation time, u was computed
	with the secret key as the multiplicative inverse of
	p, mod q such that:  (p*u) mod q = 1, assuming p<q.
*/
	if (mp_compare(p2,q2) == 0)	/* only happens if C<p */
		mp_move(M,p2);
	else
	{	/*	q2 = q2 - p2;  if q2<0 then q2 = q2 + q  */
		if (mp_sub(q2,p2))	/* if the result went negative... */
			mp_add(q2,q);		/* add q to q2 */

		/*	M = p2 + ( p * [(q2*u) mod q] ) 	*/
		mp_mult(temp1,q2,u);		/* temp1 = q2*u  */
		mp_mod(temp2,temp1,q);		/* temp2 = temp1 mod q */
		mp_mult(temp1,p,temp2);	/* temp1 = p * temp2 */
		mp_add(temp1,p2);		/* temp1 = temp1 + p2 */
		mp_move(M,temp1);		/* M = temp1 */
	}

	mp_burn(p2);	/* burn the evidence on the stack...*/
	mp_burn(q2);
	mp_burn(temp1);
	mp_burn(temp2);
	/* Do an explicit reference to the copyright notice so that the linker
	   will be forced to include it in the executable object image... */
	copyright_notice();	/* has no real effect at run time */
	return(0);	/* normal return */
}	/* rsa_decrypt */


/*
**	mp_sqrt - returns square root of a number.
**	returns -1 for error, 0 for perfect square, 1 for not perfect square.
**	Not used by any RSA encrypt or decrypt functions.
**	Used by some factoring algorithms.  This version needs optimization.
**	by Charles W. Merritt  July 15, 1989, refined by PRZ.
*/
int mp_sqrt(unitptr quotient,unitptr dividend)
	/* Quotient is returned as the square root of dividend. */
{
	register char next2bits; /* "period", or group of 2 bits of dividend */
	register unit dvdbitmask,qbitmask;
	unit remainder[MAX_UNIT_PRECISION],rjq[MAX_UNIT_PRECISION],
		divisor[MAX_UNIT_PRECISION];
	unsigned int qbits,qprec,dvdbits,dprec,oldprecision;
	int notperfect;

	mp_init(quotient,0);
	if (mp_tstminus(dividend)) /* if dividend<0, return error */
	{	mp_dec(quotient);	/* quotient = -1 */
		return(-1);
	}

	/* normalize and compute number of bits in dividend first */
	init_bitsniffer(dividend,dvdbitmask,dprec,dvdbits);
	/* init_bitsniffer returns a 0 if dvdbits is 0 */
	if (dvdbits==1)
	{	mp_init(quotient,1);	/* square root of 1 is 1 */
		return(0);
	}

	/* rescale quotient to half the precision of dividend */
	qbits = (dvdbits+1) >> 1;
	qprec = bits2units(qbits);
	rescale(quotient,global_precision,qprec);
	make_msbptr(quotient,qprec); 
	qbitmask = power_of_2( (qbits-1) & (UNITSIZE-1)) ;

	/* Set smallest optimum precision for this square root.
	   The low-level primitives are affected by the call to set_precision.
	   Even though the dividend precision is bigger than the precision
	   we will use, no low-level primitives will be used on the dividend.
	   They will be used on the quotient, remainder, and rjq, which are
	   smaller precision.
	*/
	oldprecision = global_precision;	/* save global_precision */
	set_precision(bits2units(qbits+3));	/* 3 bits of precision slop */

	/* special case: sqrt of 1st 2 (binary) digits of dividend
		is 1st (binary) digit of quotient.  This is always 1. */
	stuff_bit(quotient,qbitmask);
	bump_bitsniffer(quotient,qbitmask);
	mp_init(rjq,1); /* rjq is Right Justified Quotient */

	if (!(dvdbits & 1))
	{	/* even number of bits in dividend */
		next2bits = 2;
		bump_bitsniffer(dividend,dvdbitmask); dvdbits--;
		if (sniff_bit(dividend,dvdbitmask)) next2bits++;
		bump_bitsniffer(dividend,dvdbitmask); dvdbits--;
	}
	else
	{	/* odd number of bits in dividend */
		next2bits = 1;
		bump_bitsniffer(dividend,dvdbitmask); dvdbits--;
	}

	mp_init(remainder,next2bits-1);

	/* dvdbits is guaranteed to be even at this point */

	while (dvdbits)
	{	next2bits=0;
		if (sniff_bit(dividend,dvdbitmask)) next2bits=2;
		bump_bitsniffer(dividend,dvdbitmask); dvdbits--;
		if (sniff_bit(dividend,dvdbitmask)) next2bits++;
		bump_bitsniffer(dividend,dvdbitmask); dvdbits--;
		mp_rotate_left(remainder,(boolean)((next2bits&2)!=0));
		mp_rotate_left(remainder,(boolean)((next2bits&1)!=0));

		/*	"divisor" is trial divisor, complete divisor is 4*rjq 
			or 4*rjq+1.
			Subtract divisor times its last digit from remainder.
			If divisor ends in 1, remainder -= divisor*1,
			or if divisor ends in 0, remainder -= divisor*0 (do nothing).
			Last digit of divisor inflates divisor as large as possible
			yet still subtractable from remainder.
		*/
		mp_move(divisor,rjq);		/* divisor = 4*rjq+1 */
		mp_rotate_left(divisor,0);
		mp_rotate_left(divisor,1);
		if (mp_compare(remainder,divisor) >= 0)
		{	mp_sub(remainder,divisor);
			stuff_bit(quotient,qbitmask);
			mp_rotate_left(rjq,1);
		}
		else
			mp_rotate_left(rjq,0);
		bump_bitsniffer(quotient,qbitmask);
	}
	notperfect = testne(remainder,0); /* not a perfect square? */
	set_precision(oldprecision);	/* restore original precision */
	return(notperfect);	/* normal return */

}	/* mp_sqrt */


/*	These are notes on computing the square root the manual old-fashioned 
	way.  This is the basis of the above fast sqrt algorthm:

1)	Separate the number into groups (periods) of two digits each,
	beginning with units or at the decimal point.
2)	Find the greatest perfect square in the left hand period & write 
	its	square root as the first figure of the required root.  Subtract
	the square of this number from the left hand period.  Annex to the
	remainder the next group so as to form a dividend.
3)	Double the root already found and write it as a partial divisor at 
	the left of the new dividend.  Annex one zero digit, making a trial 
	divisor, and divide the new dividend by the trial divisor.
4)	Write the quotient in the root as the trial term and also substitute 
	this quotient for the annexed zero digit in the partial divisor, 
	making the latter complete.
5)	Multiply the complete divisor by the figure just obtained and, 
	if possible, subtract the product from the last remainder.
	If this product is too large, the trial term of the quotient
	must be replaced by the next smaller number and the operations
	preformed as before.
	(IN BINARY, OUR TRIAL TERM IS ALWAYS 1 AND WE USE IT OR DO NOTHING.)
6)	Proceed in this manner until all periods are used.
	If there is still a remainder, it's not a perfect square.
*/


/****************** end of RSA library ****************************/



unix.superglobalmegacorp.com