File:  [PGP] / pgp / contrib / idea / idea.asm
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 16:39:53 2018 UTC (8 years, 1 month ago) by root
Branches: phill, MAIN
CVS tags: pgp23a, HEAD
PGP 2.3a

	%PAGESIZE	59	; Turbo assembler formatting codes
	%BIN	13
	%LINUM	3

; Copyright (c) 1993 Colin Plumb.  This code may be freely
; distributed under the terms of the GNU General Public Licence.

	.model large
	.code

; A core operation in IDEA is multiplication modulo 65537.
; The valid inputs, 1 through 66636 inclusive are represented in
; 16-bit registers modulo 65536.  I.e. a value of 0 means 65536,
; or -1.  Thus, we need to test for that specially.  -x, modulo
; 65537, is 65537-x = 1-x.
; For any other number, represent the product as a*65536+b.  Since
; 65536 = -1 (mod 65537), this is the same number as b-a.  Should
; this result be negautive (generate a borrow), -n mod 65537 = 1-n
; mod 65536.  Or in other words, if you add the borrow bit back on,
; you get the right answer.

; This is what the assembly code does.  It forms a zero, and adds
; that on with carry.

; Another useful optimisation takes advantage of the fact that
; a and b are equal only if the answer is congruent to 0 mod 65537.
; Since 65537 is prime, this happens only if one of the inputs is
; congruent to 0 mod 65537.  Since the inputs are all less than 65537,
; this means it must have been zero.

; The code below tests for a zero result of the subtraction, and if
; one arises, it branches out of line to figure out what happened.


; This code implemets the IDEA encryption algorithm.
; It follows in pseudo-C, where the * operator operates
; modulo 65537, as Idea needs.  (If you don't understand,
; learn IDEA better.)

; IDEA is works on 16-bit units.  If you're processing bytes,
; it's defined to be big-endian, so an Intel machine needs to
; swap the bytes around.

; void Idea(u_int16 *in, u_int16 *out, u_int16 *key)
; {
;	register u+int16 x0, x1, x2, x3, s1, s2, round;
;
;	x0 = *in++;  x1 = *in++;  x2 = *in++;  x3 = *in;
;
;	for (round = 0; round < 8; round++) {
;		x0 *= *key++;
;		x1 += *key++;
;		x2 += *key++;
;		x3 *= *key++;
;
;		s1  = x1;  s2  = x2;
;		x2 ^= x0;  x1 ^= x3;
;
;		x2 *= *key++;
;		x1 += x2;
;		x1 *= *key++;
;		x2 += x1;
;
;		x0 ^= x1;  x3 ^= x2;
;		x1 ^= s2;  x2 ^= s1;
;	}
;	*out++ = x0 * *key++;
;	*out++ = x2 + *key++;	/* Yes, this is x2, not x1 */
;	*out++ = x1 + *key++;
;	*out   = x3 * *key;
; }

; ds:si points to key, ax, dx are temps, args in bx, cx, di, bp
; Trashes *all* registers.  direction flag must be clear.
; Leaves es zero.

; Since there is no spare register to hold the loop count, I make
; clever use of the stack, pushing the start of the loop several
; times and using a ret instruction to do the return.

; Annoyingly, lods is fastest on 8086's, but other techniques are
; best on 386's.  Well, that's what the manual says, but real
; life is different.  USELODS wins on a 386SX, at least.
; Leave it set for all platforms.

USELODS	equ	1

; bp must be x0 for some of the code below to work
x0	equ	bp
x1	equ	bx
x2	equ	cx
x3	equ	di
; di must be x3 for some of the code below to work

;; Now, this is rather interesting.  We test for zero arguments
;; after the multiply.  Assuming random inputs, one or both are
;; zero (2^17-1)/2^32, or approximately 1/32786 of the time.
;; Encryption in any feedback mode produces essentially random
;; inputs, so average-case analysis is okay.  While we don't
;; want the out-of-line code to waste time, it is not worth
;; slowing down the in-line case to speed it up.
;;
;; Basically, we start inverting the source x, and if that was 0,
;; we use the inverse of the key instead.

Core1Z:
	neg	x0
	jnz	Core1Za
if USELODS
	sub	x0,[si-2]
else
	sub	x0,[si]
endif
Core1Za:
	inc	x0
	jmp	Core1done
Core2Z:
	neg	x3
	jnz	Core2Za
if USELODS
	sub	x3,[si-2]
else
	sub	x3,[si+6]
endif
Core2Za:
	inc	x3
	jmp	Core2done
Core3Z:
	neg	x2
	jnz	Core3Za
if USELODS
	sub	x2,[si-2]
else
	sub	x2,[si+8]
endif
Core3Za:
	inc	x2
	jmp	Core3done
Core4Z:
	neg	x1
	jnz	Core4Za
if USELODS
	sub	x1,[si-2]
else
	sub	x1,[si+10]
endif
Core4Za:
	inc	x1
	jmp	Core4done

; We need a constant 0 that we can move into a register without affecting
; the carry flag (as the classic xor ax,ax is wont to do), so we use the
; es register for a constant 0 source.  This is okay even in protected
; mode.  (I *told* you this was tricky code!)

; BTW, since you wanted to know, this is 8 + 78*4 + 16 = 336 instructions.

Core	proc	near
	xor	ax,ax
	mov	es,ax
	mov	ax,OFFSET Finish
	push	ax
	mov	ax,OFFSET Coreloop
	push	ax	; Loop 3 times, then return
	push	ax
	push	ax

Coreloop:
if USELODS
	lodsw
else
	mov	ax,[si]		; x0 *= *key++
endif
	mul	x0
	sub	ax,dx
	jz	Core1Z
	mov	x0,es
	adc	x0,ax
Core1done:

if USELODS
	lodsw
	add	x1,ax
	lodsw
	add	x2,ax
else
	add	x1,[si+2]	; x1 += *key++
	add	x2,[si+4]	; x2 += *key++
endif

if USELODS
	lodsw
else
	mov	ax,[si+6]	; x3 += *key++
endif
	mul	x3
	sub	ax,dx
	jz	Core2Z
	mov	x3,es
	adc	x3,ax
Core2done:

	push	x1          	; s1 = x1
	push	x2		; s2 = x2

	xor	x1,x3		; x1 ^= x3
	xor	x2,x0		; x2 ^= x0

if USELODS
	lodsw
else
	mov	ax,[si+8]	; x2 *= *key++
endif
	mul	x2
	sub	ax,dx
	jz	Core3Z
	mov	x2,es
	adc	x2,ax
Core3done:

	add	x1,x2		; x1 += x2

if USELODS
	lodsw
else
	mov	ax,[si+10]	; x1 *= *key++
endif
	mul	x1
	sub	ax,dx
	jz	Core4Z
	mov	x1,es
	adc	x1,ax
Core4done:

	add	x2,x1		; x2 += x1

	xor	x0,x1		; x0 ^= x1
	xor	x3,x2		; x3 ^= x2

	pop	dx
	xor	x1,dx		; x1 ^= s2
	pop	dx
	xor	x2,dx		; x2 ^= s1

; Second unrolling of loop
if USELODS
	lodsw
else
	mov     ax,[si+12]  	; x0 *= *key++
endif
	mul	x0
	sub	ax,dx
	jz	Core5Z
	mov	x0,es
	adc	x0,ax
Core5done:

if USELODS
	lodsw
	add	x1,ax
	lodsw
	add	x2,ax
else
	add	x1,[si+14]	; x1 += *key++
	add	x2,[si+16]	; x2 += *key++
endif

if USELODS
	lodsw
else
	mov	ax,[si+18]	; x3 *= *key++
endif
	mul	x3
	sub	ax,dx
	jz	Core6Z
	mov	x3,es
	adc	x3,ax
Core6done:

	push	x1           	; s1 = x1
	push	x2		; s2 = x2

	xor	x1,x3		; x1 ^= x3
	xor	x2,x0		; x2 ^= x0

if USELODS
	lodsw
else
	mov	ax,[si+20]	; x2 *= *key++
endif
	mul	x2
	sub	ax,dx
	jz	Core7Z
	mov	x2,es
	adc	x2,ax
Core7done:

	add	x1,x2		; x1 += x2

if USELODS
	lodsw
else
	mov	ax,[si+22]	; x1 *= *key++
endif
	mul	x1
	sub	ax,dx
	jz	Core8Z
	mov	x1,es
	adc	x1,ax
Core8done:

	add	x2,x1		; x2 += x1

	xor	x0,x1		; x0 ^= x1
	xor	x3,x2		; x3 ^= x2

	pop	dx
	xor	x1,dx		; x1 ^= s2
	pop	dx
	xor	x2,dx		; x2 ^= s1

ife USELODS
	lea	si,[si+24]
endif

	ret	; Used as a loop instruction!

Core5Z:
	neg	x0
	jnz	Core5Za
if USELODS
	sub	x0,[si-2]
else
	sub	x0,[si+12]
endif
Core5Za:
	inc	x0
	jmp	Core5done
Core6Z:
	neg	x3
	jnz	Core6Za
if USELODS
	sub	x3,[si-2]
else
	sub	x3,[si+18]
endif
Core6Za:
	inc	x3
	jmp	Core6done
Core7Z:
	neg	x2
	jnz	Core7Za
if USELODS
	sub	x2,[si-2]
else
	sub	x2,[si+20]
endif
Core7Za:
	inc	x2
	jmp	Core7done
Core8Z:
	neg	x1
	jnz	Core8Za
if USELODS
	sub	x1,[si-2]
else
	sub	x1,[si+22]
endif
Core8Za:
	inc	x1
	jmp	Core8done
Core9Z:
	neg	x0
	jnz	Core9Za
if USELODS
	sub	x0,[si-2]
else
	sub	x0,[si]
endif
Core9Za:
	inc	x0
	jmp	Core9done
; Special: compute into dx (zero on entry)
Core10Z:
	sub	dx,x3
	jnz	Core10Za
if USELODS
	sub	dx,[si-2]
else
	sub	dx,[si+6]
endif
Core10Za:
	inc	dx
;	jmp	Core10done
	ret


Finish:
if USELODS
	lodsw
else
	mov	ax,[si]		; x0 *= *key++
endif
	mul	x0
	sub	ax,dx
	jz	Core9Z
	mov	x0,es
	adc	x0,ax
Core9done:

	xchg	x1,x2
if USELODS
	lodsw
	add	x1,ax
	lodsw
	add	x2,ax
else
	add	x1,[si+2]	; x1 += *key++
	add	x2,[si+4]       ; x2 += *key++
endif

; This is special: compute into dx, not x3
if USELODS
	lodsw
else
	mov	ax,[si+6]	; x3 *= *key++
endif
	mul	x3
	sub	ax,dx
	mov	dx,es
	jz	Core10Z
	adc	dx,ax
Core10done:

	ret

	endp


; Args are in, out, key
	public	_Idea2
_Idea2 proc far
	cld
	push	bp   	; Args start at [bp+6]
	mov	bp,sp
	push	si
	push	di
	push	ds	; 6 more words here, so args are at [sp+12]
	lds	si,[bp+6]	; in
	lodsw
	xchg	ah,al
	mov	dx,ax
	lodsw
	xchg	ah,al
	mov	x1,ax
	lodsw
	xchg	ah,al
	mov	x2,ax
	lodsw
	xchg	ah,al
	mov	x3,ax
	lds	si,[bp+14]	; key

	mov	x0,dx

	call	Core

	mov	ax,x0
	mov	bp,sp
	les	di,[bp+16]
	xchg	ah,al
	stosw
	mov	ax,x1
	xchg	ah,al
	stosw
	mov	ax,x2
	xchg	ah,al
	stosw
	mov	ax,x3
	xchg	ah,al
	stosw

	pop	ds
	pop	di
	pop	si
	pop	bp

	ret

	endp

; Okay, the basic plan for the CFB kernel is
; get x0,x1,x2,x3
; get key pointer
; call core
; get buffer pointers
;Loop:
; lodsw
; xor	ax,x0
; mov   x0,ax
; stosw
; lodsw
; xor	ax,x1
; mov	x0,ax
; stosw
; lodsw
; xor	ax,x2
; mov	x0,ax
; stosw
; lodsw
; xor	ax,x3
; mov	x3,ax
; stosw
; push buffer pointers
; get key pointer
; call	core
; pop buffer pointers
; loop
; lodsw/xor/etc.
;
;
; This function is designed to go in the middle of a byte-granularity
; CFB engine.  It performs "len" encryptions of the IV, encrypting
; 8*(len-1) bytes from the source to the destination.  The idea is
; that you first xor any odd leading bytes, then call this function,
; then xor up to 8 trailing bytes.

; The main loop in this is 38 instructions, plus the 336 for the core
; makes 374 total.  That's 46.75 instructions per byte.
; (It's the same for IdeaCFBx)

; IV, key, plain, cipher, len
	public	_IdeaCFB
_IdeaCFB proc far       ; Args are at [sp+4]
	cld
	push	bp
	push	si
	push	di
	push	ds	; 8 more words here, so args are at [sp+12]
; To be precise, IV is at 12, key at 16, plain at 20,
; cipher at 24 and len at 28
	mov	bp,sp
	lds	si,[bp+12]	; IV
; Load and byte-swap IV
	mov	ax,[si]
	xchg	ah,al
	mov	x1,[si+2]
	mov	x2,[si+4]
	xchg	bh,bl
	xchg	ch,cl
	mov	dx,[si+6]
	xchg	dh,dl

	lds	si,[bp+16]	; Key
	mov	x0,ax
	mov	x3,dx

	call	Core
IdeaCFBLoop:
;	mov	ax,x0
;	mov	bp,sp
;	dec	WORD PTR [bp+28]	; Decrement count
;	jz	IdeaCFBEnd
;	lds	si,[bp+20]
;	les	di,[bp+24]
;	mov	x0,ax
; Alternate code: (which is faster?  Two moves or three segment overrides?)
	mov	si,sp
	dec	WORD PTR ss:[si+28]
	jz	IdeaCFBEnd
	les	di,ss:[si+24]
	lds	si,ss:[si+20]

	lodsw
	xchg	ah,al
	xor	ax,x0
	mov	x0,ax
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xor	ax,x1
	mov	x1,ax
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xor	ax,x2
	mov	x2,ax
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xor	ax,dx
	mov	dx,ax
	xchg	ah,al
	stosw

;	mov	ax,x0
;	mov	bp,sp
;	mov	[bp+20],si	; Save source offset
;	mov	[bp+24],di	; Save destination offset
;	lds	si,[bp+16]	; Key
;	mov	x0,ax		; Get x0 in place for another iteration
; Alternate code for the above: (which is faster?  One move or three ss:?)
	mov	ax,si
	mov	si,sp
	mov	ss:[si+20],ax
	mov	ss:[si+24],di
	lds	si,ss:[si+16]

	mov	x3,dx		; Get x3 in place
	mov	ax,OFFSET IdeaCFBLoop
	push	ax
	jmp	Core

IdeaCFBEnd:
;	lds	si,[bp+12]
	lds	di,ss:[si+12]	; Get IV for writing back

	mov	ax,x0
	xchg	ah,al
	mov	[di],ax		; Use stosw?
	xchg	bh,bl
	xchg	ch,cl
	mov	[di+2],x1
	mov	[di+4],x2
	xchg	dh,dl
	mov	[di+6],dx

	pop	ds
	pop	di
	pop	si
	pop	bp

	ret

	endp

; This decoding step is similar, except that instead of
;	lods
;	xor	x0,ax
;	mov	ax,x0
; 	stos
; the feedback step is
;	lods
;	xchg	x0,ax
;	xor	ax,x0
;	stos

; IV, key, cipher, plain, len
	public	_IdeaCFBx
_IdeaCFBx proc far       ; Args are at [sp+4]
	cld
	push	bp
	push	si
	push	di
	push	ds	; 8 more words here, so args are at [sp+12]
	mov	bp,sp
	lds	si,[bp+12]	; IV
; Load and byte-swap IV
	mov	ax,[si]
	xchg	ah,al
	mov	x1,[si+2]
	mov	x2,[si+4]
	xchg	bh,bl
	xchg	ch,cl
	mov	dx,[si+6]
	xchg	dh,dl

	lds	si,[bp+16]	; Key
	mov	x0,ax
	mov	x3,dx

	call	Core
IdeaCFBxLoop:
;	mov	ax,x0
;	mov	bp,sp
;	dec	WORD PTR [bp+28]	; Decrement count
;	jz	IdeaCFBxEnd
;	lds	si,[bp+20]
;	les	di,[bp+24]
;	mov	x0,ax
; Alternate code: (which is faster?  Two moves or three segment overrides)
	mov	si,sp
	dec	WORD PTR ss:[si+28]
	jz	IdeaCFBxEnd
	les	di,ss:[si+24]
	lds	si,ss:[si+20]

	lodsw
	xchg	ah,al
	xchg	x0,ax
	xor	ax,x0
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xchg	x1,ax
	xor	ax,x1
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xchg	x2,ax
	xor	ax,x2
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xchg	dx,ax
	xor	ax,dx
	xchg	ah,al
	stosw

;	mov	ax,x0
;	mov	bp,sp
;	mov	[bp+20],si	; Save source offset
;	mov	[bp+24],di	; Save destination offset
;	lds	si,[bp+16]	; Key
;	mov	x0,ax		; Get x0 in place for another iteration
; Alternate code for the above: (which is faster?  One move or three ss:?)
	mov	ax,si
	mov	si,sp
	mov	ss:[si+20],ax
	mov	ss:[si+24],di
	lds	si,ss:[si+16]

	mov	x3,dx		; Get x3 in place
	mov	ax,OFFSET IdeaCFBxLoop
	push	ax
	jmp	Core

IdeaCFBxEnd:
;	lds	si:[bp+12]
	lds	di,ss:[si+12]	; Get IV for writing back

	mov	ax,x0
	xchg	ah,al
	mov	[di],ax		; Use stosw?
	xchg	bh,bl
	xchg	ch,cl
	mov	[di+2],x1
	mov	[di+4],x2
	xchg	dh,dl
	mov	[di+6],dx


	pop	ds
	pop	di
	pop	si
	pop	bp

	ret

	endp




	end

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.