--- pgp/src/8086.asm 2018/04/24 16:37:53 1.1.1.1 +++ pgp/src/8086.asm 2018/04/24 16:38:48 1.1.1.2 @@ -1,359 +1,518 @@ -; Assembly primitives for RSA multiprecision library -; -; Tested with Turbo Assembler 1.0 and masm 1.00 -; -; Written by Branko Lankester (lankeste@fwi.uva.nl) 10/10/91 - -; define LDATA and LCODE as follows: -; model: small compact medium large -; LDATA 0 1 0 1 -; LCODE 0 0 1 1 - -LDATA equ 1 -LCODE equ 1 - -IF LDATA -DSTPTR equ es:[bx+si] -ELSE -DSTPTR equ [bx+si] -ENDIF - -IF LCODE -prec equ [bp+6] ; 1st arg -r1 equ [bp+6] ; 1st arg -IF LDATA -r2 equ [bp+10] ; 2nd arg -carry equ [bp+14] ; 3rd arg -scarry equ [bp+10] ; carry for shift (arg 2) -ELSE -r2 equ [bp+8] -carry equ [bp+10] -scarry equ [bp+8] -ENDIF -ELSE ; small code model -prec equ [bp+4] -r1 equ [bp+4] -IF LDATA -r2 equ [bp+8] -carry equ [bp+12] -scarry equ [bp+8] -ELSE -r2 equ [bp+6] -carry equ [bp+8] -scarry equ [bp+6] -ENDIF -ENDIF - - -_TEXT segment byte public 'CODE' -DGROUP group _DATA,_BSS - assume cs:_TEXT,ds:DGROUP -_TEXT ends - -_DATA segment word public 'DATA' -_DATA ends - -_BSS segment word public 'BSS' -prec16 dw ? ; precision / 16 (seems to be / 256?) -unitprec dw ? ; precision / 16, really -addp dw ? ; jump offset -subp dw ? -rotp dw ? -mulp dw ? -_BSS ends - -_TEXT segment byte public 'CODE' - - public _P_SETP - public _P_ADDC - public _P_SUBB - public _P_ROTL - -IF LCODE -fprims proc far ; dummy proc -ELSE -fprims proc near -ENDIF - -; -; ******************** set precision ******************** -; -_P_SETP: - push bp - mov bp,sp - mov ax, prec ; precision in bits - add ax, 0fh - mov cl,4 - shr ax,cl ; prec. 
in units - mov unitprec,ax - push ax - shr ax,cl - mov prec16,ax ; precision / 16 - pop ax - and ax,0fh ; al = prec % 16 - mov bx,ax - mov cx,ax - shl bx,1 ; multiply by 4 (=number of bytes - shl bx,1 ; in instruction sequence) - mov dx,bx -IFE LDATA - sub dx,ax ; small model only 3 for add/sub -ENDIF - mov ax,offset add_ref - sub ax,dx - mov addp,ax - - mov ax,offset sub_ref - sub ax,dx - mov subp,ax - - mov ax,offset rot_ref - sub ax,bx - mov rotp,ax - - mov ax,offset mul_ref - shl bx,1 ; MULU macro is 17 bytes for large data - shl bx,1 - sub ax,bx - sub ax,cx - mov mulp,ax - - pop bp - ret - - - -; -; ******************** mpi add with carry ******************** -; -ADDU macro n - rept n - lodsw - adc DSTPTR,ax - endm -endm - - -_P_ADDC: - push bp - mov bp,sp - push si - mov cx, prec16 - mov dx, addp -IF LDATA - push ds - lds si, dword ptr r2 - les bx, dword ptr r1 -ELSE - mov si, r2 - mov bx, r1 -ENDIF - sub bx, si ; calculate relative offset - dec bx - dec bx - cld - shr byte ptr carry,1 ; load carry - jcxz add_units -add_16u: - ADDU 16 - loop add_16u -add_units: - jmp dx - ADDU 15 -add_ref: - rcl ax,1 ; return carry - and ax,1 -IF LDATA - pop ds -ENDIF - pop si - pop bp - ret - - - -; -; ******************** mpi subtract with borrow ******************** -; -SUBU macro n - rept n - lodsw - sbb DSTPTR,ax - endm -endm - - -_P_SUBB: - push bp - mov bp,sp - push si - mov cx, prec16 - mov dx, subp -IF LDATA - push ds - lds si, dword ptr r2 - les bx, dword ptr r1 -ELSE - mov si, r2 - mov bx, r1 -ENDIF - sub bx, si ; calculate relative offset - dec bx - dec bx - cld - shr byte ptr carry,1 - jcxz sub_units -sub_16u: - SUBU 16 - loop sub_16u -sub_units: - jmp dx - SUBU 15 -sub_ref: - rcl ax,1 ; return carry - and ax,1 -IF LDATA - pop ds -ENDIF - pop si - pop bp - ret - - - -; -; ******************** mpi rotate left ******************** -; -_P_ROTL: - push bp - mov bp,sp - mov cx, prec16 - mov dx, rotp -IF LDATA - push ds - lds bx, dword ptr r1 -ELSE - mov bx, r1 -ENDIF 
- shr byte ptr scarry,1 - jcxz rot_units -rot_16u: - i = 0 - rept 16 - rcl word ptr [bx + i],1 - i = i + 2 - endm - lahf - add bx,32 - sahf - loop rot_16u -rot_units: - jmp dx - rept 15 - rcl word ptr [bx],1 - inc bx - inc bx - endm -rot_ref: - - rcl ax,1 - and ax,1 -IF LDATA - pop ds -ENDIF - pop bp - ret - -fprims endp - -_TEXT ends - - - -; *************************************************************** -; P_SMUL (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier) -; mp_smul routine from Upton's modmult, converted to assembler -; -; Multiply the single-word multiplier times the multiprecision integer -; in multiplicand, accumulating result in prod. The resulting -; multiprecision prod will be 1 word longer than the multiplicand. -; multiplicand is unit_prec words long. We add into prod, so caller -; should zero it out first. -; -; NOTE: Unlike other functions in the multiprecision arithmetic -; library, both multiplicand and prod are pointing at the LSB, -; regardless of byte order of the machine. On an 80x86, this makes -; no difference. But if this assembly function is implemented -; on a 680x0, it becomes important. 
-; *************************************************************** -; Variable assignments: -; multiplier = [bp+14] -; multiplicand = [ds:di] 32-bit pointer -; prod = [es:si] 32-bit pointer -; unit_prec = cx -; p = ax-dx -; carry = bx -UPTON_TEXT SEGMENT WORD PUBLIC 'CODE' -UPTON_TEXT ENDS -UPTON_TEXT SEGMENT - ASSUME CS: UPTON_TEXT - ASSUME DS: DGROUP - PUBLIC _P_SMUL - -MULU macro n - rept n - lodsw ;multiplicand - mul bp ;multiplier, results (p) to AX/DX - add ax,bx ;carry - adc dx,0 - add ax,WORD PTR es:[di] - adc dx,0 - mov bx,dx ;carry - stosw - endm -endm - -_P_SMUL PROC FAR - push bp - mov bp,sp - push di - push si - push ds - mov cx,prec16 - mov ax,mulp - push ax - - sub bx,bx ;carry = 0, store in bx - - les di,DWORD PTR [bp+6] ;prod in es:di - lds si,DWORD PTR [bp+10] ;multiplicand in ds:si - cld - mov bp,[bp+14] - - or cx,cx - jnz mul_16u - jmp mul_units -mul_16u: - MULU 16 - dec cx - jz mul_units - jmp mul_16u -mul_units: - pop cx - jmp cx - MULU 15 -mul_ref: - - ; We know that the high-order word of prod will always be 0 - mov WORD PTR es:[di],bx ;store carry in prod empty high word - - pop ds - pop si - pop di - pop bp - ret - -_P_SMUL ENDP -UPTON_TEXT ends - end - +; Assembly primitives for RSA multiprecision library +; +; Tested with Turbo Assembler 1.0 and masm 1.00 +; +; Written by Branko Lankester (lankeste@fwi.uva.nl) 10/10/91 +; +; Modified to add, rather than store carry bit to allow using a +; smaller precision for long division. + +; define LDATA and LCODE as follows: +; model: small compact medium large +; LDATA 0 1 0 1 +; LCODE 0 0 1 1 + +LDATA equ 1 +LCODE equ 1 + +; Note: Only the large memory model has been implemented for P_SMULA, +; P_SETRECIP and P_QUO_DIGIT. 
+
+; DSTPTR is the destination operand of the ADDU/SUBU macros.  bx holds
+; (r1 offset - r2 offset - 2) and si has already been advanced past the
+; source word by lodsw, so bx+si addresses the matching word of r1.
+; With large data the destination lives in the segment loaded into es.
+IF LDATA
+DSTPTR  equ     es:[bx+si]
+ELSE
+DSTPTR  equ     [bx+si]
+ENDIF
+
+; Argument offsets relative to bp after the "push bp / mov bp,sp"
+; prologue every routine below uses.  The first argument is at bp+6 for
+; large code and bp+4 for small code; with large data each pointer
+; argument occupies 4 bytes (it is loaded with lds/les), otherwise 2.
+IF LCODE
+prec    equ     [bp+6]          ; 1st arg
+r1      equ     [bp+6]          ; 1st arg
+IF LDATA
+r2      equ     [bp+10]         ; 2nd arg
+carry   equ     [bp+14]         ; 3rd arg
+scarry  equ     [bp+10]         ; carry for shift (arg 2)
+ELSE
+r2      equ     [bp+8]
+carry   equ     [bp+10]
+scarry  equ     [bp+8]
+ENDIF
+ELSE    ; small code model
+prec    equ     [bp+4]
+r1      equ     [bp+4]
+IF LDATA
+r2      equ     [bp+8]
+carry   equ     [bp+12]
+scarry  equ     [bp+8]
+ELSE
+r2      equ     [bp+6]
+carry   equ     [bp+8]
+scarry  equ     [bp+6]
+ENDIF
+ENDIF
+
+; NOTE(review): UPTON_TEXT is not referenced anywhere else in this
+; revision; this alias looks like a leftover from the version that kept
+; the multiply routine in its own segment -- confirm before removing.
+IF NOT LCODE
+UPTON_TEXT = _TEXT
+ENDIF
+
+_TEXT   segment byte public 'CODE'
+DGROUP  group   _DATA,_BSS
+        assume  cs:_TEXT,ds:DGROUP
+_TEXT   ends
+
+_DATA   segment word public 'DATA'
+_DATA   ends
+
+; State written by _P_SETP and read by the arithmetic primitives.
+_BSS    segment word public 'BSS'
+prec16  dw      ?       ; unitprec / 16 = number of complete 16-unit groups
+unitprec dw     ?       ; (precision in bits + 15) / 16 = number of 16-bit units
+addp    dw      ?       ; jump offset: entry point into the ADDU 15 tail
+subp    dw      ?       ; jump offset: entry point into the SUBU 15 tail
+rotp    dw      ?       ; jump offset: entry point into the rotate tail
+mulp    dw      ?       ; jump offset: entry point into the MULU 15 tail
+_BSS    ends
+
+_TEXT   segment byte public 'CODE'
+
+        public  _P_SETP
+        public  _P_ADDC
+        public  _P_SUBB
+        public  _P_MUSUBB
+        public  _P_ROTL
+
+IF LCODE
+fprims  proc    far     ; dummy proc
+ELSE
+fprims  proc    near
+ENDIF
+
+;
+; ******************** set precision ********************
+;
+; P_SETP(prec): prec is the precision in bits.
+; Stores unitprec = (prec + 15) / 16 and prec16 = unitprec / 16, then
+; precomputes addp/subp/rotp/mulp.  Each of those is the offset of the
+; corresponding *_ref label minus (unitprec mod 16) times the byte size
+; of one unrolled step, so that "jmp" to it executes exactly the
+; leftover units.  The byte sizes used here (4, or 3 for add/sub with
+; small data, and 17 for MULU) must match what the assembler emits for
+; the macro bodies; changing those macros requires changing this code.
+; Clobbers ax, bx, cx, dx.
+;
+_P_SETP:
+        push    bp
+        mov     bp,sp
+        mov     ax, prec        ; precision in bits
+        add     ax, 0fh         ; round up to a whole unit
+        mov     cl,4
+        shr     ax,cl           ; prec. in units
+        mov     unitprec,ax
+        push    ax
+        shr     ax,cl
+        mov     prec16,ax       ; units / 16 (whole 16-unit groups)
+        pop     ax
+        and     ax,0fh          ; ax = units % 16 (leftover units)
+        mov     bx,ax
+        mov     cx,ax           ; cx = leftover units, kept for the mulp calculation
+        shl     bx,1            ; multiply by 4 (=number of bytes
+        shl     bx,1            ;  in instruction sequence)
+        mov     dx,bx
+IFE LDATA
+        sub     dx,ax           ; small model only 3 for add/sub
+ENDIF
+        mov     ax,offset add_ref
+        sub     ax,dx
+        mov     addp,ax
+
+        mov     ax,offset sub_ref
+        sub     ax,dx
+        mov     subp,ax
+
+        mov     ax,offset rot_ref
+        sub     ax,bx           ; rotate step is 4 bytes in either data model
+        mov     rotp,ax
+
+        mov     ax,offset mul_ref
+        shl     bx,1            ; MULU macro is 17 bytes for large data
+        shl     bx,1            ; bx = 16 * leftover
+        sub     ax,bx
+        sub     ax,cx           ; ... plus 1 * leftover = 17 * leftover
+        mov     mulp,ax
+
+        pop     bp
+        ret
+
+
+
+;
+; ******************** mpi add with carry ********************
+;
+; ADDU n: n unrolled steps of "load next source word, add it with carry
+; into the matching destination word".  The carry flag chains from one
+; step to the next.
+;
+ADDU    macro   n
+        rept    n
+        lodsw
+        adc     DSTPTR,ax
+        endm
+endm
+
+
+;
+; P_ADDC(r1, r2, carry): r1 += r2 + (carry & 1) over unitprec units.
+; Returns the carry out (0 or 1) in ax.
+; Processes prec16 groups of 16 units in a loop, then jumps into the
+; 15-step tail at the offset precomputed in addp.
+; The carry argument's stack slot is modified in place (shifted right).
+; Clobbers bx, cx, dx (and es with large data); si and ds are preserved.
+;
+_P_ADDC:
+        push    bp
+        mov     bp,sp
+        push    si
+        mov     cx, prec16      ; read DGROUP variables before ds is reloaded
+        mov     dx, addp
+IF LDATA
+        push    ds
+        lds     si, dword ptr r2
+        les     bx, dword ptr r1
+ELSE
+        mov     si, r2
+        mov     bx, r1
+ENDIF
+        sub     bx, si          ; calculate relative offset
+        dec     bx              ; -2 because lodsw has already advanced si
+        dec     bx              ;  when DSTPTR is evaluated
+        cld
+        shr     byte ptr carry,1        ; load carry
+        jcxz    add_units       ; no complete 16-unit group
+add_16u:
+        ADDU    16
+        loop    add_16u         ; loop leaves the flags (and so the carry) untouched
+add_units:
+        jmp     dx              ; enter the tail so only the leftover units run
+        ADDU    15
+add_ref:
+        rcl     ax,1            ; return carry
+        and     ax,1
+IF LDATA
+        pop     ds
+ENDIF
+        pop     si
+        pop     bp
+        ret
+
+
+
+;
+; ******************** mpi subtract with borrow ********************
+;
+; SUBU n: n unrolled steps of "load next source word, subtract it with
+; borrow from the matching destination word".
+;
+SUBU    macro   n
+        rept    n
+        lodsw
+        sbb     DSTPTR,ax
+        endm
+endm
+
+
+;
+; P_SUBB(r1, r2, borrow) / P_MUSUBB: r1 -= r2 + (borrow & 1) over
+; unitprec units.  Returns the borrow out (0 or 1) in ax.
+; Same structure as P_ADDC, using subp for the tail entry point.
+; The borrow argument's stack slot is modified in place.
+;
+_P_MUSUBB:                      ; MULTUNIT is same size as unit
+_P_SUBB:
+        push    bp
+        mov     bp,sp
+        push    si
+        mov     cx, prec16      ; read DGROUP variables before ds is reloaded
+        mov     dx, subp
+IF LDATA
+        push    ds
+        lds     si, dword ptr r2
+        les     bx, dword ptr r1
+ELSE
+        mov     si, r2
+        mov     bx, r1
+ENDIF
+        sub     bx, si          ; calculate relative offset
+        dec     bx
+        dec     bx
+        cld
+        shr     byte ptr carry,1        ; load borrow into the carry flag
+        jcxz    sub_units
+sub_16u:
+        SUBU    16
+        loop    sub_16u
+sub_units:
+        jmp     dx              ; enter the tail so only the leftover units run
+        SUBU    15
+sub_ref:
+        rcl     ax,1            ; return carry
+        and     ax,1
+IF LDATA
+        pop     ds
+ENDIF
+        pop     si
+        pop     bp
+        ret
+
+
+
+;
+; ******************** mpi rotate left ********************
+;
+; P_ROTL(r1, carry): rotate the unitprec-unit number at r1 left by one
+; bit through the carry flag.  Bit 0 of the carry argument is shifted
+; into the lowest word; the bit shifted out of the last word processed
+; is returned in ax (0 or 1).
+; The carry argument's stack slot is modified in place.
+; Clobbers bx, cx, dx; ds is preserved.
+;
+_P_ROTL:
+        push    bp
+        mov     bp,sp
+        mov     cx, prec16      ; read DGROUP variables before ds is reloaded
+        mov     dx, rotp
+IF LDATA
+        push    ds
+        lds     bx, dword ptr r1
+ELSE
+        mov     bx, r1
+ENDIF
+        shr     byte ptr scarry,1       ; load carry-in into the carry flag
+        jcxz    rot_units
+rot_16u:
+        i = 0
+        rept    16
+        rcl     word ptr [bx + i],1
+        i = i + 2
+        endm
+        lahf                    ; add below would destroy the carry: save flags in ah
+        add     bx,32
+        sahf                    ; restore the carry for the next group / the tail
+        loop    rot_16u
+rot_units:
+        jmp     dx              ; enter the tail so only the leftover units run
+        rept    15
+        rcl     word ptr [bx],1
+        inc     bx              ; inc does not modify the carry flag
+        inc     bx
+        endm
+rot_ref:
+
+        rcl     ax,1            ; move carry out into bit 0 of ax
+        and     ax,1
+IF LDATA
+        pop     ds
+ENDIF
+        pop     bp
+        ret
+
+fprims  endp
+
+
+
+
+; ***************************************************************
+; P_SMULA (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
+; mp_smul routine from Upton's modmult, converted to assembler
+;
+; Multiply the single-word multiplier times the multiprecision integer
+; in multiplicand, accumulating result in prod. The resulting
+; multiprecision prod will be 1 word longer than the multiplicand.
+; multiplicand is unit_prec words long. We add into prod, so caller
+; should zero it out first.
+;
+; NOTE: Unlike other functions in the multiprecision arithmetic
+; library, both multiplicand and prod are pointing at the LSB,
+; regardless of byte order of the machine. On an 80x86, this makes
+; no difference. But if this assembly function is implemented
+; on a 680x0, it becomes important.
+;
+; This version differs from P_SMUL by adding in, rather than storing,
+; the final carry. This better supports use by Smith's modmult.
+; ***************************************************************
+; Variable assignments:
+; multiplier = [bp+14], copied into bp before the multiply loop
+; multiplicand = [ds:si] 32-bit pointer
+; prod = [es:di] 32-bit pointer
+; 16-unit group count (prec16) = cx
+; p = dx:ax
+; carry = bx
+
+        PUBLIC  _P_SMULA
+
+; MULU n: n unrolled multiply-accumulate steps.  Each step computes
+; p = multiplicand word * multiplier + carry + prod word, stores the
+; low word of p back into prod and keeps the high word as the next
+; carry.  _P_SETP assumes each expansion assembles to exactly 17 bytes
+; when it computes mulp; do not change this body without updating it.
+MULU    macro   n
+        rept    n
+        lodsw                   ;multiplicand
+        mul     bp              ;multiplier, results (p) to AX/DX
+        add     ax,bx           ;carry
+        adc     dx,0
+        add     ax,WORD PTR es:[di]
+        adc     dx,0
+        mov     bx,dx           ;carry
+        stosw
+        endm
+endm
+
+; Runs prec16 groups of 16 steps, then jumps into the 15-step tail at
+; the offset stored in mulp so that only the leftover units execute.
+; Returns nothing meaningful in ax; clobbers ax, bx, cx, dx, es.
+; di, si, ds and bp are preserved.
+_P_SMULA PROC FAR
+        push    bp
+        mov     bp,sp
+        push    di
+        push    si
+        push    ds
+        mov     cx,prec16       ; read DGROUP variables before ds is reloaded
+        mov     ax,mulp
+        push    ax              ; keep the tail entry offset on the stack
+
+        sub     bx,bx           ;carry = 0, store in bx
+
+        les     di,DWORD PTR [bp+6]     ;prod in es:di
+        lds     si,DWORD PTR [bp+10]    ;multiplicand in ds:si
+        cld
+        mov     bp,[bp+14]      ; bp = multiplier; frame pointer no longer needed
+
+        or      cx,cx
+        jnz     mul_16u         ; MULU 16 is 16*17 bytes, too far for a short
+        jmp     mul_units       ;  conditional jump, hence jnz + near jmp
+mul_16u:
+        MULU    16
+        dec     cx
+        jz      mul_units
+        jmp     mul_16u         ; near jmp for the same range reason
+mul_units:
+        pop     cx              ; cx = mulp saved above
+        jmp     cx              ; enter the tail so only the leftover units run
+        MULU    15
+mul_ref:
+
+        add     WORD PTR es:[di],bx     ;add final carry
+                                ; (a carry out of this addition is not propagated)
+
+        pop     ds
+        pop     si
+        pop     di
+        pop     bp
+        ret
+_P_SMULA ENDP
+
+; ***************************************************************
+; void P_SETRECIP (MULTUNIT reciph, MULTUNIT recipl, short mshift)
+; Specify reciprocal factors for use by P_QUO_DIGIT.
+;
+; This implementation is for 16-bit MULTUNIT.
+;
+; The three arguments are copied unchanged into the DGROUP variables
+; of the same names.  Clobbers ax.
+;
+; ***************************************************************
+
+DGROUP  group   _DATA,_BSS
+        assume  ds:DGROUP
+_BSS    segment word public 'BSS'
+reciph  dw      ?       ; recip msw
+recipl  dw      ?       ; recip lsw
+mshift  dw      ?       ; shift adjust
+_BSS    ends
+
+        PUBLIC  _P_SETRECIP
+
+_P_SETRECIP PROC FAR
+        push    bp
+        mov     bp,sp
+
+        mov     ax,6[bp]        ; reciph
+        mov     reciph,ax
+        mov     ax,8[bp]        ; recipl
+        mov     recipl,ax
+        mov     ax,10[bp]       ; mshift
+        mov     mshift,ax
+
+        pop     bp
+        ret
+_P_SETRECIP endp
+
+; ***************************************************************
+; MULTUNIT quo_digit (MULTUNIT *dividend)
+; Determine the next quotient digit.
+; (routine for modmult, converted to assembler)
+;
+; This implementation is for 16-bit MULTUNIT.
+;
+; The following items have already been set by calling
+; P_SETRECIP:
+;       reciph, recipl - reciprocal of divisor
+;       mshift - scaling factor
+;
+; The dividend parameter points to the most significant word
+; of the dividend.  The two words below it (dividend[-1] and
+; dividend[-2]) are read as well.
+;
+; Returns the quotient digit in ax; if the computed value does not
+; fit in 16 bits, 0ffffh is returned instead.
+; Clobbers bx, cx, dx, es; di, si and bp are preserved.
+;
+; ***************************************************************
+; Register assignments:
+;       dx:ax = product
+;       cx:bx = temp long
+;       es:si = dividend pointer
+;       di = MS word of q0
+;       bp = lsb factor (bp is reused as a scratch register once the
+;            dividend pointer has been loaded)
+;
+; Comments reference the C implementation variables.
+
+DGROUP  group   _DATA,_BSS
+        assume  ds:DGROUP
+
+        PUBLIC  _P_QUO_DIGIT
+
+
+_P_QUO_DIGIT PROC FAR
+        push    bp
+        mov     bp,sp
+        push    di
+        push    si
+
+        les     si,6[bp]        ; dividend
+        mov     ax,es:[si-4]    ; dividend[-2]
+        not     ax
+        mul     reciph
+        add     ax,reciph       ; + reciph: (~dividend[-2] + 1) * reciph
+        adc     dx,0
+        mov     bx,ax
+        mov     di,dx           ; di:bx = q1
+
+        mov     ax,es:[si-2]    ; dividend[-1]
+        not     ax
+        mul     recipl
+        inc     dx              ; dx:ax = q2
+
+        mov     bp,dx
+        and     bp,di
+        and     bp,1            ; bp = lsb_factor
+
+        add     ax,bx           ; low words: only the carry out is used
+        adc     di,dx
+        rcr     di,1            ; di = MS word of q0
+
+        mov     ax,es:[si-2]    ; dividend [-1]
+        not     ax
+        mul     reciph
+        mov     bx,ax
+        mov     cx,dx           ; cx:bx = q1
+
+        mov     ax,es:[si]      ; dividend[0]
+        not     ax
+        mul     recipl          ; dx:ax = q2
+        xor     ax,bx
+        and     bp,ax           ; lsb correction
+        xor     ax,bx           ; restore ax
+
+        add     ax,bx
+        adc     dx,cx
+        rcr     dx,1            ; halve the 33-bit sum, pulling the carry back in
+        rcr     ax,1            ; dx:ax = q
+
+        add     ax,di           ; + scaled q0
+        adc     dx,0
+        add     ax,bp           ; + lsb correction
+        adc     dx,0            ; q
+
+        shl     ax,1            ; shift dx:ax left by 2 with the top two bits
+        rcl     dx,1            ;  of dx ending up in the low bits of ax
+        rcl     ax,1
+        rcl     dx,1
+        rcl     ax,1
+        and     ax,3
+        mov     cx,ax
+        mov     bx,dx           ; cx:bx = q >> 14  (cx = high word, bx = low word)
+
+        mov     ax,es:[si]      ; dividend[0]
+        not     ax
+        mul     reciph
+        shl     ax,1            ; double the product
+        rcl     dx,1
+        add     ax,bx
+        adc     dx,cx           ; q
+
+        mov     cx,mshift       ; presumably 0 <= mshift <= 16 -- TODO confirm
+        shr     ax,cl
+        mov     bx,dx
+        shr     dx,cl
+        neg     cx
+        add     cx,16           ; cl = 16 - mshift
+        shl     bx,cl           ; bits of the high word that move into the low word
+        add     ax,bx           ; dx:ax = q >> mshift
+
+        or      dx,dx
+        jz      no_overflow
+        mov     ax,0ffffh       ; result exceeds 16 bits: return the largest digit
+no_overflow:
+        pop     si
+        pop     di
+        pop     bp
+        ret
+_P_QUO_DIGIT ENDP
+_TEXT   ends
+
+        end
+