--- pgp/src/8086.asm	2018/04/24 16:37:53	1.1.1.1
+++ pgp/src/8086.asm	2018/04/24 16:38:48	1.1.1.2
@@ -1,359 +1,518 @@
-;  Assembly primitives for RSA multiprecision library
-; 
-;  Tested with Turbo Assembler 1.0 and masm 1.00
-; 
-;  Written by Branko Lankester (lankeste@fwi.uva.nl)    10/10/91
-
-; define LDATA and LCODE as follows:
-; model:        small   compact medium  large
-; LDATA         0       1       0       1
-; LCODE         0       0       1       1
-
-LDATA   equ     1
-LCODE   equ     1
-
-IF LDATA
-DSTPTR  equ     es:[bx+si]
-ELSE
-DSTPTR  equ     [bx+si]
-ENDIF
-
-IF LCODE
-prec    equ     [bp+6]          ; 1st arg
-r1      equ     [bp+6]          ; 1st arg
-IF LDATA
-r2      equ     [bp+10]         ; 2nd arg
-carry   equ     [bp+14]         ; 3rd arg
-scarry  equ     [bp+10]         ; carry for shift (arg 2)
-ELSE
-r2      equ     [bp+8]
-carry   equ     [bp+10]
-scarry  equ     [bp+8]
-ENDIF
-ELSE                            ; small code model
-prec    equ     [bp+4]
-r1      equ     [bp+4]
-IF LDATA
-r2      equ     [bp+8]
-carry   equ     [bp+12]
-scarry  equ     [bp+8]
-ELSE
-r2      equ     [bp+6]
-carry   equ     [bp+8]
-scarry  equ     [bp+6]
-ENDIF
-ENDIF
-
-
-_TEXT   segment byte public 'CODE'
-DGROUP  group   _DATA,_BSS
-        assume  cs:_TEXT,ds:DGROUP
-_TEXT   ends
-
-_DATA   segment word public 'DATA'
-_DATA   ends
-
-_BSS    segment word public 'BSS'
-prec16  dw      ?               ; precision / 16 (seems to be / 256?)
-unitprec dw     ?               ; precision / 16, really
-addp    dw      ?               ; jump offset
-subp    dw      ?
-rotp    dw      ?
-mulp    dw      ?
-_BSS    ends
-
-_TEXT   segment byte public 'CODE'
-
-        public  _P_SETP
-        public  _P_ADDC
-        public  _P_SUBB
-        public  _P_ROTL
-
-IF LCODE
-fprims  proc    far                     ; dummy proc
-ELSE
-fprims  proc    near
-ENDIF
-
-;
-; ******************** set precision ********************
-;
-_P_SETP:
-        push    bp
-        mov     bp,sp
-        mov     ax, prec        ; precision in bits
-        add     ax, 0fh
-        mov     cl,4
-        shr     ax,cl           ; prec. in units
-        mov     unitprec,ax
-        push    ax
-        shr     ax,cl
-        mov     prec16,ax       ; precision / 16
-        pop     ax
-        and     ax,0fh          ;   al = prec % 16
-        mov     bx,ax
-        mov     cx,ax
-        shl     bx,1            ; multiply by 4 (=number of bytes 
-        shl     bx,1            ;   in instruction sequence)
-        mov     dx,bx
-IFE LDATA
-        sub     dx,ax           ; small model only 3 for add/sub
-ENDIF
-        mov     ax,offset add_ref
-        sub     ax,dx
-        mov     addp,ax
-
-        mov     ax,offset sub_ref
-        sub     ax,dx
-        mov     subp,ax
-
-        mov     ax,offset rot_ref
-        sub     ax,bx
-        mov     rotp,ax
-
-        mov     ax,offset mul_ref
-        shl     bx,1            ; MULU macro is 17 bytes for large data
-        shl     bx,1
-        sub     ax,bx
-        sub     ax,cx
-        mov     mulp,ax
-
-        pop     bp
-        ret
-
-
-
-;
-; ******************** mpi add with carry ********************
-;
-ADDU    macro   n
-        rept    n
-                lodsw
-                adc     DSTPTR,ax
-        endm
-endm
-
-
-_P_ADDC:
-        push    bp
-        mov     bp,sp
-        push    si
-        mov     cx, prec16
-        mov     dx, addp
-IF LDATA
-        push    ds
-        lds     si, dword ptr r2
-        les     bx, dword ptr r1
-ELSE
-        mov     si, r2
-        mov     bx, r1
-ENDIF
-        sub     bx, si          ; calculate relative offset
-        dec     bx
-        dec     bx
-        cld
-        shr     byte ptr carry,1        ; load carry
-        jcxz    add_units
-add_16u:
-        ADDU    16
-        loop    add_16u
-add_units:
-        jmp     dx
-        ADDU    15
-add_ref:
-        rcl     ax,1            ; return carry
-        and     ax,1
-IF LDATA
-        pop     ds
-ENDIF
-        pop     si
-        pop     bp
-        ret
-
-
-
-;
-; ******************** mpi subtract with borrow ********************
-;
-SUBU    macro   n
-        rept    n
-                lodsw
-                sbb     DSTPTR,ax
-        endm
-endm
-
-
-_P_SUBB:
-        push    bp
-        mov     bp,sp
-        push    si
-        mov     cx, prec16
-        mov     dx, subp
-IF LDATA
-        push    ds
-        lds     si, dword ptr r2
-        les     bx, dword ptr r1
-ELSE
-        mov     si, r2
-        mov     bx, r1
-ENDIF
-        sub     bx, si          ; calculate relative offset
-        dec     bx
-        dec     bx
-        cld
-        shr     byte ptr carry,1
-        jcxz    sub_units
-sub_16u:
-        SUBU    16
-        loop    sub_16u
-sub_units:
-        jmp     dx
-        SUBU    15
-sub_ref:
-        rcl     ax,1            ; return carry
-        and     ax,1
-IF LDATA
-        pop     ds
-ENDIF
-        pop     si
-        pop     bp
-        ret
-
-
-
-;
-; ******************** mpi rotate left ********************
-;
-_P_ROTL:
-        push    bp
-        mov     bp,sp
-        mov     cx, prec16
-        mov     dx, rotp
-IF LDATA
-        push    ds
-        lds     bx, dword ptr r1
-ELSE
-        mov     bx, r1
-ENDIF
-        shr     byte ptr scarry,1
-        jcxz    rot_units
-rot_16u:
-        i = 0
-        rept    16
-                rcl     word ptr [bx + i],1
-                i = i + 2
-        endm
-        lahf
-        add     bx,32
-        sahf
-        loop    rot_16u
-rot_units:
-        jmp     dx
-        rept    15
-                rcl     word ptr [bx],1
-                inc     bx
-                inc     bx
-        endm
-rot_ref:
-
-        rcl     ax,1
-        and     ax,1
-IF LDATA
-        pop     ds
-ENDIF
-        pop     bp
-        ret
-
-fprims  endp
-
-_TEXT   ends
-
-
-
-; ***************************************************************
-;  P_SMUL (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
-;       mp_smul routine from Upton's modmult, converted to assembler
-;
-;       Multiply the single-word multiplier times the multiprecision integer 
-;       in multiplicand, accumulating result in prod.  The resulting 
-;       multiprecision prod will be 1 word longer than the multiplicand.   
-;       multiplicand is unit_prec words long.  We add into prod, so caller 
-;       should zero it out first.
-;
-;       NOTE:  Unlike other functions in the multiprecision arithmetic 
-;       library, both multiplicand and prod are pointing at the LSB, 
-;       regardless of byte order of the machine.  On an 80x86, this makes 
-;       no difference.  But if this assembly function is implemented
-;       on a 680x0, it becomes important.
-; ***************************************************************
-;   Variable assignments:
-;       multiplier = [bp+14]
-;       multiplicand = [ds:di]  32-bit pointer
-;       prod = [es:si]          32-bit pointer
-;       unit_prec = cx
-;       p = ax-dx
-;       carry = bx
-UPTON_TEXT      SEGMENT  WORD PUBLIC 'CODE'
-UPTON_TEXT      ENDS
-UPTON_TEXT      SEGMENT
-        ASSUME  CS: UPTON_TEXT
-        ASSUME  DS: DGROUP
-        PUBLIC  _P_SMUL
-
-MULU    macro   n
-        rept    n
-                lodsw                   ;multiplicand
-                mul     bp              ;multiplier, results (p) to AX/DX
-                add     ax,bx           ;carry
-                adc     dx,0
-                add     ax,WORD PTR es:[di]
-                adc     dx,0
-                mov     bx,dx           ;carry
-                stosw
-        endm
-endm
-
-_P_SMUL PROC FAR
-        push    bp
-        mov     bp,sp
-        push    di
-        push    si
-        push    ds
-        mov     cx,prec16
-        mov     ax,mulp
-        push    ax
-
-        sub     bx,bx           ;carry = 0, store in bx
-
-        les     di,DWORD PTR [bp+6]     ;prod in es:di
-        lds     si,DWORD PTR [bp+10]    ;multiplicand in ds:si
-        cld
-        mov     bp,[bp+14]
-
-        or      cx,cx
-        jnz     mul_16u
-        jmp     mul_units
-mul_16u:
-        MULU    16
-        dec     cx
-        jz      mul_units
-        jmp     mul_16u
-mul_units:
-        pop     cx
-        jmp     cx
-        MULU    15
-mul_ref:
-
-        ; We know that the high-order word of prod will always be 0
-        mov     WORD PTR es:[di],bx     ;store carry in prod empty high word
-
-        pop     ds
-        pop     si
-        pop     di
-        pop     bp
-        ret     
-
-_P_SMUL ENDP
-UPTON_TEXT   ends
-        end
-
+;  Assembly primitives for RSA multiprecision library
+;
+;  Tested with Turbo Assembler 1.0 and masm 1.00
+;
+;  Written by Branko Lankester (lankeste@fwi.uva.nl)    10/10/91
+;
+;  Modified to add, rather than store carry bit to allow using a
+;  smaller precision for long division.
+
+; define LDATA and LCODE as follows:
+; model:        small   compact medium  large
+; LDATA         0       1       0       1
+; LCODE         0       0       1       1
+
+LDATA   equ     1               ; 1 = far (4-byte) data pointers, 0 = near (2-byte)
+LCODE   equ     1               ; 1 = far calls (4-byte return address), 0 = near
+
+; Note: Only the large memory model has been implemented for P_SMULA,
+; P_SETRECIP and P_QUO_DIGIT.
+
+IF LDATA
+DSTPTR  equ     es:[bx+si]      ; destination unit, addressed relative to SI through ES
+ELSE
+DSTPTR  equ     [bx+si]         ; destination unit, addressed relative to SI
+ENDIF
+
+IF LCODE
+prec    equ     [bp+6]          ; 1st arg (saved BP and a 4-byte return address lie below it)
+r1      equ     [bp+6]          ; 1st arg (same stack slot as prec)
+IF LDATA
+r2      equ     [bp+10]         ; 2nd arg (after the 4-byte pointer r1)
+carry   equ     [bp+14]         ; 3rd arg (after the 4-byte pointer r2)
+scarry  equ     [bp+10]         ; carry for shift (arg 2)
+ELSE
+r2      equ     [bp+8]          ; 2nd arg (after the 2-byte pointer r1)
+carry   equ     [bp+10]         ; 3rd arg (after the 2-byte pointer r2)
+scarry  equ     [bp+8]          ; carry for shift (arg 2)
+ENDIF
+ELSE                            ; small code model
+prec    equ     [bp+4]          ; 1st arg (saved BP and a 2-byte return address lie below it)
+r1      equ     [bp+4]          ; 1st arg (same stack slot as prec)
+IF LDATA
+r2      equ     [bp+8]          ; 2nd arg (after the 4-byte pointer r1)
+carry   equ     [bp+12]         ; 3rd arg (after the 4-byte pointer r2)
+scarry  equ     [bp+8]          ; carry for shift (arg 2)
+ELSE
+r2      equ     [bp+6]          ; 2nd arg (after the 2-byte pointer r1)
+carry   equ     [bp+8]          ; 3rd arg (after the 2-byte pointer r2)
+scarry  equ     [bp+6]          ; carry for shift (arg 2)
+ENDIF
+ENDIF
+
+IF NOT LCODE                    ; near-code builds only; inactive while LCODE = 1
+UPTON_TEXT = _TEXT              ; NOTE(review): UPTON_TEXT is not referenced anywhere else in this revision
+ENDIF
+
+_TEXT   segment byte public 'CODE'
+DGROUP  group   _DATA,_BSS
+        assume  cs:_TEXT,ds:DGROUP      ; routines reach the variables below through DS
+_TEXT   ends
+
+_DATA   segment word public 'DATA'
+_DATA   ends
+
+_BSS    segment word public 'BSS'
+prec16  dw      ?               ; number of full 16-unit groups = unitprec / 16 (set by P_SETP)
+unitprec dw     ?               ; precision in 16-bit units = (bits + 15) / 16 (set by P_SETP)
+addp    dw      ?               ; entry address into the unrolled ADDU tail before add_ref
+subp    dw      ?               ; entry address into the unrolled SUBU tail before sub_ref
+rotp    dw      ?               ; entry address into the unrolled rotate tail before rot_ref
+mulp    dw      ?               ; entry address into the unrolled MULU tail before mul_ref
+_BSS    ends
+
+_TEXT   segment byte public 'CODE'
+
+        public  _P_SETP
+        public  _P_ADDC
+        public  _P_SUBB
+        public  _P_MUSUBB
+        public  _P_ROTL
+
+IF LCODE
+fprims  proc    far                     ; dummy proc: gives the plain RETs below far distance
+ELSE
+fprims  proc    near                    ; dummy proc: gives the plain RETs below near distance
+ENDIF
+
+; In:  first stack argument (prec) = precision in bits.
+; ******************** set precision ********************
+; Out: unitprec, prec16 and the entry addresses addp/subp/rotp/mulp.  Clobbers ax,bx,cx,dx,flags.
+_P_SETP:
+        push    bp
+        mov     bp,sp
+        mov     ax, prec        ; precision in bits
+        add     ax, 0fh         ; round up to a whole 16-bit unit
+        mov     cl,4
+        shr     ax,cl           ; prec. in units
+        mov     unitprec,ax
+        push    ax              ; keep the unit count for the remainder below
+        shr     ax,cl
+        mov     prec16,ax       ; units / 16 = number of full 16-unit groups
+        pop     ax
+        and     ax,0fh          ; ax = units % 16 (units handled by the unrolled tails)
+        mov     bx,ax
+        mov     cx,ax           ; cx = tail units, needed again for mulp
+        shl     bx,1            ; multiply by 4 (=number of bytes
+        shl     bx,1            ;   in instruction sequence)
+        mov     dx,bx           ; dx = bytes of add/sub tail code to run
+IFE LDATA
+        sub     dx,ax           ; small model only 3 for add/sub
+ENDIF
+        mov     ax,offset add_ref
+        sub     ax,dx           ; back up from add_ref over the tail units to execute
+        mov     addp,ax
+
+        mov     ax,offset sub_ref
+        sub     ax,dx           ; same distance: SUBU has the same size as ADDU
+        mov     subp,ax
+
+        mov     ax,offset rot_ref
+        sub     ax,bx           ; rotate tail step is 4 bytes in either data model
+        mov     rotp,ax
+
+        mov     ax,offset mul_ref
+        shl     bx,1            ; MULU macro is 17 bytes for large data
+        shl     bx,1            ; bx = 16 * tail units
+        sub     ax,bx
+        sub     ax,cx           ; + 1 * tail units = 17 bytes per unit in total
+        mov     mulp,ax
+
+        pop     bp
+        ret
+
+
+
+; r1 (arg 1) += r2 (arg 2) + carry (arg 3, bit 0); the carry out is returned in ax (0 or 1).
+; ******************** mpi add with carry ********************
+; The length comes from prec16 and addp, so P_SETP must have been called beforehand.
+ADDU    macro   n
+        rept    n
+                lodsw                   ; ax = next source unit, si += 2 (flags unchanged)
+                adc     DSTPTR,ax       ; destination unit += ax + CF
+        endm
+endm
+
+
+_P_ADDC:
+        push    bp
+        mov     bp,sp
+        push    si
+        mov     cx, prec16              ; number of full 16-unit groups
+        mov     dx, addp                ; entry address for the remaining units
+IF LDATA
+        push    ds                      ; read the DGROUP variables above before DS changes
+        lds     si, dword ptr r2        ; ds:si = source
+        les     bx, dword ptr r1        ; es:bx = destination
+ELSE
+        mov     si, r2                  ; si = source
+        mov     bx, r1                  ; bx = destination
+ENDIF
+        sub     bx, si          ; calculate relative offset
+        dec     bx              ; less 2, because lodsw has already advanced si
+        dec     bx              ;   when DSTPTR is addressed
+        cld                     ; lodsw must step upwards
+        shr     byte ptr carry,1        ; load carry
+        jcxz    add_units       ; no full groups: go straight to the tail
+add_16u:
+        ADDU    16
+        loop    add_16u         ; loop leaves CF intact between groups
+add_units:
+        jmp     dx              ; enter the tail so that only the remaining units run
+        ADDU    15
+add_ref:
+        rcl     ax,1            ; return carry
+        and     ax,1            ; keep only the carry bit
+IF LDATA
+        pop     ds
+ENDIF
+        pop     si
+        pop     bp
+        ret
+
+
+
+; r1 (arg 1) -= r2 (arg 2) + borrow (arg 3, bit 0); the borrow out is returned in ax (0 or 1).
+; ******************** mpi subtract with borrow ********************
+; The length comes from prec16 and subp, so P_SETP must have been called beforehand.
+SUBU    macro   n
+        rept    n
+                lodsw                   ; ax = next source unit, si += 2 (flags unchanged)
+                sbb     DSTPTR,ax       ; destination unit -= ax + CF
+        endm
+endm
+
+
+_P_MUSUBB:              ; MULTUNIT is same size as unit
+_P_SUBB:
+        push    bp
+        mov     bp,sp
+        push    si
+        mov     cx, prec16              ; number of full 16-unit groups
+        mov     dx, subp                ; entry address for the remaining units
+IF LDATA
+        push    ds                      ; read the DGROUP variables above before DS changes
+        lds     si, dword ptr r2        ; ds:si = source
+        les     bx, dword ptr r1        ; es:bx = destination
+ELSE
+        mov     si, r2                  ; si = source
+        mov     bx, r1                  ; bx = destination
+ENDIF
+        sub     bx, si          ; calculate relative offset
+        dec     bx              ; less 2, because lodsw has already advanced si
+        dec     bx              ;   when DSTPTR is addressed
+        cld                     ; lodsw must step upwards
+        shr     byte ptr carry,1        ; load the incoming borrow into CF
+        jcxz    sub_units       ; no full groups: go straight to the tail
+sub_16u:
+        SUBU    16
+        loop    sub_16u         ; loop leaves CF intact between groups
+sub_units:
+        jmp     dx              ; enter the tail so that only the remaining units run
+        SUBU    15
+sub_ref:
+        rcl     ax,1            ; return carry
+        and     ax,1            ; keep only the borrow bit
+IF LDATA
+        pop     ds
+ENDIF
+        pop     si
+        pop     bp
+        ret
+
+
+
+; Rotate r1 (arg 1) left by one bit; scarry (arg 2, bit 0) is shifted into the first unit.
+; ******************** mpi rotate left ********************
+; Returns the bit shifted out of the last unit in ax (0 or 1).  Length comes from P_SETP.
+_P_ROTL:
+        push    bp
+        mov     bp,sp
+        mov     cx, prec16              ; number of full 16-unit groups
+        mov     dx, rotp                ; entry address for the remaining units
+IF LDATA
+        push    ds                      ; read the DGROUP variables above before DS changes
+        lds     bx, dword ptr r1        ; ds:bx = number to rotate
+ELSE
+        mov     bx, r1                  ; bx = number to rotate
+ENDIF
+        shr     byte ptr scarry,1       ; load the incoming carry into CF
+        jcxz    rot_units               ; no full groups: go straight to the tail
+rot_16u:
+        i = 0
+        rept    16
+                rcl     word ptr [bx + i],1     ; rotate unit i/2 of this group through CF
+                i = i + 2
+        endm
+        lahf                            ; save CF, because the add below changes it
+        add     bx,32                   ; advance past the 16 units just rotated
+        sahf                            ; restore CF for the next group
+        loop    rot_16u
+rot_units:
+        jmp     dx                      ; enter the tail so that only the remaining units run
+        rept    15
+                rcl     word ptr [bx],1
+                inc     bx              ; inc leaves CF intact
+                inc     bx
+        endm
+rot_ref:
+
+        rcl     ax,1                    ; move the final carry into bit 0 of ax
+        and     ax,1                    ; keep only the carry bit
+IF LDATA
+        pop     ds
+ENDIF
+        pop     bp
+        ret
+
+fprims  endp
+
+
+
+
+; ***************************************************************
+;  P_SMULA (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
+;       mp_smul routine from Upton's modmult, converted to assembler
+;
+;       Multiply the single-word multiplier times the multiprecision integer 
+;       in multiplicand, accumulating result in prod.  The resulting 
+;       multiprecision prod will be 1 word longer than the multiplicand.   
+;       multiplicand is unit_prec words long.  We add into prod, so caller 
+;       should zero it out first.
+;
+;       NOTE:  Unlike other functions in the multiprecision arithmetic 
+;       library, both multiplicand and prod are pointing at the LSB, 
+;       regardless of byte order of the machine.  On an 80x86, this makes 
+;       no difference.  But if this assembly function is implemented
+;       on a 680x0, it becomes important.
+;
+;       This version differs from P_SMUL by adding in, rather than storing,
+;       the final carry.  This better supports use by Smith's modmult.
+; ***************************************************************
+;   Variable assignments:
+;       multiplier = [bp+14], loaded into bp for the multiply loop
+;       multiplicand = [ds:si]  32-bit pointer
+;       prod = [es:di]          32-bit pointer
+;       count of 16-unit groups (prec16) = cx
+;       p = dx:ax
+;       carry = bx
+
+        PUBLIC  _P_SMULA
+
+MULU    macro   n                       ; n multiply-accumulate steps, 17 bytes of code each
+        rept    n
+                lodsw                   ;multiplicand
+                mul     bp              ;multiplier, results (p) to AX/DX
+                add     ax,bx           ;carry
+                adc     dx,0            ;carry out of the low word goes into the high word
+                add     ax,WORD PTR es:[di]     ;accumulate the current prod word
+                adc     dx,0            ;carry out of the low word goes into the high word
+                mov     bx,dx           ;carry
+                stosw                   ;store the new prod word, di += 2
+        endm
+endm
+
+_P_SMULA PROC FAR                       ; args: prod ptr [bp+6], multiplicand ptr [bp+10], multiplier [bp+14]
+        push    bp
+        mov     bp,sp
+        push    di
+        push    si
+        push    ds
+        mov     cx,prec16               ;read the DGROUP variables before DS changes
+        mov     ax,mulp
+        push    ax                      ;tail entry address, popped at mul_units
+
+        sub     bx,bx           ;carry = 0, store in bx
+
+        les     di,DWORD PTR [bp+6]     ;prod in es:di
+        lds     si,DWORD PTR [bp+10]    ;multiplicand in ds:si
+        cld
+        mov     bp,[bp+14]              ;multiplier; bp is no longer a frame pointer after this
+
+        or      cx,cx
+        jnz     mul_16u
+        jmp     mul_units               ;no full groups: go straight to the tail
+mul_16u:
+        MULU    16
+        dec     cx
+        jz      mul_units
+        jmp     mul_16u                 ;near jmp: the 16*17-byte body is beyond short-jump range
+mul_units:
+        pop     cx                      ;cx = mulp value saved above
+        jmp     cx                      ;enter the tail so that only the remaining units run
+        MULU    15
+mul_ref:
+
+        add     WORD PTR es:[di],bx     ;add final carry
+                                        ;NOTE(review): a carry out of this add is not propagated further
+        pop     ds
+        pop     si
+        pop     di
+        pop     bp
+        ret
+_P_SMULA ENDP
+
+; ***************************************************************
+; void P_SETRECIP (MULTUNIT reciph, MULTUNIT recipl, short mshift)
+;       Specify reciprocal factors for use by P_QUO_DIGIT.
+;
+;       This implementation is for 16-bit MULTUNIT.
+;
+; ***************************************************************
+
+DGROUP  group   _DATA,_BSS
+        assume  ds:DGROUP
+_BSS    segment word public 'BSS'
+reciph  dw      ?               ; high word of the divisor reciprocal
+recipl  dw      ?               ; low word of the divisor reciprocal
+mshift  dw      ?               ; shift count applied by P_QUO_DIGIT
+_BSS    ends
+
+        public  _P_SETRECIP
+
+_P_SETRECIP PROC FAR
+        push    bp
+        mov     bp,sp                   ; far frame: arguments start at [bp+6]
+
+        mov     ax,word ptr [bp+6]      ; arg 1 -> reciph
+        mov     reciph,ax
+        mov     ax,word ptr [bp+8]      ; arg 2 -> recipl
+        mov     recipl,ax
+        mov     ax,word ptr [bp+10]     ; arg 3 -> mshift
+        mov     mshift,ax
+
+        pop     bp
+        ret
+_P_SETRECIP ENDP
+
+; ***************************************************************
+; MULTUNIT P_QUO_DIGIT (MULTUNIT *dividend)
+;       Determine the next quotient digit.
+;       (routine for modmult, converted to assembler)
+;
+;       This implementation is for 16-bit MULTUNIT.
+;
+;       The following items have already been set by calling
+;       P_SETRECIP:
+;       reciph, recipl - reciprocal of divisor
+;       mshift         - scaling factor
+;
+;       The dividend parameter points to the most significant word
+;       of the dividend.
+;
+; ***************************************************************
+;   Register assignments:
+;       dx:ax = product
+;       cx:bx = temp long
+;       es:si = dividend pointer
+;       di    = MS word of q0
+;       bp    = lsb factor
+;
+;   Comments reference the C implementation variables.
+
+DGROUP  group   _DATA,_BSS
+        assume  ds:DGROUP               ; reciph/recipl/mshift are read through the caller's DS
+
+        PUBLIC  _P_QUO_DIGIT
+
+
+_P_QUO_DIGIT PROC FAR                   ; In: far pointer to dividend at [bp+6]; Out: ax = quotient digit
+        push    bp
+        mov     bp,sp
+        push    di
+        push    si
+
+        les     si,6[bp]        ; dividend
+        mov     ax,es:[si-4]    ; dividend[-2]
+        not     ax              ; each dividend word is complemented before it is multiplied
+        mul     reciph
+        add     ax,reciph       ; q1 = ~dividend[-2] * reciph + reciph
+        adc     dx,0
+        mov     bx,ax
+        mov     di,dx           ; di:bx = q1
+
+        mov     ax,es:[si-2]    ; dividend[-1]
+        not     ax
+        mul     recipl
+        inc     dx              ; dx:ax = q2
+
+        mov     bp,dx           ; bp is free as a scratch register from here on
+        and     bp,di
+        and     bp,1            ; bp = lsb_factor
+
+        add     ax,bx           ; q1 + q2, low words
+        adc     di,dx           ; q1 + q2, high words; CF = carry out of the sum
+        rcr     di,1            ; di = MS word of q0
+
+        mov     ax,es:[si-2]    ; dividend [-1]
+        not     ax
+        mul     reciph
+        mov     bx,ax
+        mov     cx,dx           ; cx:bx = q1
+
+        mov     ax,es:[si]      ; dividend[0]
+        not     ax
+        mul     recipl          ; dx:ax = q2
+        xor     ax,bx           ; low words of q1 and q2 compared bit by bit
+        and     bp,ax           ; lsb correction
+        xor     ax,bx           ; restore ax
+
+        add     ax,bx           ; q1 + q2, low words
+        adc     dx,cx           ; q1 + q2, high words; CF = carry out of the sum
+        rcr     dx,1            ; halve the sum, shifting the carry in at the top
+        rcr     ax,1            ; dx:ax = q
+
+        add     ax,di           ; + scaled q0
+        adc     dx,0
+        add     ax,bp           ; + lsb correction
+        adc     dx,0            ; q
+
+        shl     ax,1            ; rotate dx:ax left so that the top two bits
+        rcl     dx,1            ;   of the old dx end up in ax bits 1..0 and
+        rcl     ax,1            ;   the top two bits of the old ax end up in
+        rcl     dx,1            ;   dx bits 1..0
+        rcl     ax,1
+        and     ax,3            ; keep only the two bits that came from dx
+        mov     cx,ax
+        mov     bx,dx           ; cx:bx = q >> 14 (cx is the high word)
+
+        mov     ax,es:[si]      ; dividend[0]
+        not     ax
+        mul     reciph
+        shl     ax,1            ; product * 2, low word
+        rcl     dx,1            ; product * 2, high word
+        add     ax,bx
+        adc     dx,cx           ; q
+
+        mov     cx,mshift       ; NOTE(review): assumes 0 <= mshift <= 16 -- TODO confirm
+        shr     ax,cl           ; low word >> mshift
+        mov     bx,dx           ; copy of the high word for the bits that cross over
+        shr     dx,cl           ; high word >> mshift
+        neg     cx
+        add     cx,16           ; cl = 16 - mshift
+        shl     bx,cl           ; bits of the high word that move into the low word
+        add     ax,bx           ; dx:ax = q >> mshift
+
+        or      dx,dx           ; does the result fit in one word?
+        jz      no_overflow
+        mov     ax,0ffffh       ; no: clamp to the largest 16-bit value
+no_overflow:
+        pop     si
+        pop     di
+        pop     bp
+        ret
+_P_QUO_DIGIT ENDP
+_TEXT   ends
+
+        end
+