pgp/contrib/idea/idea.asm - annotate

Return to idea.asm CVS log
Up to [PGP] / pgp / contrib / idea
Annotation of pgp/contrib/idea/idea.asm, revision 1.1

1.1     ! root        1:        %PAGESIZE       59      ; Turbo assembler formatting codes
        !             2:        %BIN    13
        !             3:        %LINUM  3
        !             4: 
        !             5: ; Copyright (c) 1993 Colin Plumb.  This code may be freely
        !             6: ; distributed under the terms of the GNU General Public Licence.
        !             7: 
        !             8:        .model large
        !             9:        .code
        !            10: 
        !            11: ; A core operation in IDEA is multiplication modulo 65537.
        !            12: ; The valid inputs, 1 through 65536 inclusive, are represented in
        !            13: ; 16-bit registers modulo 65536.  I.e. a value of 0 means 65536,
        !            14: ; or -1.  Thus, we need to test for that specially.  -x, modulo
        !            15: ; 65537, is 65537-x = 1-x.
        !            16: ; For any other number, represent the product as a*65536+b.  Since
        !            17: ; 65536 = -1 (mod 65537), this is the same number as b-a.  Should
        !            18: ; this result be negative (generate a borrow), -n mod 65537 = 1-n
        !            19: ; mod 65536.  Or in other words, if you add the borrow bit back on,
        !            20: ; you get the right answer.
        !            21: 
        !            22: ; This is what the assembly code does.  It forms a zero, and adds
        !            23: ; that on with carry.
        !            24: 
        !            25: ; Another useful optimisation takes advantage of the fact that
        !            26: ; a and b are equal only if the answer is congruent to 0 mod 65537.
        !            27: ; Since 65537 is prime, this happens only if one of the inputs is
        !            28: ; congruent to 0 mod 65537.  Since the inputs are all less than 65537,
        !            29: ; this means it must have been zero.
        !            30: 
        !            31: ; The code below tests for a zero result of the subtraction, and if
        !            32: ; one arises, it branches out of line to figure out what happened.
        !            33: 
        !            34: 
        !            35: ; This code implements the IDEA encryption algorithm.
        !            36: ; It follows in pseudo-C, where the * operator operates
        !            37: ; modulo 65537, as Idea needs.  (If you don't understand,
        !            38: ; learn IDEA better.)
        !            39: 
        !            40: ; IDEA works on 16-bit units.  If you're processing bytes,
        !            41: ; it's defined to be big-endian, so an Intel machine needs to
        !            42: ; swap the bytes around.
        !            43: 
        !            44: ; void Idea(u_int16 *in, u_int16 *out, u_int16 *key)
        !            45: ; {
        !            46: ;      register u_int16 x0, x1, x2, x3, s1, s2, round;
        !            47: ;
        !            48: ;      x0 = *in++;  x1 = *in++;  x2 = *in++;  x3 = *in;
        !            49: ;
        !            50: ;      for (round = 0; round < 8; round++) {
        !            51: ;              x0 *= *key++;
        !            52: ;              x1 += *key++;
        !            53: ;              x2 += *key++;
        !            54: ;              x3 *= *key++;
        !            55: ;
        !            56: ;              s1  = x1;  s2  = x2;
        !            57: ;              x2 ^= x0;  x1 ^= x3;
        !            58: ;
        !            59: ;              x2 *= *key++;
        !            60: ;              x1 += x2;
        !            61: ;              x1 *= *key++;
        !            62: ;              x2 += x1;
        !            63: ;
        !            64: ;              x0 ^= x1;  x3 ^= x2;
        !            65: ;              x1 ^= s2;  x2 ^= s1;
        !            66: ;      }
        !            67: ;      *out++ = x0 * *key++;
        !            68: ;      *out++ = x2 + *key++;   /* Yes, this is x2, not x1 */
        !            69: ;      *out++ = x1 + *key++;
        !            70: ;      *out   = x3 * *key;
        !            71: ; }
        !            72: 
        !            73: ; ds:si points to key, ax, dx are temps, args in bx, cx, di, bp
        !            74: ; Trashes *all* registers.  direction flag must be clear.
        !            75: ; Leaves es zero.
        !            76: 
        !            77: ; Since there is no spare register to hold the loop count, I make
        !            78: ; clever use of the stack, pushing the start of the loop several
        !            79: ; times and using a ret instruction to do the return.
        !            80: 
        !            81: ; Annoyingly, lods is fastest on 8086's, but other techniques are
        !            82: ; best on 386's.  Well, that's what the manual says, but real
        !            83: ; life is different.  USELODS wins on a 386SX, at least.
        !            84: ; Leave it set for all platforms.
        !            85: 
        !            86: USELODS        equ     1
        !            87: 
        !            88: ; Register map for the four 16-bit IDEA state words.  bp must be x0 for some of the code below to work
        !            89: x0     equ     bp
        !            90: x1     equ     bx
        !            91: x2     equ     cx
        !            92: x3     equ     di
        !            93: ; di must be x3 for some of the code below to work.  _IdeaCFB byte-swaps x1/x2 via bh/bl and ch/cl, and Finish leaves the last x3 in dx.
        !            94: 
        !            95: ;; Now, this is rather interesting.  We test for zero arguments
        !            96: ;; after the multiply.  Assuming random inputs, one or both are
        !            97: ;; zero (2^17-1)/2^32, or approximately 1/32768 of the time.
        !            98: ;; Encryption in any feedback mode produces essentially random
        !            99: ;; inputs, so average-case analysis is okay.  While we don't
        !           100: ;; want the out-of-line code to waste time, it is not worth
        !           101: ;; slowing down the in-line case to speed it up.
        !           102: ;;
        !           103: ;; Basically, we start inverting the source x, and if that was 0,
        !           104: ;; we use the inverse of the key instead.
        !           105: 
        !           106: Core1Z:                        ; reached when lo == hi after x0 * key, i.e. an operand was 0 (meaning 65536 = -1)
        !           107:        neg     x0              ; x0 = -x0; ZF stays set only if x0 itself was the zero operand
        !           108:        jnz     Core1Za         ; x0 was nonzero: result is 1 - x0
        !           109: if USELODS
        !           110:        sub     x0,[si-2]       ; x0 was 0: x0 = -key (lodsw already stepped si past the key word)
        !           111: else
        !           112:        sub     x0,[si]         ; x0 was 0: x0 = -key
        !           113: endif
        !           114: Core1Za:
        !           115:        inc     x0              ; 1 - (the other operand)
        !           116:        jmp     Core1done
        !           117: Core2Z:                        ; same fixup for x3 * key
        !           118:        neg     x3
        !           119:        jnz     Core2Za
        !           120: if USELODS
        !           121:        sub     x3,[si-2]       ; key word just fetched by lodsw
        !           122: else
        !           123:        sub     x3,[si+6]
        !           124: endif
        !           125: Core2Za:
        !           126:        inc     x3
        !           127:        jmp     Core2done
        !           128: Core3Z:                        ; same fixup for x2 * key
        !           129:        neg     x2
        !           130:        jnz     Core3Za
        !           131: if USELODS
        !           132:        sub     x2,[si-2]       ; key word just fetched by lodsw
        !           133: else
        !           134:        sub     x2,[si+8]
        !           135: endif
        !           136: Core3Za:
        !           137:        inc     x2
        !           138:        jmp     Core3done
        !           139: Core4Z:                        ; same fixup for x1 * key
        !           140:        neg     x1
        !           141:        jnz     Core4Za
        !           142: if USELODS
        !           143:        sub     x1,[si-2]       ; key word just fetched by lodsw
        !           144: else
        !           145:        sub     x1,[si+10]
        !           146: endif
        !           147: Core4Za:
        !           148:        inc     x1
        !           149:        jmp     Core4done
        !           150: 
        !           151: ; We need a constant 0 that we can move into a register without affecting
        !           152: ; the carry flag (as the classic xor ax,ax is wont to do), so we use the
        !           153: ; es register for a constant 0 source.  This is okay even in protected
        !           154: ; mode.  (I *told* you this was tricky code!)
        !           155: 
        !           156: ; BTW, since you wanted to know, this is 8 + 78*4 + 16 = 336 instructions.
        !           157: 
        !           158: Core   proc    near            ; in: ds:si = key, state in x0..x3; out: x0, x1, x2 in place, final x3 in dx; es = 0
        !           159:        xor     ax,ax
        !           160:        mov     es,ax           ; es = 0: a zero source that can be moved without touching flags
        !           161:        mov     ax,OFFSET Finish
        !           162:        push    ax              ; the last loop "ret" lands on Finish
        !           163:        mov     ax,OFFSET Coreloop
        !           164:        push    ax      ; Loop 3 times, then return (3 re-entries + the fall-through below = 4 passes of 2 rounds)
        !           165:        push    ax
        !           166:        push    ax
        !           167: 
        !           168: Coreloop:
        !           169: if USELODS
        !           170:        lodsw
        !           171: else
        !           172:        mov     ax,[si]         ; x0 *= *key++
        !           173: endif
        !           174:        mul     x0              ; dx:ax = key * x0
        !           175:        sub     ax,dx           ; lo - hi; CF = borrow, ZF = operand-was-zero case
        !           176:        jz      Core1Z
        !           177:        mov     x0,es           ; x0 = 0, flags preserved
        !           178:        adc     x0,ax           ; x0 = (lo - hi) + borrow
        !           179: Core1done:
        !           180: 
        !           181: if USELODS
        !           182:        lodsw
        !           183:        add     x1,ax
        !           184:        lodsw
        !           185:        add     x2,ax
        !           186: else
        !           187:        add     x1,[si+2]       ; x1 += *key++
        !           188:        add     x2,[si+4]       ; x2 += *key++
        !           189: endif
        !           190: 
        !           191: if USELODS
        !           192:        lodsw
        !           193: else
        !           194:        mov     ax,[si+6]       ; x3 *= *key++
        !           195: endif
        !           196:        mul     x3
        !           197:        sub     ax,dx
        !           198:        jz      Core2Z
        !           199:        mov     x3,es
        !           200:        adc     x3,ax
        !           201: Core2done:
        !           202: 
        !           203:        push    x1              ; s1 = x1
        !           204:        push    x2              ; s2 = x2
        !           205: 
        !           206:        xor     x1,x3           ; x1 ^= x3
        !           207:        xor     x2,x0           ; x2 ^= x0
        !           208: 
        !           209: if USELODS
        !           210:        lodsw
        !           211: else
        !           212:        mov     ax,[si+8]       ; x2 *= *key++
        !           213: endif
        !           214:        mul     x2
        !           215:        sub     ax,dx
        !           216:        jz      Core3Z
        !           217:        mov     x2,es
        !           218:        adc     x2,ax
        !           219: Core3done:
        !           220: 
        !           221:        add     x1,x2           ; x1 += x2
        !           222: 
        !           223: if USELODS
        !           224:        lodsw
        !           225: else
        !           226:        mov     ax,[si+10]      ; x1 *= *key++
        !           227: endif
        !           228:        mul     x1
        !           229:        sub     ax,dx
        !           230:        jz      Core4Z
        !           231:        mov     x1,es
        !           232:        adc     x1,ax
        !           233: Core4done:
        !           234: 
        !           235:        add     x2,x1           ; x2 += x1
        !           236: 
        !           237:        xor     x0,x1           ; x0 ^= x1
        !           238:        xor     x3,x2           ; x3 ^= x2
        !           239: 
        !           240:        pop     dx              ; s2 (pushed last)
        !           241:        xor     x1,dx           ; x1 ^= s2
        !           242:        pop     dx              ; s1
        !           243:        xor     x2,dx           ; x2 ^= s1
        !           244: 
        !           245: ; Second unrolling of loop
        !           246: if USELODS
        !           247:        lodsw
        !           248: else
        !           249:        mov     ax,[si+12]      ; x0 *= *key++
        !           250: endif
        !           251:        mul     x0
        !           252:        sub     ax,dx
        !           253:        jz      Core5Z
        !           254:        mov     x0,es
        !           255:        adc     x0,ax
        !           256: Core5done:
        !           257: 
        !           258: if USELODS
        !           259:        lodsw
        !           260:        add     x1,ax
        !           261:        lodsw
        !           262:        add     x2,ax
        !           263: else
        !           264:        add     x1,[si+14]      ; x1 += *key++
        !           265:        add     x2,[si+16]      ; x2 += *key++
        !           266: endif
        !           267: 
        !           268: if USELODS
        !           269:        lodsw
        !           270: else
        !           271:        mov     ax,[si+18]      ; x3 *= *key++
        !           272: endif
        !           273:        mul     x3
        !           274:        sub     ax,dx
        !           275:        jz      Core6Z
        !           276:        mov     x3,es
        !           277:        adc     x3,ax
        !           278: Core6done:
        !           279: 
        !           280:        push    x1              ; s1 = x1
        !           281:        push    x2              ; s2 = x2
        !           282: 
        !           283:        xor     x1,x3           ; x1 ^= x3
        !           284:        xor     x2,x0           ; x2 ^= x0
        !           285: 
        !           286: if USELODS
        !           287:        lodsw
        !           288: else
        !           289:        mov     ax,[si+20]      ; x2 *= *key++
        !           290: endif
        !           291:        mul     x2
        !           292:        sub     ax,dx
        !           293:        jz      Core7Z
        !           294:        mov     x2,es
        !           295:        adc     x2,ax
        !           296: Core7done:
        !           297: 
        !           298:        add     x1,x2           ; x1 += x2
        !           299: 
        !           300: if USELODS
        !           301:        lodsw
        !           302: else
        !           303:        mov     ax,[si+22]      ; x1 *= *key++
        !           304: endif
        !           305:        mul     x1
        !           306:        sub     ax,dx
        !           307:        jz      Core8Z
        !           308:        mov     x1,es
        !           309:        adc     x1,ax
        !           310: Core8done:
        !           311: 
        !           312:        add     x2,x1           ; x2 += x1
        !           313: 
        !           314:        xor     x0,x1           ; x0 ^= x1
        !           315:        xor     x3,x2           ; x3 ^= x2
        !           316: 
        !           317:        pop     dx              ; s2 (pushed last)
        !           318:        xor     x1,dx           ; x1 ^= s2
        !           319:        pop     dx              ; s1
        !           320:        xor     x2,dx           ; x2 ^= s1
        !           321: 
        !           322: ife USELODS
        !           323:        lea     si,[si+24]      ; step over the 12 key words used by the two rounds
        !           324: endif
        !           325: 
        !           326:        ret     ; Used as a loop instruction!  Pops Coreloop three times, then Finish
        !           327: 
        !           328: Core5Z:                        ; zero-operand fixups for the second unrolled round (same scheme as Core1Z..Core4Z)
        !           329:        neg     x0
        !           330:        jnz     Core5Za
        !           331: if USELODS
        !           332:        sub     x0,[si-2]
        !           333: else
        !           334:        sub     x0,[si+12]
        !           335: endif
        !           336: Core5Za:
        !           337:        inc     x0
        !           338:        jmp     Core5done
        !           339: Core6Z:
        !           340:        neg     x3
        !           341:        jnz     Core6Za
        !           342: if USELODS
        !           343:        sub     x3,[si-2]
        !           344: else
        !           345:        sub     x3,[si+18]
        !           346: endif
        !           347: Core6Za:
        !           348:        inc     x3
        !           349:        jmp     Core6done
        !           350: Core7Z:
        !           351:        neg     x2
        !           352:        jnz     Core7Za
        !           353: if USELODS
        !           354:        sub     x2,[si-2]
        !           355: else
        !           356:        sub     x2,[si+20]
        !           357: endif
        !           358: Core7Za:
        !           359:        inc     x2
        !           360:        jmp     Core7done
        !           361: Core8Z:
        !           362:        neg     x1
        !           363:        jnz     Core8Za
        !           364: if USELODS
        !           365:        sub     x1,[si-2]
        !           366: else
        !           367:        sub     x1,[si+22]
        !           368: endif
        !           369: Core8Za:
        !           370:        inc     x1
        !           371:        jmp     Core8done
        !           372: Core9Z:                        ; zero-operand fixup for the x0 multiply in Finish
        !           373:        neg     x0
        !           374:        jnz     Core9Za
        !           375: if USELODS
        !           376:        sub     x0,[si-2]
        !           377: else
        !           378:        sub     x0,[si]
        !           379: endif
        !           380: Core9Za:
        !           381:        inc     x0
        !           382:        jmp     Core9done
        !           383: ; Special: compute into dx (zero on entry)
        !           384: Core10Z:
        !           385:        sub     dx,x3           ; dx = -x3; ZF set only if x3 was the zero operand
        !           386:        jnz     Core10Za
        !           387: if USELODS
        !           388:        sub     dx,[si-2]       ; x3 was 0: dx = -key
        !           389: else
        !           390:        sub     dx,[si+6]
        !           391: endif
        !           392: Core10Za:
        !           393:        inc     dx              ; dx = 1 - (the other operand)
        !           394: ;      jmp     Core10done
        !           395:        ret                     ; Core10done is only a ret, so return directly to Core's caller
        !           396: 
        !           397: 
        !           398: Finish:                        ; output transformation, entered via the last loop ret
        !           399: if USELODS
        !           400:        lodsw
        !           401: else
        !           402:        mov     ax,[si]         ; x0 *= *key++
        !           403: endif
        !           404:        mul     x0
        !           405:        sub     ax,dx
        !           406:        jz      Core9Z
        !           407:        mov     x0,es
        !           408:        adc     x0,ax
        !           409: Core9done:
        !           410: 
        !           411:        xchg    x1,x2           ; outputs 1 and 2 are swapped (see "Yes, this is x2" in the pseudo-C)
        !           412: if USELODS
        !           413:        lodsw
        !           414:        add     x1,ax
        !           415:        lodsw
        !           416:        add     x2,ax
        !           417: else
        !           418:        add     x1,[si+2]       ; x1 += *key++
        !           419:        add     x2,[si+4]       ; x2 += *key++
        !           420: endif
        !           421: 
        !           422: ; This is special: compute into dx, not x3
        !           423: if USELODS
        !           424:        lodsw
        !           425: else
        !           426:        mov     ax,[si+6]       ; x3 *= *key++
        !           427: endif
        !           428:        mul     x3
        !           429:        sub     ax,dx
        !           430:        mov     dx,es           ; dx = 0; mov leaves ZF/CF from the sub intact
        !           431:        jz      Core10Z
        !           432:        adc     dx,ax           ; dx = (lo - hi) + borrow = final x3
        !           433: Core10done:
        !           434: 
        !           435:        ret                     ; back to whoever called (or pushed a return address for) Core
        !           436: 
        !           437:        endp
        !           438: 
        !           439: 
        !           440: ; Args are in, out, key (far pointers).  Encrypts one 8-byte block: byte-swaps the four words of in, runs Core, byte-swaps the results into out.
        !           441:        public  _Idea2
        !           442: _Idea2 proc far
        !           443:        cld                     ; Core and the lodsw/stosw below need the direction flag clear
        !           444:        push    bp      ; Args start at [bp+6]
        !           445:        mov     bp,sp
        !           446:        push    si
        !           447:        push    di
        !           448:        push    ds      ; 6 more bytes here, so args are at [sp+12]
        !           449:        lds     si,[bp+6]       ; in
        !           450:        lodsw
        !           451:        xchg    ah,al
        !           452:        mov     dx,ax           ; park x0 in dx: bp (= x0) is still needed as the frame pointer
        !           453:        lodsw
        !           454:        xchg    ah,al
        !           455:        mov     x1,ax
        !           456:        lodsw
        !           457:        xchg    ah,al
        !           458:        mov     x2,ax
        !           459:        lodsw
        !           460:        xchg    ah,al
        !           461:        mov     x3,ax
        !           462:        lds     si,[bp+14]      ; key
        !           463: 
        !           464:        mov     x0,dx           ; frame pointer no longer needed; bp becomes x0
        !           465: 
        !           466:        call    Core            ; returns x0, x1, x2 in place and the final x3 in dx
        !           467: 
        !           468:        mov     ax,x0           ; save x0 before bp is reused
        !           469:        mov     bp,sp
        !           470:        les     di,[bp+16]      ; out (offsets are now relative to the post-push sp); overwrites di
        !           471:        xchg    ah,al
        !           472:        stosw
        !           473:        mov     ax,x1
        !           474:        xchg    ah,al
        !           475:        stosw
        !           476:        mov     ax,x2
        !           477:        xchg    ah,al
        !           478:        stosw
        !           479:        mov     ax,dx           ; final x3 is in dx; di (x3) now holds the out pointer, so it must not be stored
        !           480:        xchg    ah,al
        !           481:        stosw
        !           482: 
        !           483:        pop     ds
        !           484:        pop     di
        !           485:        pop     si
        !           486:        pop     bp
        !           487: 
        !           488:        ret
        !           489: 
        !           490:        endp
        !           491: 
        !           492: ; Okay, the basic plan for the CFB kernel is
        !           493: ; get x0,x1,x2,x3
        !           494: ; get key pointer
        !           495: ; call core
        !           496: ; get buffer pointers
        !           497: ;Loop:
        !           498: ; lodsw
        !           499: ; xor  ax,x0
        !           500: ; mov   x0,ax
        !           501: ; stosw
        !           502: ; lodsw
        !           503: ; xor  ax,x1
        !           504: ; mov  x1,ax
        !           505: ; stosw
        !           506: ; lodsw
        !           507: ; xor  ax,x2
        !           508: ; mov  x2,ax
        !           509: ; stosw
        !           510: ; lodsw
        !           511: ; xor  ax,x3
        !           512: ; mov  x3,ax
        !           513: ; stosw
        !           514: ; push buffer pointers
        !           515: ; get key pointer
        !           516: ; call core
        !           517: ; pop buffer pointers
        !           518: ; loop
        !           519: ; lodsw/xor/etc.
        !           520: ;
        !           521: ;
        !           522: ; This function is designed to go in the middle of a byte-granularity
        !           523: ; CFB engine.  It performs "len" encryptions of the IV, encrypting
        !           524: ; 8*(len-1) bytes from the source to the destination.  The idea is
        !           525: ; that you first xor any odd leading bytes, then call this function,
        !           526: ; then xor up to 8 trailing bytes.
        !           527: 
        !           528: ; The main loop in this is 38 instructions, plus the 336 for the core
        !           529: ; makes 374 total.  That's 46.75 instructions per byte.
        !           530: ; (It's the same for IdeaCFBx)
        !           531: 
        !           532: ; IV, key, plain, cipher, len -- the four pointers are far pointers, len is a word; the updated IV is written back on exit
        !           533:        public  _IdeaCFB
        !           534: _IdeaCFB proc far       ; Args are at [sp+4]
        !           535:        cld                     ; Core and the lodsw/stosw below need the direction flag clear
        !           536:        push    bp
        !           537:        push    si
        !           538:        push    di
        !           539:        push    ds      ; 8 more bytes here, so args are at [sp+12]
        !           540: ; To be precise, IV is at 12, key at 16, plain at 20,
        !           541: ; cipher at 24 and len at 28
        !           542:        mov     bp,sp
        !           543:        lds     si,[bp+12]      ; IV
        !           544: ; Load and byte-swap IV
        !           545:        mov     ax,[si]
        !           546:        xchg    ah,al
        !           547:        mov     x1,[si+2]
        !           548:        mov     x2,[si+4]
        !           549:        xchg    bh,bl           ; byte-swap x1 (bx)
        !           550:        xchg    ch,cl           ; byte-swap x2 (cx)
        !           551:        mov     dx,[si+6]
        !           552:        xchg    dh,dl
        !           553: 
        !           554:        lds     si,[bp+16]      ; Key
        !           555:        mov     x0,ax           ; bp is free now: load x0
        !           556:        mov     x3,dx
        !           557: 
        !           558:        call    Core            ; returns x0, x1, x2 in place and the final x3 in dx
        !           559: IdeaCFBLoop:
        !           560: ;      mov     ax,x0
        !           561: ;      mov     bp,sp
        !           562: ;      dec     WORD PTR [bp+28]        ; Decrement count
        !           563: ;      jz      IdeaCFBEnd
        !           564: ;      lds     si,[bp+20]
        !           565: ;      les     di,[bp+24]
        !           566: ;      mov     x0,ax
        !           567: ; Alternate code: (which is faster?  Two moves or three segment overrides?)
        !           568:        mov     si,sp           ; bp holds x0, so address the args through si with ss: overrides
        !           569:        dec     WORD PTR ss:[si+28]     ; Decrement len in place on the stack
        !           570:        jz      IdeaCFBEnd
        !           571:        les     di,ss:[si+24]   ; es:di = cipher (destination)
        !           572:        lds     si,ss:[si+20]   ; ds:si = plain (source)
        !           573: 
        !           574:        lodsw
        !           575:        xchg    ah,al
        !           576:        xor     ax,x0
        !           577:        mov     x0,ax           ; the xor result is both the output word and the next Core input
        !           578:        xchg    ah,al
        !           579:        stosw
        !           580:        lodsw
        !           581:        xchg    ah,al
        !           582:        xor     ax,x1
        !           583:        mov     x1,ax
        !           584:        xchg    ah,al
        !           585:        stosw
        !           586:        lodsw
        !           587:        xchg    ah,al
        !           588:        xor     ax,x2
        !           589:        mov     x2,ax
        !           590:        xchg    ah,al
        !           591:        stosw
        !           592:        lodsw
        !           593:        xchg    ah,al
        !           594:        xor     ax,dx           ; x3 lives in dx here: di is the destination pointer
        !           595:        mov     dx,ax
        !           596:        xchg    ah,al
        !           597:        stosw
        !           598: 
        !           599: ;      mov     ax,x0
        !           600: ;      mov     bp,sp
        !           601: ;      mov     [bp+20],si      ; Save source offset
        !           602: ;      mov     [bp+24],di      ; Save destination offset
        !           603: ;      lds     si,[bp+16]      ; Key
        !           604: ;      mov     x0,ax           ; Get x0 in place for another iteration
        !           605: ; Alternate code for the above: (which is faster?  One move or three ss:?)
        !           606:        mov     ax,si
        !           607:        mov     si,sp
        !           608:        mov     ss:[si+20],ax   ; Save source offset
        !           609:        mov     ss:[si+24],di   ; Save destination offset
        !           610:        lds     si,ss:[si+16]   ; Key
        !           611: 
        !           612:        mov     x3,dx           ; Get x3 in place
        !           613:        mov     ax,OFFSET IdeaCFBLoop
        !           614:        push    ax              ; hand-built return address: Core's final ret resumes at IdeaCFBLoop
        !           615:        jmp     Core
        !           616: 
        !           617: IdeaCFBEnd:
        !           618: ;      lds     si,[bp+12]
        !           619:        lds     di,ss:[si+12]   ; Get IV for writing back
        !           620: 
        !           621:        mov     ax,x0
        !           622:        xchg    ah,al
        !           623:        mov     [di],ax         ; Use stosw?
        !           624:        xchg    bh,bl
        !           625:        xchg    ch,cl
        !           626:        mov     [di+2],x1
        !           627:        mov     [di+4],x2
        !           628:        xchg    dh,dl           ; x3 is still in dx (Core's output register)
        !           629:        mov     [di+6],dx
        !           630: 
        !           631:        pop     ds
        !           632:        pop     di
        !           633:        pop     si
        !           634:        pop     bp
        !           635: 
        !           636:        ret
        !           637: 
        !           638:        endp
        !           639: 
        !           640: ; This decoding step is similar, except that instead of
        !           641: ;      lods
        !           642: ;      xor     x0,ax
        !           643: ;      mov     ax,x0
        !           644: ;      stos
        !           645: ; the feedback step is
        !           646: ;      lods
        !           647: ;      xchg    x0,ax
        !           648: ;      xor     ax,x0
        !           649: ;      stos
        !           650: 
        !           651: ; IV, key, cipher, plain, len -- four far pointers followed by a 16-bit count that is decremented once per 8-byte block
        !           652:        public  _IdeaCFBx
        !           653: _IdeaCFBx proc far       ; Args are at [sp+4], just above the 4-byte far return address
        !           654:        cld                     ; lodsw/stosw below must step upward through memory
        !           655:        push    bp
        !           656:        push    si
        !           657:        push    di
        !           658:        push    ds      ; 4 more words (8 bytes) here, so args are at [sp+12]
        !           659:        mov     bp,sp
        !           660:        lds     si,[bp+12]      ; IV
        !           661: ; Load and byte-swap IV
        !           662:        mov     ax,[si]         ; word 0 is staged in ax; it is copied to x0 only after the last [bp+..] access below
        !           663:        xchg    ah,al
        !           664:        mov     x1,[si+2]
        !           665:        mov     x2,[si+4]
        !           666:        xchg    bh,bl           ; byte-swap of x1 (x1 is presumably an alias for bx, defined outside this section -- confirm)
        !           667:        xchg    ch,cl           ; byte-swap of x2 (presumably an alias for cx -- confirm)
        !           668:        mov     dx,[si+6]       ; word 3 is staged in dx and copied to x3 below
        !           669:        xchg    dh,dl
        !           670: 
        !           671:        lds     si,[bp+16]      ; Key
        !           672:        mov     x0,ax           ; (commented-out variants below save x0 around "mov bp,sp", so x0 is presumably bp -- confirm)
        !           673:        mov     x3,dx
        !           674: 
        !           675:        call    Core            ; NOTE(review): Core is defined elsewhere; from its use here it appears to take x0..x3 and the key at ds:si and to leave results in x0, x1, x2 and dx -- confirm
        !           676: IdeaCFBxLoop:                   ; Core returns here, both after the call above and via the offset pushed at the bottom of the loop
        !           677: ;      mov     ax,x0
        !           678: ;      mov     bp,sp
        !           679: ;      dec     WORD PTR [bp+28]        ; Decrement count
        !           680: ;      jz      IdeaCFBxEnd
        !           681: ;      lds     si,[bp+20]
        !           682: ;      les     di,[bp+24]
        !           683: ;      mov     x0,ax
        !           684: ; Alternate code: (which is faster?  Two moves or three segment overrides)
        !           685:        mov     si,sp           ; Address the arguments through ss:[si+..] instead of bp
        !           686:        dec     WORD PTR ss:[si+28]     ; Decrement count (len)
        !           687:        jz      IdeaCFBxEnd     ; NOTE(review): the count is decremented before any block is handled, so len = n processes n-1 blocks and len = 0 does not stop here -- confirm callers expect this
        !           688:        les     di,ss:[si+24]   ; es:di = fourth argument (plain), destination for stosw
        !           689:        lds     si,ss:[si+20]   ; ds:si = third argument (cipher), source for lodsw; done last because it overwrites si
        !           690: ; For each of the four words: load and byte-swap the input word, exchange it with the state register (so the input word becomes the feedback value for the next Core), xor the two to form the output word, swap bytes back and store it.
        !           691:        lodsw
        !           692:        xchg    ah,al
        !           693:        xchg    x0,ax           ; x0 = input word, ax = previous x0
        !           694:        xor     ax,x0           ; ax = previous x0 xor input word
        !           695:        xchg    ah,al
        !           696:        stosw
        !           697:        lodsw
        !           698:        xchg    ah,al
        !           699:        xchg    x1,ax
        !           700:        xor     ax,x1
        !           701:        xchg    ah,al
        !           702:        stosw
        !           703:        lodsw
        !           704:        xchg    ah,al
        !           705:        xchg    x2,ax
        !           706:        xor     ax,x2
        !           707:        xchg    ah,al
        !           708:        stosw
        !           709:        lodsw
        !           710:        xchg    ah,al
        !           711:        xchg    dx,ax           ; The fourth state word is held in dx here, not x3; it is copied to x3 before Core below
        !           712:        xor     ax,dx
        !           713:        xchg    ah,al
        !           714:        stosw
        !           715: 
        !           716: ;      mov     ax,x0
        !           717: ;      mov     bp,sp
        !           718: ;      mov     [bp+20],si      ; Save source offset
        !           719: ;      mov     [bp+24],di      ; Save destination offset
        !           720: ;      lds     si,[bp+16]      ; Key
        !           721: ;      mov     x0,ax           ; Get x0 in place for another iteration
        !           722: ; Alternate code for the above: (which is faster?  One move or three ss:?)
        !           723:        mov     ax,si           ; si is about to be reused to address the arguments, so park the source offset in ax
        !           724:        mov     si,sp
        !           725:        mov     ss:[si+20],ax   ; Save source offset
        !           726:        mov     ss:[si+24],di   ; Save destination offset
        !           727:        lds     si,ss:[si+16]   ; Key
        !           728: 
        !           729:        mov     x3,dx           ; Get x3 in place
        !           730:        mov     ax,OFFSET IdeaCFBxLoop  ; Push a return offset by hand and jump: Core's return then lands on IdeaCFBxLoop
        !           731:        push    ax              ; Only one word is pushed, so this assumes Core returns with a near ret -- confirm
        !           732:        jmp     Core
        !           733: 
        !           734: IdeaCFBxEnd:                    ; Entered only from the jz above, where si still equals sp
        !           735: ;      lds     si:[bp+12]
        !           736:        lds     di,ss:[si+12]   ; Get IV for writing back
        !           737: ; Store x0, x1, x2 and dx back into the IV buffer, byte-swapped again (mirror of the load at entry)
        !           738:        mov     ax,x0
        !           739:        xchg    ah,al
        !           740:        mov     [di],ax         ; Use stosw?
        !           741:        xchg    bh,bl
        !           742:        xchg    ch,cl
        !           743:        mov     [di+2],x1
        !           744:        mov     [di+4],x2
        !           745:        xchg    dh,dl
        !           746:        mov     [di+6],dx
        !           747: 
        !           748: ; Restore the saved registers in the reverse order of the pushes at entry
        !           749:        pop     ds
        !           750:        pop     di
        !           751:        pop     si
        !           752:        pop     bp
        !           753: 
        !           754:        ret                     ; No operand: the arguments are not popped here
        !           755: 
        !           756:        endp
        !           757: 
        !           758: 
        !           759: 
        !           760: 
        !           761:        end
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.