|
|
%PAGESIZE 59	; Turbo assembler formatting codes
%BIN 13
%LINUM 3

; Copyright (c) 1993 Colin Plumb.  This code may be freely
; distributed under the terms of the GNU General Public Licence.

	.model large
	.code

; A core operation in IDEA is multiplication modulo 65537.
; The valid inputs, 1 through 65536 inclusive, are represented in
; 16-bit registers modulo 65536.  I.e. a value of 0 means 65536,
; or -1.  Thus, we need to test for that specially.  -x, modulo
; 65537, is 65537-x = 1-x.
; For any other number, represent the product as a*65536+b.  Since
; 65536 = -1 (mod 65537), this is the same number as b-a.  Should
; this result be negative (generate a borrow), -n mod 65537 = 1-n
; mod 65536.  Or in other words, if you add the borrow bit back on,
; you get the right answer.

; This is what the assembly code does.  It forms a zero, and adds
; that on with carry.

; Another useful optimisation takes advantage of the fact that
; a and b are equal only if the answer is congruent to 0 mod 65537.
; Since 65537 is prime, this happens only if one of the inputs is
; congruent to 0 mod 65537.  Since the inputs are all less than 65537,
; this means it must have been zero.

; The code below tests for a zero result of the subtraction, and if
; one arises, it branches out of line to figure out what happened.


; This code implements the IDEA encryption algorithm.
; It follows in pseudo-C, where the * operator operates
; modulo 65537, as IDEA needs.  (If you don't understand,
; learn IDEA better.)

; IDEA works on 16-bit units.  If you're processing bytes,
; it's defined to be big-endian, so an Intel machine needs to
; swap the bytes around.

; void Idea(u_int16 *in, u_int16 *out, u_int16 *key)
; {
;	register u_int16 x0, x1, x2, x3, s1, s2, round;
;
;	x0 = *in++;  x1 = *in++;  x2 = *in++;  x3 = *in;
;
;	for (round = 0; round < 8; round++) {
;		x0 *= *key++;
;		x1 += *key++;
;		x2 += *key++;
;		x3 *= *key++;
;
;		s1 = x1;  s2 = x2;
;		x2 ^= x0;  x1 ^= x3;
;
;		x2 *= *key++;
;		x1 += x2;
;		x1 *= *key++;
;		x2 += x1;
;
;		x0 ^= x1;  x3 ^= x2;
;		x1 ^= s2;  x2 ^= s1;
;	}
;	*out++ = x0 * *key++;
;	*out++ = x2 + *key++;	/* Yes, this is x2, not x1 */
;	*out++ = x1 + *key++;
;	*out   = x3 * *key;
; }

; ds:si points to key, ax, dx are temps, args in bx, cx, di, bp
; Trashes *all* registers.  direction flag must be clear.
; Leaves es zero.

; Since there is no spare register to hold the loop count, I make
; clever use of the stack, pushing the start of the loop several
; times and using a ret instruction to do the return.

; Annoyingly, lods is fastest on 8086's, but other techniques are
; best on 386's.  Well, that's what the manual says, but real
; life is different.  USELODS wins on a 386SX, at least.
; Leave it set for all platforms.

; USELODS = 1: fetch key words with lodsw (si advances as we go).
; USELODS = 0: fetch key words with [si+disp] and bump si once per pass.
USELODS	equ	1

; Register assignment for the four 16-bit halves of the cipher block.
; bp must be x0 for some of the code below to work
x0	equ	bp
x1	equ	bx
x2	equ	cx
x3	equ	di
; di must be x3 for some of the code below to work

;; Now, this is rather interesting.  We test for zero arguments
;; after the multiply.  Assuming random inputs, one or both are
;; zero (2^17-1)/2^32, or approximately 1/32768 of the time.
;; Encryption in any feedback mode produces essentially random
;; inputs, so average-case analysis is okay.  While we don't
;; want the out-of-line code to waste time, it is not worth
;; slowing down the in-line case to speed it up.
;;
;; Basically, we start inverting the source x, and if that was 0,
;; we use the inverse of the key instead.

; Out-of-line fix-ups for the multiplies in the first half of Coreloop.
; Reached only when low word == high word of the product, i.e. when one
; operand was 0 (meaning 65536 = -1 mod 65537).  The answer is then
; 1 - (the other operand):  negate x; if x itself was zero, use the key
; word instead (at [si-2] when lodsw has already stepped past it).
; NOTE: these stubs must stay within short-jump range of the jz
; instructions that target them; do not reorder.

Core1Z:
	neg	x0
	jnz	Core1Za
if USELODS
	sub	x0,[si-2]
else
	sub	x0,[si]
endif
Core1Za:
	inc	x0
	jmp	Core1done
Core2Z:
	neg	x3
	jnz	Core2Za
if USELODS
	sub	x3,[si-2]
else
	sub	x3,[si+6]
endif
Core2Za:
	inc	x3
	jmp	Core2done
Core3Z:
	neg	x2
	jnz	Core3Za
if USELODS
	sub	x2,[si-2]
else
	sub	x2,[si+8]
endif
Core3Za:
	inc	x2
	jmp	Core3done
Core4Z:
	neg	x1
	jnz	Core4Za
if USELODS
	sub	x1,[si-2]
else
	sub	x1,[si+10]
endif
Core4Za:
	inc	x1
	jmp	Core4done

; We need a constant 0 that we can move into a register without affecting
; the carry flag (as the classic xor ax,ax is wont to do), so we use the
; es register for a constant 0 source.  This is okay even in protected
; mode.  (I *told* you this was tricky code!)

; BTW, since you wanted to know, this is 8 + 78*4 + 16 = 336 instructions.

;-----------------------------------------------------------------------
; Core - the eight IDEA rounds plus the output transformation.
; In:    ds:si -> key schedule (52 words are consumed)
;        x0 (bp), x1 (bx), x2 (cx), x3 (di) = input block words
;        direction flag clear (lodsw is used when USELODS is set)
; Out:   bp = output word 0, bx = output word 1, cx = output word 2,
;        dx = output word 3  (NOT di - the last multiply lands in dx)
;        es = 0
; Clobb: ax, si, di, flags
; The loop body holds two rounds.  Its address is pushed three times
; under the address of Finish, so the "ret" at the bottom of the body
; re-runs it three more times (four passes = eight rounds), then enters
; Finish, whose own ret returns to the real caller.
;-----------------------------------------------------------------------
Core	proc	near
	xor	ax,ax
	mov	es,ax		; es = constant zero for flag-preserving loads
	mov	ax,OFFSET Finish
	push	ax
	mov	ax,OFFSET Coreloop
	push	ax		; Three extra passes via ret, then Finish
	push	ax
	push	ax

Coreloop:
if USELODS
	lodsw
else
	mov	ax,[si]		; x0 *= *key++
endif
	mul	x0
	sub	ax,dx		; low - high; borrow means add 1 back
	jz	Core1Z
	mov	x0,es		; x0 = 0 without disturbing carry
	adc	x0,ax
Core1done:

if USELODS
	lodsw
	add	x1,ax
	lodsw
	add	x2,ax
else
	add	x1,[si+2]	; x1 += *key++
	add	x2,[si+4]	; x2 += *key++
endif

if USELODS
	lodsw
else
	mov	ax,[si+6]	; x3 *= *key++
endif
	mul	x3
	sub	ax,dx
	jz	Core2Z
	mov	x3,es
	adc	x3,ax
Core2done:

	push	x1		; s1 = x1
	push	x2		; s2 = x2

	xor	x1,x3		; x1 ^= x3
	xor	x2,x0		; x2 ^= x0

if USELODS
	lodsw
else
	mov	ax,[si+8]	; x2 *= *key++
endif
	mul	x2
	sub	ax,dx
	jz	Core3Z
	mov	x2,es
	adc	x2,ax
Core3done:

	add	x1,x2		; x1 += x2

if USELODS
	lodsw
else
	mov	ax,[si+10]	; x1 *= *key++
endif
	mul	x1
	sub	ax,dx
	jz	Core4Z
	mov	x1,es
	adc	x1,ax
Core4done:

	add	x2,x1		; x2 += x1

	xor	x0,x1		; x0 ^= x1
	xor	x3,x2		; x3 ^= x2

	pop	dx
	xor	x1,dx		; x1 ^= s2
	pop	dx
	xor	x2,dx		; x2 ^= s1

; Second unrolling of loop
if USELODS
	lodsw
else
	mov	ax,[si+12]	; x0 *= *key++
endif
	mul	x0
	sub	ax,dx
	jz	Core5Z
	mov	x0,es
	adc	x0,ax
Core5done:

if USELODS
	lodsw
	add	x1,ax
	lodsw
	add	x2,ax
else
	add	x1,[si+14]	; x1 += *key++
	add	x2,[si+16]	; x2 += *key++
endif

if USELODS
	lodsw
else
	mov	ax,[si+18]	; x3 *= *key++
endif
	mul	x3
	sub	ax,dx
	jz	Core6Z
	mov	x3,es
	adc	x3,ax
Core6done:

	push	x1		; s1 = x1
	push	x2		; s2 = x2

	xor	x1,x3		; x1 ^= x3
	xor	x2,x0		; x2 ^= x0

if USELODS
	lodsw
else
	mov	ax,[si+20]	; x2 *= *key++
endif
	mul	x2
	sub	ax,dx
	jz	Core7Z
	mov	x2,es
	adc	x2,ax
Core7done:

	add	x1,x2		; x1 += x2

if USELODS
	lodsw
else
	mov	ax,[si+22]	; x1 *= *key++
endif
	mul	x1
	sub	ax,dx
	jz	Core8Z
	mov	x1,es
	adc	x1,ax
Core8done:

	add	x2,x1		; x2 += x1

	xor	x0,x1		; x0 ^= x1
	xor	x3,x2		; x3 ^= x2

	pop	dx
	xor	x1,dx		; x1 ^= s2
	pop	dx
	xor	x2,dx		; x2 ^= s1

ife USELODS
	lea	si,[si+24]	; step past the 12 key words just used
endif

	ret			; Used as a loop instruction!

; Out-of-line fix-ups for the second unrolled round and for Finish.
Core5Z:
	neg	x0
	jnz	Core5Za
if USELODS
	sub	x0,[si-2]
else
	sub	x0,[si+12]
endif
Core5Za:
	inc	x0
	jmp	Core5done
Core6Z:
	neg	x3
	jnz	Core6Za
if USELODS
	sub	x3,[si-2]
else
	sub	x3,[si+18]
endif
Core6Za:
	inc	x3
	jmp	Core6done
Core7Z:
	neg	x2
	jnz	Core7Za
if USELODS
	sub	x2,[si-2]
else
	sub	x2,[si+20]
endif
Core7Za:
	inc	x2
	jmp	Core7done
Core8Z:
	neg	x1
	jnz	Core8Za
if USELODS
	sub	x1,[si-2]
else
	sub	x1,[si+22]
endif
Core8Za:
	inc	x1
	jmp	Core8done
Core9Z:
	neg	x0
	jnz	Core9Za
if USELODS
	sub	x0,[si-2]
else
	sub	x0,[si]
endif
Core9Za:
	inc	x0
	jmp	Core9done
; Special: compute into dx (zero on entry)
Core10Z:
	sub	dx,x3
	jnz	Core10Za
if USELODS
	sub	dx,[si-2]
else
	sub	dx,[si+6]
endif
Core10Za:
	inc	dx
;	jmp	Core10done
	ret			; Same effect as jumping to Core10done's ret


; Output transformation; entered via the last return address pushed
; by Core's prologue.
Finish:
if USELODS
	lodsw
else
	mov	ax,[si]		; x0 *= *key++
endif
	mul	x0
	sub	ax,dx
	jz	Core9Z
	mov	x0,es
	adc	x0,ax
Core9done:

	xchg	x1,x2		; Output order is x0, x2, x1, x3
if USELODS
	lodsw
	add	x1,ax
	lodsw
	add	x2,ax
else
	add	x1,[si+2]	; x1 += *key++
	add	x2,[si+4]	; x2 += *key++
endif

; This is special: compute into dx, not x3
if USELODS
	lodsw
else
	mov	ax,[si+6]	; x3 *= *key++
endif
	mul	x3
	sub	ax,dx
	mov	dx,es		; mov leaves the flags from sub intact
	jz	Core10Z
	adc	dx,ax
Core10done:

	ret

	endp


;-----------------------------------------------------------------------
; void Idea2(in, out, key) - encrypt one 8-byte block.
; Far procedure, C stack arguments, all far pointers:
;   in  at [bp+6], out at [bp+10], key at [bp+14] (after push bp).
; Input and output words are byte-swapped (IDEA is big-endian).
; Preserves bp, si, di, ds.  Clears the direction flag.
;-----------------------------------------------------------------------
; Args are in, out, key
	public	_Idea2
_Idea2	proc	far
	cld
	push	bp		; Args start at [bp+6]
	mov	bp,sp
	push	si
	push	di
	push	ds		; 6 words on the stack now, so args are at [sp+12]
	lds	si,[bp+6]	; in
	lodsw
	xchg	ah,al
	mov	dx,ax		; Park x0 in dx; bp is still the frame pointer
	lodsw
	xchg	ah,al
	mov	x1,ax
	lodsw
	xchg	ah,al
	mov	x2,ax
	lodsw
	xchg	ah,al
	mov	x3,ax
	lds	si,[bp+14]	; key

	mov	x0,dx

	call	Core

	mov	ax,x0		; Rescue x0 before bp is reused
	mov	bp,sp
	les	di,[bp+16]	; out (saved ds/di/si/bp + far return = 12 bytes)
	xchg	ah,al
	stosw
	mov	ax,x1
	xchg	ah,al
	stosw
	mov	ax,x2
	xchg	ah,al
	stosw
	mov	ax,dx		; Core leaves the last word in dx; di is now
				; the output pointer, not x3
	xchg	ah,al
	stosw

	pop	ds
	pop	di
	pop	si
	pop	bp

	ret

	endp

; Okay, the basic plan for the CFB kernel is
;	get x0,x1,x2,x3
;	get key pointer
;	call core
;	get buffer pointers
;Loop:
;	lodsw
;	xor	ax,x0
;	mov	x0,ax
;	stosw
;	lodsw
;	xor	ax,x1
;	mov	x1,ax
;	stosw
;	lodsw
;	xor	ax,x2
;	mov	x2,ax
;	stosw
;	lodsw
;	xor	ax,x3
;	mov	x3,ax
;	stosw
;	push buffer pointers
;	get key pointer
;	call	core
;	pop buffer pointers
;	loop
;	lodsw/xor/etc.
;
;
; This function is designed to go in the middle of a byte-granularity
; CFB engine.  It performs "len" encryptions of the IV, encrypting
; 8*(len-1) bytes from the source to the destination.  The idea is
; that you first xor any odd leading bytes, then call this function,
; then xor up to 8 trailing bytes.

; The main loop in this is 38 instructions, plus the 336 for the core
; makes 374 total.  That's 46.75 instructions per byte.
; (It's the same for IdeaCFBx)

;-----------------------------------------------------------------------
; void IdeaCFB(IV, key, plain, cipher, len) - CFB-mode encryption kernel.
; Far procedure, C stack arguments; IV, key, plain, cipher are far
; pointers, len is a 16-bit count.
; The IV is encrypted once up front and again after every 8-byte block;
; the final encrypted IV is written back through the IV pointer.
; The plain/cipher offsets on the stack are updated in place as the
; loop advances, and len is decremented in place.
; len is decremented before it is tested, so len == 0 wraps to 65535
; blocks; callers must pass len >= 1.
; Preserves bp, si, di, ds.  Clears the direction flag.
;-----------------------------------------------------------------------
; IV, key, plain, cipher, len
	public	_IdeaCFB
_IdeaCFB proc far	; Args are at [sp+4]
	cld
	push	bp
	push	si
	push	di
	push	ds	; 8 more bytes here, so args are at [sp+12]
			; To be precise, IV is at 12, key at 16, plain at 20,
			; cipher at 24 and len at 28
	mov	bp,sp
	lds	si,[bp+12]	; IV
; Load and byte-swap IV
	mov	ax,[si]
	xchg	ah,al
	mov	x1,[si+2]
	mov	x2,[si+4]
	xchg	bh,bl
	xchg	ch,cl
	mov	dx,[si+6]
	xchg	dh,dl

	lds	si,[bp+16]	; Key
	mov	x0,ax		; bp is no longer needed as a frame pointer
	mov	x3,dx

	call	Core
; Here bp/bx/cx/dx hold the encrypted IV (Core returns x3 in dx).
IdeaCFBLoop:
;	mov	ax,x0
;	mov	bp,sp
;	dec	WORD PTR [bp+28]	; Decrement count
;	jz	IdeaCFBEnd
;	lds	si,[bp+20]
;	les	di,[bp+24]
;	mov	x0,ax
; Alternate code: (which is faster?  Two moves or three segment overrides?)
	mov	si,sp
	dec	WORD PTR ss:[si+28]	; Decrement count
	jz	IdeaCFBEnd
	les	di,ss:[si+24]		; cipher (destination)
	lds	si,ss:[si+20]		; plain (source)

; Feedback step: cipher word = plain word xor keystream word, and the
; (byte-swapped) cipher word becomes the next input to Core.
	lodsw
	xchg	ah,al
	xor	ax,x0
	mov	x0,ax
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xor	ax,x1
	mov	x1,ax
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xor	ax,x2
	mov	x2,ax
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xor	ax,dx
	mov	dx,ax
	xchg	ah,al
	stosw

;	mov	ax,x0
;	mov	bp,sp
;	mov	[bp+20],si	; Save source offset
;	mov	[bp+24],di	; Save destination offset
;	lds	si,[bp+16]	; Key
;	mov	x0,ax		; Get x0 in place for another iteration
; Alternate code for the above: (which is faster?  One move or three ss:?)
	mov	ax,si
	mov	si,sp
	mov	ss:[si+20],ax	; Save source offset
	mov	ss:[si+24],di	; Save destination offset
	lds	si,ss:[si+16]	; Key

	mov	x3,dx		; Get x3 in place
	mov	ax,OFFSET IdeaCFBLoop
	push	ax		; Hand-made return address, so that
	jmp	Core		; Core's final ret lands on IdeaCFBLoop

IdeaCFBEnd:
;	lds	si,[bp+12]
	lds	di,ss:[si+12]	; Get IV for writing back (si == sp here)

	mov	ax,x0
	xchg	ah,al
	mov	[di],ax		; Use stosw?
	xchg	bh,bl
	xchg	ch,cl
	mov	[di+2],x1
	mov	[di+4],x2
	xchg	dh,dl
	mov	[di+6],dx

	pop	ds
	pop	di
	pop	si
	pop	bp

	ret

	endp

; This decoding step is similar, except that instead of
;	lods
;	xor	x0,ax
;	mov	ax,x0
;	stos
; the feedback step is
;	lods
;	xchg	x0,ax
;	xor	ax,x0
;	stos

;-----------------------------------------------------------------------
; void IdeaCFBx(IV, key, cipher, plain, len) - CFB-mode decryption kernel.
; Same stack layout and structure as _IdeaCFB: IV at [sp+12], key at 16,
; source (cipher) at 20, destination (plain) at 24, len at 28 once the
; four registers are saved.  The only difference is the feedback step:
; the incoming ciphertext word is kept as the next Core input and the
; xor result is what gets stored.
; len is decremented before it is tested, so len == 0 wraps to 65535
; blocks; callers must pass len >= 1.
; Preserves bp, si, di, ds.  Clears the direction flag.
;-----------------------------------------------------------------------
; IV, key, cipher, plain, len
	public	_IdeaCFBx
_IdeaCFBx proc far	; Args are at [sp+4]
	cld
	push	bp
	push	si
	push	di
	push	ds	; 8 more bytes here, so args are at [sp+12]
	mov	bp,sp
	lds	si,[bp+12]	; IV
; Load and byte-swap IV
	mov	ax,[si]
	xchg	ah,al
	mov	x1,[si+2]
	mov	x2,[si+4]
	xchg	bh,bl
	xchg	ch,cl
	mov	dx,[si+6]
	xchg	dh,dl

	lds	si,[bp+16]	; Key
	mov	x0,ax		; bp is no longer needed as a frame pointer
	mov	x3,dx

	call	Core
; Here bp/bx/cx/dx hold the encrypted IV (Core returns x3 in dx).
IdeaCFBxLoop:
;	mov	ax,x0
;	mov	bp,sp
;	dec	WORD PTR [bp+28]	; Decrement count
;	jz	IdeaCFBxEnd
;	lds	si,[bp+20]
;	les	di,[bp+24]
;	mov	x0,ax
; Alternate code: (which is faster?  Two moves or three segment overrides)
	mov	si,sp
	dec	WORD PTR ss:[si+28]	; Decrement count
	jz	IdeaCFBxEnd
	les	di,ss:[si+24]		; plain (destination)
	lds	si,ss:[si+20]		; cipher (source)

; Feedback step: keep the cipher word as the next Core input, store
; cipher word xor keystream word.
	lodsw
	xchg	ah,al
	xchg	x0,ax
	xor	ax,x0
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xchg	x1,ax
	xor	ax,x1
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xchg	x2,ax
	xor	ax,x2
	xchg	ah,al
	stosw
	lodsw
	xchg	ah,al
	xchg	dx,ax
	xor	ax,dx
	xchg	ah,al
	stosw

;	mov	ax,x0
;	mov	bp,sp
;	mov	[bp+20],si	; Save source offset
;	mov	[bp+24],di	; Save destination offset
;	lds	si,[bp+16]	; Key
;	mov	x0,ax		; Get x0 in place for another iteration
; Alternate code for the above: (which is faster?  One move or three ss:?)
	mov	ax,si
	mov	si,sp
	mov	ss:[si+20],ax	; Save source offset
	mov	ss:[si+24],di	; Save destination offset
	lds	si,ss:[si+16]	; Key

	mov	x3,dx		; Get x3 in place
	mov	ax,OFFSET IdeaCFBxLoop
	push	ax		; Hand-made return address, so that
	jmp	Core		; Core's final ret lands on IdeaCFBxLoop

IdeaCFBxEnd:
;	lds	si,[bp+12]
	lds	di,ss:[si+12]	; Get IV for writing back (si == sp here)

	mov	ax,x0
	xchg	ah,al
	mov	[di],ax		; Use stosw?
	xchg	bh,bl
	xchg	ch,cl
	mov	[di+2],x1
	mov	[di+4],x2
	xchg	dh,dl
	mov	[di+6],dx


	pop	ds
	pop	di
	pop	si
	pop	bp

	ret

	endp




	end
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.