|
|
1.1 root 1: %PAGESIZE 59 ; Turbo assembler formatting codes
2: %BIN 13
3: %LINUM 3
4:
5: ; Copyright (c) 1993 Colin Plumb. This code may be freely
6: ; distributed under the terms of the GNU General Public Licence.
7:
8: .model large
9: .code
10:
11: ; A core operation in IDEA is multiplication modulo 65537.
12: ; The valid inputs, 1 through 65536 inclusive are represented in
13: ; 16-bit registers modulo 65536. I.e. a value of 0 means 65536,
14: ; or -1. Thus, we need to test for that specially. -x, modulo
15: ; 65537, is 65537-x = 1-x.
16: ; For any other number, represent the product as a*65536+b. Since
17: ; 65536 = -1 (mod 65537), this is the same number as b-a. Should
18: ; this result be negative (generate a borrow), -n mod 65537 = 1-n
19: ; mod 65536. Or in other words, if you add the borrow bit back on,
20: ; you get the right answer.
21:
22: ; This is what the assembly code does. It forms a zero, and adds
23: ; that on with carry.
24:
25: ; Another useful optimisation takes advantage of the fact that
26: ; a and b are equal only if the answer is congruent to 0 mod 65537.
27: ; Since 65537 is prime, this happens only if one of the inputs is
28: ; congruent to 0 mod 65537. Since the inputs are all less than 65537,
29: ; this means it must have been zero.
30:
31: ; The code below tests for a zero result of the subtraction, and if
32: ; one arises, it branches out of line to figure out what happened.
33:
34:
35: ; This code implements the IDEA encryption algorithm.
36: ; It follows in pseudo-C, where the * operator operates
37: ; modulo 65537, as Idea needs. (If you don't understand,
38: ; learn IDEA better.)
39:
40: ; IDEA works on 16-bit units. If you're processing bytes,
41: ; it's defined to be big-endian, so an Intel machine needs to
42: ; swap the bytes around.
43:
44: ; void Idea(u_int16 *in, u_int16 *out, u_int16 *key)
45: ; {
46: ; register u_int16 x0, x1, x2, x3, s1, s2, round;
47: ;
48: ; x0 = *in++; x1 = *in++; x2 = *in++; x3 = *in;
49: ;
50: ; for (round = 0; round < 8; round++) {
51: ; x0 *= *key++;
52: ; x1 += *key++;
53: ; x2 += *key++;
54: ; x3 *= *key++;
55: ;
56: ; s1 = x1; s2 = x2;
57: ; x2 ^= x0; x1 ^= x3;
58: ;
59: ; x2 *= *key++;
60: ; x1 += x2;
61: ; x1 *= *key++;
62: ; x2 += x1;
63: ;
64: ; x0 ^= x1; x3 ^= x2;
65: ; x1 ^= s2; x2 ^= s1;
66: ; }
67: ; *out++ = x0 * *key++;
68: ; *out++ = x2 + *key++; /* Yes, this is x2, not x1 */
69: ; *out++ = x1 + *key++;
70: ; *out = x3 * *key;
71: ; }
72:
73: ; ds:si points to key, ax, dx are temps, args in bx, cx, di, bp
74: ; Trashes *all* registers. direction flag must be clear.
75: ; Leaves es zero.
76:
77: ; Since there is no spare register to hold the loop count, I make
78: ; clever use of the stack, pushing the start of the loop several
79: ; times and using a ret instruction to do the return.
80:
81: ; Annoyingly, lods is fastest on 8086's, but other techniques are
82: ; best on 386's. Well, that's what the manual says, but real
83: ; life is different. USELODS wins on a 386SX, at least.
84: ; Leave it set for all platforms.
85:
; Build-time switch and register aliases used by every routine below.
86: USELODS equ 1 ; nonzero: key words are fetched with lodsw (si advances); zero: [si+disp] addressing is assembled instead
87:
88: ; bp must be x0 for some of the code below to work
89: x0 equ bp ; first 16-bit word of the block
90: x1 equ bx ; second 16-bit word of the block
91: x2 equ cx ; third 16-bit word of the block
92: x3 equ di ; fourth 16-bit word of the block
93: ; di must be x3 for some of the code below to work
93: ; di must be x3 for some of the code below to work
94:
95: ;; Now, this is rather interesting. We test for zero arguments
96: ;; after the multiply. Assuming random inputs, one or both are
97: ;; zero (2^17-1)/2^32, or approximately 1/32768 of the time.
98: ;; Encryption in any feedback mode produces essentially random
99: ;; inputs, so average-case analysis is okay. While we don't
100: ;; want the out-of-line code to waste time, it is not worth
101: ;; slowing down the in-line case to speed it up.
102: ;;
103: ;; Basically, we start inverting the source x, and if that was 0,
104: ;; we use the inverse of the key instead.
105:
; Out-of-line fix-ups for the four multiplies of the first unrolled round
; in Coreloop.  Each handler is reached by the "jz CoreNZ" that follows
; "sub ax,dx", i.e. when low word minus high word of the product is zero.
; The handler replaces the data register with (1 - operand): it negates the
; data register; if that register was itself zero it subtracts the key word
; instead; then it adds one and jumps back to the matching CoreNdone label.
; With USELODS the key word just consumed is at [si-2]; otherwise si has
; not moved and the word is at the same displacement the multiply used.
106: Core1Z:
107: neg x0 ; x0 = -x0; ZF stays set only if x0 was zero
108: jnz Core1Za ; data word was nonzero: answer is 1 - x0
109: if USELODS
110: sub x0,[si-2] ; data word was zero: x0 = -key word
111: else
112: sub x0,[si]
113: endif
114: Core1Za:
115: inc x0 ; complete the "1 - operand"
116: jmp Core1done
117: Core2Z:
118: neg x3 ; same fix-up for the x3 multiply
119: jnz Core2Za
120: if USELODS
121: sub x3,[si-2]
122: else
123: sub x3,[si+6]
124: endif
125: Core2Za:
126: inc x3
127: jmp Core2done
128: Core3Z:
129: neg x2 ; same fix-up for the x2 multiply
130: jnz Core3Za
131: if USELODS
132: sub x2,[si-2]
133: else
134: sub x2,[si+8]
135: endif
136: Core3Za:
137: inc x2
138: jmp Core3done
139: Core4Z:
140: neg x1 ; same fix-up for the x1 multiply
141: jnz Core4Za
142: if USELODS
143: sub x1,[si-2]
144: else
145: sub x1,[si+10]
146: endif
147: Core4Za:
148: inc x1
149: jmp Core4done
150:
151: ; We need a constant 0 that we can move into a register without affecting
152: ; the carry flag (as the classic xor ax,ax is wont to do), so we use the
153: ; es register for a constant 0 source. This is okay even in protected
154: ; mode. (I *told* you this was tricky code!)
155:
156: ; BTW, since you wanted to know, this is 8 + 78*4 + 16 = 336 instructions.
157:
; Core -- the IDEA block transform on the four words held in x0..x3.
; In:   ds:si -> key words; x0..x3 (bp, bx, cx, di) = input words.
; Out:  bp, bx, cx = first three output words (Finish swaps x1/x2 before the
;       final additions); the fourth output word is left in dx, NOT in x3/di.
;       es = 0.  ax is scratch.
; Flow: Coreloop contains two unrolled rounds.  It is entered once by
;       fall-through and three more times through the Coreloop addresses
;       pushed below, so eight rounds run; the last "ret" pops the address
;       of Finish, which applies the output transformation and returns to
;       the caller.
; Each multiply is: mul; sub ax,dx (low - high); mov reg,es (reg = 0, flags
; untouched); adc reg,ax (adds the borrow back).  A zero difference branches
; to the matching CoreNZ handler.
158: Core proc near
159: xor ax,ax
160: mov es,ax ; es = 0: a zero source that can be moved without touching carry
161: mov ax,OFFSET Finish
162: push ax ; final "ret" of the loop lands in Finish
163: mov ax,OFFSET Coreloop
164: push ax ; Loop 3 more times after the fall-through pass, then "return" to Finish
165: push ax
166: push ax
167:
168: Coreloop:
169: if USELODS
170: lodsw
171: else
172: mov ax,[si] ; x0 *= *key++
173: endif
174: mul x0
175: sub ax,dx ; low word - high word; CF = borrow, ZF = special case
176: jz Core1Z
177: mov x0,es ; x0 = 0 without disturbing CF
178: adc x0,ax ; x0 = (low - high) + borrow
179: Core1done:
180:
181: if USELODS
182: lodsw
183: add x1,ax
184: lodsw
185: add x2,ax
186: else
187: add x1,[si+2] ; x1 += *key++
188: add x2,[si+4] ; x2 += *key++
189: endif
190:
191: if USELODS
192: lodsw
193: else
194: mov ax,[si+6] ; x3 *= *key++
195: endif
196: mul x3
197: sub ax,dx
198: jz Core2Z
199: mov x3,es
200: adc x3,ax
201: Core2done:
202:
203: push x1 ; s1 = x1
204: push x2 ; s2 = x2
205:
206: xor x1,x3 ; x1 ^= x3
207: xor x2,x0 ; x2 ^= x0
208:
209: if USELODS
210: lodsw
211: else
212: mov ax,[si+8] ; x2 *= *key++
213: endif
214: mul x2
215: sub ax,dx
216: jz Core3Z
217: mov x2,es
218: adc x2,ax
219: Core3done:
220:
221: add x1,x2 ; x1 += x2
222:
223: if USELODS
224: lodsw
225: else
226: mov ax,[si+10] ; x1 *= *key++
227: endif
228: mul x1
229: sub ax,dx
230: jz Core4Z
231: mov x1,es
232: adc x1,ax
233: Core4done:
234:
235: add x2,x1 ; x2 += x1
236:
237: xor x0,x1 ; x0 ^= x1
238: xor x3,x2 ; x3 ^= x2
239:
240: pop dx ; s2 was pushed last, so it comes off first
241: xor x1,dx ; x1 ^= s2
242: pop dx
243: xor x2,dx ; x2 ^= s1
244:
245: ; Second unrolling of loop
246: if USELODS
247: lodsw
248: else
249: mov ax,[si+12] ; x0 *= *key++
250: endif
251: mul x0
252: sub ax,dx
253: jz Core5Z
254: mov x0,es
255: adc x0,ax
256: Core5done:
257:
258: if USELODS
259: lodsw
260: add x1,ax
261: lodsw
262: add x2,ax
263: else
264: add x1,[si+14] ; x1 += *key++
265: add x2,[si+16] ; x2 += *key++
266: endif
267:
268: if USELODS
269: lodsw
270: else
271: mov ax,[si+18] ; x3 *= *key++
272: endif
273: mul x3
274: sub ax,dx
275: jz Core6Z
276: mov x3,es
277: adc x3,ax
278: Core6done:
279:
280: push x1 ; s1 = x1
281: push x2 ; s2 = x2
282:
283: xor x1,x3 ; x1 ^= x3
284: xor x2,x0 ; x2 ^= x0
285:
286: if USELODS
287: lodsw
288: else
289: mov ax,[si+20] ; x2 *= *key++
290: endif
291: mul x2
292: sub ax,dx
293: jz Core7Z
294: mov x2,es
295: adc x2,ax
296: Core7done:
297:
298: add x1,x2 ; x1 += x2
299:
300: if USELODS
301: lodsw
302: else
303: mov ax,[si+22] ; x1 *= *key++
304: endif
305: mul x1
306: sub ax,dx
307: jz Core8Z
308: mov x1,es
309: adc x1,ax
310: Core8done:
311:
312: add x2,x1 ; x2 += x1
313:
314: xor x0,x1 ; x0 ^= x1
315: xor x3,x2 ; x3 ^= x2
316:
317: pop dx
318: xor x1,dx ; x1 ^= s2
319: pop dx
320: xor x2,dx ; x2 ^= s1
321:
322: ife USELODS
323: lea si,[si+24] ; non-lodsw build: step past the 12 key words used by the two rounds
324: endif
325:
326: ret ; Used as a loop instruction!
327:
; Zero-difference handlers for the second unrolled round and for Finish.
; Each computes (1 - operand): the data word if it is nonzero, else the key word.
328: Core5Z:
329: neg x0
330: jnz Core5Za
331: if USELODS
332: sub x0,[si-2]
333: else
334: sub x0,[si+12]
335: endif
336: Core5Za:
337: inc x0
338: jmp Core5done
339: Core6Z:
340: neg x3
341: jnz Core6Za
342: if USELODS
343: sub x3,[si-2]
344: else
345: sub x3,[si+18]
346: endif
347: Core6Za:
348: inc x3
349: jmp Core6done
350: Core7Z:
351: neg x2
352: jnz Core7Za
353: if USELODS
354: sub x2,[si-2]
355: else
356: sub x2,[si+20]
357: endif
358: Core7Za:
359: inc x2
360: jmp Core7done
361: Core8Z:
362: neg x1
363: jnz Core8Za
364: if USELODS
365: sub x1,[si-2]
366: else
367: sub x1,[si+22]
368: endif
369: Core8Za:
370: inc x1
371: jmp Core8done
372: Core9Z:
373: neg x0
374: jnz Core9Za
375: if USELODS
376: sub x0,[si-2]
377: else
378: sub x0,[si]
379: endif
380: Core9Za:
381: inc x0
382: jmp Core9done
383: ; Special: compute into dx (zero on entry)
384: Core10Z:
385: sub dx,x3 ; dx = -x3 (dx was loaded with 0 from es before the jz)
386: jnz Core10Za
387: if USELODS
388: sub dx,[si-2]
389: else
390: sub dx,[si+6]
391: endif
392: Core10Za:
393: inc dx
394: ; jmp Core10done
395: ret ; Core10done is itself just a ret, so return from here directly
396:
397:
; Output transformation; reached by the last "ret" of Coreloop.
398: Finish:
399: if USELODS
400: lodsw
401: else
402: mov ax,[si] ; x0 *= *key++
403: endif
404: mul x0
405: sub ax,dx
406: jz Core9Z
407: mov x0,es
408: adc x0,ax
409: Core9done:
410:
411: xchg x1,x2 ; the second output word comes from x2 and the third from x1
412: if USELODS
413: lodsw
414: add x1,ax
415: lodsw
416: add x2,ax
417: else
418: add x1,[si+2] ; x1 += *key++
419: add x2,[si+4] ; x2 += *key++
420: endif
421:
422: ; This is special: compute into dx, not x3
423: if USELODS
424: lodsw
425: else
426: mov ax,[si+6] ; x3 *= *key++
427: endif
428: mul x3
429: sub ax,dx
430: mov dx,es ; dx = 0; mov leaves the flags of the sub intact for jz/adc
431: jz Core10Z
432: adc dx,ax ; fourth output word is returned in dx
433: Core10done:
434:
435: ret
436:
437: endp
438:
439:
440: ; Args are in, out, key
; _Idea2 -- run Core once over a single block of four 16-bit words.
; Far procedure taking three far pointers: in at [bp+6], out at [bp+10],
; key at [bp+14] (offsets relative to bp after "push bp / mov bp,sp").
; Input and output words are byte-swapped on load/store.
; Saves and restores bp, si, di, ds; clears the direction flag.
; Other registers are not preserved (Core trashes them and zeroes es).
441: public _Idea2
442: _Idea2 proc far
443: cld
444: push bp ; Args start at [bp+6]
445: mov bp,sp
446: push si
447: push di
448: push ds ; 6 more bytes here, so args are at [sp+12]
449: lds si,[bp+6] ; in
450: lodsw
451: xchg ah,al
452: mov dx,ax ; park word 0 in dx: bp (x0) is still needed as the frame pointer
453: lodsw
454: xchg ah,al
455: mov x1,ax
456: lodsw
457: xchg ah,al
458: mov x2,ax
459: lodsw
460: xchg ah,al
461: mov x3,ax
462: lds si,[bp+14] ; key
463:
464: mov x0,dx ; last use of the frame pointer is done; bp becomes x0
465:
466: call Core
467:
468: mov ax,x0 ; rescue x0 before bp is reused for addressing
469: mov bp,sp
470: les di,[bp+16] ; out (this overwrites x3/di, see the last store)
471: xchg ah,al
472: stosw
473: mov ax,x1
474: xchg ah,al
475: stosw
476: mov ax,x2
477: xchg ah,al
478: stosw
479: mov ax,dx ; Core leaves the fourth word in dx; x3 (di) now holds the output offset
480: xchg ah,al
481: stosw
482:
483: pop ds
484: pop di
485: pop si
486: pop bp
487:
488: ret
489:
490: endp
491:
492: ; Okay, the basic plan for the CFB kernel is
493: ; get x0,x1,x2,x3
494: ; get key pointer
495: ; call core
496: ; get buffer pointers
497: ;Loop:
498: ; lodsw
499: ; xor ax,x0
500: ; mov x0,ax
501: ; stosw
502: ; lodsw
503: ; xor ax,x1
504: ; mov x1,ax
505: ; stosw
506: ; lodsw
507: ; xor ax,x2
508: ; mov x2,ax
509: ; stosw
510: ; lodsw
511: ; xor ax,x3
512: ; mov x3,ax
513: ; stosw
514: ; push buffer pointers
515: ; get key pointer
516: ; call core
517: ; pop buffer pointers
518: ; loop
519: ; lodsw/xor/etc.
520: ;
521: ;
522: ; This function is designed to go in the middle of a byte-granularity
523: ; CFB engine. It performs "len" encryptions of the IV, encrypting
524: ; 8*(len-1) bytes from the source to the destination. The idea is
525: ; that you first xor any odd leading bytes, then call this function,
526: ; then xor up to 8 trailing bytes.
527:
528: ; The main loop in this is 38 instructions, plus the 336 for the core
529: ; makes 374 total. That's 46.75 instructions per byte.
530: ; (It's the same for IdeaCFBx)
531:
532: ; IV, key, plain, cipher, len
; _IdeaCFB -- CFB-mode encryption kernel.
; Far procedure; after the four pushes below the far-pointer arguments sit
; at [sp+12] IV, [sp+16] key, [sp+20] plain, [sp+24] cipher, and the word
; count len at [sp+28].
; The byte-swapped IV is run through Core; then, while the decremented len
; is nonzero, four source words are XORed with the Core output, stored to
; the destination, and fed back through Core.  The last Core output is
; written back (byte-swapped) over the IV.
; The stack copies of the plain/cipher offsets and of len are modified.
; NOTE(review): len is decremented before it is tested, so a len of 0 wraps
; to 0FFFFh rather than meaning "nothing to do" -- confirm callers never pass 0.
533: public _IdeaCFB
534: _IdeaCFB proc far ; Args are at [sp+4]
535: cld
536: push bp
537: push si
538: push di
539: push ds ; 8 more bytes here, so args are at [sp+12]
540: ; To be precise, IV is at 12, key at 16, plain at 20,
541: ; cipher at 24 and len at 28
542: mov bp,sp
543: lds si,[bp+12] ; IV
544: ; Load and byte-swap IV
545: mov ax,[si] ; word 0 goes via ax because bp (x0) is still the frame pointer
546: xchg ah,al
547: mov x1,[si+2]
548: mov x2,[si+4]
549: xchg bh,bl
550: xchg ch,cl
551: mov dx,[si+6] ; word 3 goes via dx because di (x3) is loaded only after the last use of si/bp
552: xchg dh,dl
553:
554: lds si,[bp+16] ; Key
555: mov x0,ax
556: mov x3,dx
557:
558: call Core
; Re-entered after every Core run with x0, x1, x2 in bp, bx, cx and the fourth word in dx.
559: IdeaCFBLoop:
560: ; mov ax,x0
561: ; mov bp,sp
562: ; dec WORD PTR [bp+28] ; Decrement count
563: ; jz IdeaCFBEnd
564: ; lds si,[bp+20]
565: ; les di,[bp+24]
566: ; mov x0,ax
567: ; Alternate code: (which is faster? Two moves or three segment overrides?)
568: mov si,sp ; bp holds x0, so address the arguments through ss:[si]
569: dec WORD PTR ss:[si+28] ; len--
570: jz IdeaCFBEnd ; si still equals sp when this branch is taken
571: les di,ss:[si+24] ; cipher (destination)
572: lds si,ss:[si+20] ; plain (source); loaded last because it replaces si
573:
574: lodsw
575: xchg ah,al
576: xor ax,x0
577: mov x0,ax ; the ciphertext word becomes the next feedback word
578: xchg ah,al
579: stosw
580: lodsw
581: xchg ah,al
582: xor ax,x1
583: mov x1,ax
584: xchg ah,al
585: stosw
586: lodsw
587: xchg ah,al
588: xor ax,x2
589: mov x2,ax
590: xchg ah,al
591: stosw
592: lodsw
593: xchg ah,al
594: xor ax,dx ; fourth word lives in dx here; di is the destination pointer
595: mov dx,ax
596: xchg ah,al
597: stosw
598:
599: ; mov ax,x0
600: ; mov bp,sp
601: ; mov [bp+20],si ; Save source offset
602: ; mov [bp+24],di ; Save destination offset
603: ; lds si,[bp+16] ; Key
604: ; mov x0,ax ; Get x0 in place for another iteration
605: ; Alternate code for the above: (which is faster? One move or three ss:?)
606: mov ax,si
607: mov si,sp
608: mov ss:[si+20],ax ; save advanced source offset (segment word is left as is)
609: mov ss:[si+24],di ; save advanced destination offset
610: lds si,ss:[si+16] ; key pointer for the next Core run
611:
612: mov x3,dx ; Get x3 in place
613: mov ax,OFFSET IdeaCFBLoop
614: push ax ; hand-built return address so Core "returns" to the loop top
615: jmp Core
616:
617: IdeaCFBEnd:
618: ; lds si,[bp+12]
619: lds di,ss:[si+12] ; Get IV for writing back
620:
621: mov ax,x0
622: xchg ah,al
623: mov [di],ax ; Use stosw?
624: xchg bh,bl
625: xchg ch,cl
626: mov [di+2],x1
627: mov [di+4],x2
628: xchg dh,dl
629: mov [di+6],dx
630:
631: pop ds
632: pop di
633: pop si
634: pop bp
635:
636: ret
637:
638: endp
639:
640: ; This decoding step is similar, except that instead of
641: ; lods
642: ; xor x0,ax
643: ; mov ax,x0
644: ; stos
645: ; the feedback step is
646: ; lods
647: ; xchg x0,ax
648: ; xor ax,x0
649: ; stos
650:
651: ; IV, key, cipher, plain, len
; _IdeaCFBx -- CFB-mode decryption kernel; same frame layout as the
; encryption kernel: after the four pushes the far-pointer arguments sit at
; [sp+12] IV, [sp+16] key, [sp+20] cipher (source), [sp+24] plain
; (destination), and the word count len at [sp+28].
; The byte-swapped IV is run through Core; then, while the decremented len
; is nonzero, each source word replaces the feedback word and the XOR of
; the two is stored to the destination, after which the new feedback words
; go through Core.  The last Core output is written back over the IV.
; The stack copies of the source/destination offsets and of len are modified.
; NOTE(review): len is decremented before it is tested, so a len of 0 wraps
; to 0FFFFh rather than meaning "nothing to do" -- confirm callers never pass 0.
652: public _IdeaCFBx
653: _IdeaCFBx proc far ; Args are at [sp+4]
654: cld
655: push bp
656: push si
657: push di
658: push ds ; 8 more bytes here, so args are at [sp+12]
659: mov bp,sp
660: lds si,[bp+12] ; IV
661: ; Load and byte-swap IV
662: mov ax,[si] ; word 0 goes via ax because bp (x0) is still the frame pointer
663: xchg ah,al
664: mov x1,[si+2]
665: mov x2,[si+4]
666: xchg bh,bl
667: xchg ch,cl
668: mov dx,[si+6]
669: xchg dh,dl
670:
671: lds si,[bp+16] ; Key
672: mov x0,ax
673: mov x3,dx
674:
675: call Core
; Re-entered after every Core run with x0, x1, x2 in bp, bx, cx and the fourth word in dx.
676: IdeaCFBxLoop:
677: ; mov ax,x0
678: ; mov bp,sp
679: ; dec WORD PTR [bp+28] ; Decrement count
680: ; jz IdeaCFBxEnd
681: ; lds si,[bp+20]
682: ; les di,[bp+24]
683: ; mov x0,ax
684: ; Alternate code: (which is faster? Two moves or three segment overrides)
685: mov si,sp ; bp holds x0, so address the arguments through ss:[si]
686: dec WORD PTR ss:[si+28] ; len--
687: jz IdeaCFBxEnd ; si still equals sp when this branch is taken
688: les di,ss:[si+24] ; destination
689: lds si,ss:[si+20] ; source; loaded last because it replaces si
690:
691: lodsw
692: xchg ah,al
693: xchg x0,ax ; x0 = source word (next feedback), ax = previous Core output
694: xor ax,x0 ; ax = Core output ^ source word
695: xchg ah,al
696: stosw
697: lodsw
698: xchg ah,al
699: xchg x1,ax
700: xor ax,x1
701: xchg ah,al
702: stosw
703: lodsw
704: xchg ah,al
705: xchg x2,ax
706: xor ax,x2
707: xchg ah,al
708: stosw
709: lodsw
710: xchg ah,al
711: xchg dx,ax ; fourth word lives in dx here; di is the destination pointer
712: xor ax,dx
713: xchg ah,al
714: stosw
715:
716: ; mov ax,x0
717: ; mov bp,sp
718: ; mov [bp+20],si ; Save source offset
719: ; mov [bp+24],di ; Save destination offset
720: ; lds si,[bp+16] ; Key
721: ; mov x0,ax ; Get x0 in place for another iteration
722: ; Alternate code for the above: (which is faster? One move or three ss:?)
723: mov ax,si
724: mov si,sp
725: mov ss:[si+20],ax ; save advanced source offset (segment word is left as is)
726: mov ss:[si+24],di ; save advanced destination offset
727: lds si,ss:[si+16] ; key pointer for the next Core run
728:
729: mov x3,dx ; Get x3 in place
730: mov ax,OFFSET IdeaCFBxLoop
731: push ax ; hand-built return address so Core "returns" to the loop top
732: jmp Core
733:
734: IdeaCFBxEnd:
735: ; lds si,[bp+12]
736: lds di,ss:[si+12] ; Get IV for writing back
737:
738: mov ax,x0
739: xchg ah,al
740: mov [di],ax ; Use stosw?
741: xchg bh,bl
742: xchg ch,cl
743: mov [di+2],x1
744: mov [di+4],x2
745: xchg dh,dl
746: mov [di+6],dx
747:
748:
749: pop ds
750: pop di
751: pop si
752: pop bp
753:
754: ret
755:
756: endp
757:
758:
759:
760:
761: end
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.