|
|
1.1 root 1: /*
2: ---------------------------------------------------------------------------
3: Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
4:
5: LICENSE TERMS
6:
1.1.1.6 root 7: The free distribution and use of this software is allowed (with or without
8: changes) provided that:
1.1 root 9:
1.1.1.6 root 10: 1. source code distributions include the above copyright notice, this
11: list of conditions and the following disclaimer;
1.1 root 12:
1.1.1.6 root 13: 2. binary distributions include the above copyright notice, this list
14: of conditions and the following disclaimer in their documentation;
15:
16: 3. the name of the copyright holder is not used to endorse products
17: built using this software without specific written permission.
1.1 root 18:
19: DISCLAIMER
20:
21: This software is provided 'as is' with no explicit or implied warranties
22: in respect of its properties, including, but not limited to, correctness
23: and/or fitness for purpose.
24: ---------------------------------------------------------------------------
25: Issue Date: 31/01/2004
26:
27: My thanks to John Viega and David McGrew for their support in developing
28: this code and to David for testing it on a big-endain system.
29: */
30:
31: /*
1.1.1.9 ! root 32: ---------------------------------------------------------------------------
! 33: Portions Copyright (c) 2005 TrueCrypt Developers Association
1.1.1.3 root 34:
1.1.1.9 ! root 35: Changes:
1.1 root 36:
37: - Added multiplication in the finite field GF(2^128) optimized for
38: cases involving a 64-bit operand.
39:
40: - Added multiplication in the finite field GF(2^64).
41:
42: - Added MSB-first mode.
43:
1.1.1.2 root 44: - Added basic test algorithms.
45:
1.1 root 46: - Removed GCM.
1.1.1.9 ! root 47: ---------------------------------------------------------------------------
1.1 root 48: */
49:
50: #include <memory.h>
1.1.1.5 root 51: #include <stdlib.h>
1.1 root 52: #include "GfMul.h"
53: #include "Tcdefs.h"
1.1.1.3 root 54: #include "Common/Endian.h"
1.1 root 55:
56: /* BUFFER_ALIGN32 or BUFFER_ALIGN64 must be defined at this point to */
57: /* enable faster operation by taking advantage of memory aligned values */
58: /* NOTE: the BUFFER_ALIGN64 option has not been tested extensively */
59:
60: #define BUFFER_ALIGN32
61: #define UNROLL_LOOPS /* define to unroll some loops */
62: #define IN_LINES /* define to use inline functions */
63: /* in place of macros */
64:
65: #define mode(x) GM_##x
66:
67: #if defined(__cplusplus)
68: extern "C"
69: {
70: #endif
71:
72: typedef unsigned __int32 mode(32t);
1.1.1.7 root 73: typedef uint64 mode(64t);
1.1 root 74:
75: #define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
76: #define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
77:
78: #if BYTE_ORDER == LITTLE_ENDIAN
79: # define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN
80: #endif
81:
82: #if BYTE_ORDER == BIG_ENDIAN
83: # define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN
84: #endif
85:
86: #ifdef _MSC_VER
87: #pragma intrinsic(memcpy)
88: #define in_line __inline
89: #else
90: #define in_line
91: #endif
92:
93: #if 0 && defined(_MSC_VER)
94: #define rotl32 _lrotl
95: #define rotr32 _lrotr
96: #else
97: #define rotl32(x,n) (((x) << n) | ((x) >> (32 - n)))
98: #define rotr32(x,n) (((x) >> n) | ((x) << (32 - n)))
99: #endif
100:
101: #if !defined(bswap_32)
1.1.1.7 root 102: #define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00))
1.1 root 103: #endif
104:
105: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
106: #define SWAP_BYTES
107: #else
108: #undef SWAP_BYTES
109: #endif
110:
111: #if defined(SWAP_BYTES)
112:
113: #if defined ( IN_LINES )
114:
115: in_line void bsw_32(void * p, unsigned int n)
116: { unsigned int i = n;
117: while(i--)
118: ((mode(32t)*)p)[i] = bswap_32(((mode(32t)*)p)[i]);
119: }
120:
121: #else
122:
123: #define bsw_32(p,n) \
124: { int _i = (n); while(_i--) ((mode(32t)*)p)[_i] = bswap_32(((mode(32t)*)p)[_i]); }
125:
126: #endif
127:
128: #else
129: #define bsw_32(p,n)
130: #endif
131:
132: /* These values are used to detect long word alignment in order */
133: /* to speed up some GCM buffer operations. This facility may */
134: /* not work on some machines */
135:
136: #define lp08(x) ((unsigned char*)(x))
137: #define lp32(x) ((mode(32t)*)(x))
138: #define lp64(x) ((mode(64t)*)(x))
139:
140: #define A32_MASK 3
141: #define A64_MASK 7
142: #define aligned32(x) (!(((mode(32t))(x)) & A32_MASK))
143: #define aligned64(x) (!(((mode(32t))(x)) & A64_MASK))
144:
145: #if defined( BUFFER_ALIGN32 )
146:
147: #define ADR_MASK A32_MASK
148: #define aligned aligned32
149: #define lp lp32
150: #define lp_inc 4
151:
152: #if defined( IN_LINES )
153:
154: in_line void move_block_aligned( void *p, const void *q)
155: {
156: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1],
157: lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3];
158: }
159:
160: in_line void move_block_aligned64( void *p, const void *q)
161: {
162: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1];
163: }
164:
165: in_line void xor_block_aligned( void *p, const void *q)
166: {
167: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1],
168: lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3];
169: }
170:
171: in_line void xor_block_aligned64( void *p, const void *q)
172: {
173: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1];
174: }
175:
176: #else
177:
178: #define move_block_aligned(p,q) \
179: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], \
180: lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3]
181:
182: #define xor_block_aligned(p,q) \
183: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], \
184: lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3]
185:
186: #endif
187:
188: #elif defined( BUFFER_ALIGN64 )
189:
190: #define ADR_MASK A64_MASK
191: #define aligned aligned64
192: #define lp lp64
193: #define lp_inc 8
194:
195: #define move_block_aligned(p,q) \
196: lp64(p)[0] = lp64(q)[0], lp64(p)[1] = lp64(q)[1]
197:
198: #define xor_block_aligned(p,q) \
199: lp64(p)[0] ^= lp64(q)[0], lp64(p)[1] ^= lp64(q)[1]
200:
201: #else
202: #define aligned(x) 0
203: #endif
204:
205: #define move_block(p,q) memcpy((p), (q), BLOCK_LEN)
206:
207: #define xor_block(p,q) \
208: lp08(p)[ 0] ^= lp08(q)[ 0], lp08(p)[ 1] ^= lp08(q)[ 1], \
209: lp08(p)[ 2] ^= lp08(q)[ 2], lp08(p)[ 3] ^= lp08(q)[ 3], \
210: lp08(p)[ 4] ^= lp08(q)[ 4], lp08(p)[ 5] ^= lp08(q)[ 5], \
211: lp08(p)[ 6] ^= lp08(q)[ 6], lp08(p)[ 7] ^= lp08(q)[ 7], \
212: lp08(p)[ 8] ^= lp08(q)[ 8], lp08(p)[ 9] ^= lp08(q)[ 9], \
213: lp08(p)[10] ^= lp08(q)[10], lp08(p)[11] ^= lp08(q)[11], \
214: lp08(p)[12] ^= lp08(q)[12], lp08(p)[13] ^= lp08(q)[13], \
215: lp08(p)[14] ^= lp08(q)[14], lp08(p)[15] ^= lp08(q)[15]
216:
217:
218: #define gf_dat(q) {\
219: q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
220: q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
221: q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
222: q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
223: q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
224: q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
225: q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
226: q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
227: q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
228: q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
229: q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
230: q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
231: q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
232: q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
233: q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
234: q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
235: q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
236: q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
237: q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
238: q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
239: q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
240: q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
241: q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
242: q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
243: q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
244: q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
245: q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
246: q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
247: q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
248: q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
249: q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
250: q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) }
251:
252: /* given the value i in 0..255 as the byte overflow when a a field */
253: /* element in GHASH is multipled by x^8, this function will return */
254: /* the values that are generated in the lo 16-bit word of the field */
255: /* value by applying the modular polynomial. The values lo_byte and */
256: /* hi_byte are returned via the macro xp_fun(lo_byte, hi_byte) so */
257: /* that the values can be assembled into memory as required by a */
258: /* suitable definition of this macro operating on the table above */
259:
260: #define xp(i) xp_fun( \
261: (i & 0x80 ? 0xe1 : 0) ^ (i & 0x40 ? 0x70 : 0) ^ \
262: (i & 0x20 ? 0x38 : 0) ^ (i & 0x10 ? 0x1c : 0) ^ \
263: (i & 0x08 ? 0x0e : 0) ^ (i & 0x04 ? 0x07 : 0) ^ \
264: (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \
265: (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x80 : 0) ^ \
266: (i & 0x20 ? 0x40 : 0) ^ (i & 0x10 ? 0x20 : 0) ^ \
267: (i & 0x08 ? 0x10 : 0) ^ (i & 0x04 ? 0x08 : 0) ^ \
268: (i & 0x02 ? 0x84 : 0) ^ (i & 0x01 ? 0xc2 : 0) )
269:
270: #define xp64(i) xp_fun( \
271: (i & 0x80 ? 0xd8 : 0) ^ (i & 0x40 ? 0x6c : 0) ^ \
272: (i & 0x20 ? 0x36 : 0) ^ (i & 0x10 ? 0x1b : 0) ^ \
273: (i & 0x08 ? 0x0d : 0) ^ (i & 0x04 ? 0x06 : 0) ^ \
274: (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \
275: (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x00 : 0) ^ \
276: (i & 0x20 ? 0x00 : 0) ^ (i & 0x10 ? 0x00 : 0) ^ \
277: (i & 0x08 ? 0x80 : 0) ^ (i & 0x04 ? 0xc0 : 0) ^ \
278: (i & 0x02 ? 0x60 : 0) ^ (i & 0x01 ? 0xb0 : 0) )
279:
280: static mode(32t) gf_poly[2] = { 0, 0xe1000000 };
281: static mode(32t) gf_poly64[2] = { 0, 0xd8000000 };
282:
283: /* Multiply of a GF128 field element by x. The field element */
284: /* is held in an array of bytes in which field bits 8n..8n + 7 */
285: /* are held in byte[n], with lower indexed bits placed in the */
286: /* more numerically significant bit positions in bytes. */
287:
288: /* This function multiples a field element x, in the polynomial */
289: /* field representation. It uses 32-bit word operations to gain */
290: /* speed but compensates for machine endianess and hence works */
291: /* correctly on both styles of machine */
292:
293: in_line void mul_x(mode(32t) x[4])
294: { mode(32t) t;
295:
296: bsw_32(x, 4);
297:
298: /* at this point the filed element bits 0..127 are set out */
299: /* as follows in 32-bit words (where the most significant */
300: /* (ms) numeric bits are to the left) */
301: /* */
302: /* x[0] x[1] x[2] x[3] */
303: /* ms ls ms ls ms ls ms ls */
304: /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */
305:
306: t = gf_poly[x[3] & 1]; /* bit 127 of the element */
307: x[3] = (x[3] >> 1) | (x[2] << 31); /* shift bits up by one */
308: x[2] = (x[2] >> 1) | (x[1] << 31); /* position */
309: x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */
310: x[0] = (x[0] >> 1) ^ t; /* the field polynomial */
311: bsw_32(x, 4);
312: }
313:
314: in_line void mul_x64(mode(32t) x[2])
315: { mode(32t) t;
316:
317: bsw_32(x, 2);
318:
319: /* at this point the filed element bits 0..127 are set out */
320: /* as follows in 32-bit words (where the most significant */
321: /* (ms) numeric bits are to the left) */
322: /* */
323: /* x[0] x[1] x[2] x[3] */
324: /* ms ls ms ls ms ls ms ls */
325: /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */
326:
327: t = gf_poly64[x[1] & 1]; /* bit 127 of the element */
328: /* shift bits up by one */
329: /* position */
330: x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */
331: x[0] = (x[0] >> 1) ^ t; /* the field polynomial */
332: bsw_32(x, 2);
333: }
334:
335: /* Multiply of a GF128 field element by x^8 using 32-bit words */
336: /* for speed - machine endianess matters here */
337:
338: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
339:
340: #define xp_fun(x,y) ((mode(32t))(x)) | (((mode(32t))(y)) << 8)
341: static const unsigned __int16 gft_le[256] = gf_dat(xp);
342: static const unsigned __int16 gft_le64[256] = gf_dat(xp64);
343:
344: in_line void mul_lex8(mode(32t) x[4]) /* mutiply with long words */
345: { mode(32t) t = (x[3] >> 24); /* in little endian format */
346: x[3] = (x[3] << 8) | (x[2] >> 24);
347: x[2] = (x[2] << 8) | (x[1] >> 24);
348: x[1] = (x[1] << 8) | (x[0] >> 24);
349: x[0] = (x[0] << 8) ^ gft_le[t];
350: }
351:
352: in_line void mul_lex8_64(mode(32t) x[2]) /* mutiply with long words */
353: { mode(32t) t = (x[1] >> 24); /* in little endian format */
354: x[1] = (x[1] << 8) | (x[0] >> 24);
355: x[0] = (x[0] << 8) ^ gft_le64[t];
356: }
357:
358: #endif
359:
360: #if 1 || (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
361:
362: #undef xp_fun
363: #define xp_fun(x,y) ((mode(32t))(y)) | (((mode(32t))(x)) << 8)
364: static const unsigned __int16 gft_be[256] = gf_dat(xp);
365: static const unsigned __int16 gft_be64[256] = gf_dat(xp64);
366:
367: in_line void mul_bex8(mode(32t) x[4]) /* mutiply with long words */
368: { mode(32t) t = (x[3] & 0xff); /* in big endian format */
369: x[3] = (x[3] >> 8) | (x[2] << 24);
370: x[2] = (x[2] >> 8) | (x[1] << 24);
371: x[1] = (x[1] >> 8) | (x[0] << 24);
372: x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be[t]) << 16);
373: }
374:
375: in_line void mul_bex8_64(mode(32t) x[2]) /* mutiply with long words */
376: { mode(32t) t = (x[1] & 0xff); /* in big endian format */
377: x[1] = (x[1] >> 8) | (x[0] << 24);
378: x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be64[t]) << 16);
379: }
380:
381: #endif
382:
383: /* hence choose the correct version for the machine endianess */
384:
385: #if PLATFORM_BYTE_ORDER == BRG_BIG_ENDIAN
386: #define mul_x8 mul_bex8
387: #define mul_x8_64 mul_bex8_64
388: #else
389: #define mul_x8 mul_lex8
390: #define mul_x8_64 mul_lex8_64
391: #endif
392:
393: /* different versions of the general gf_mul function are provided */
394: /* here. Sadly none are very fast :-( */
395:
396: void GfMul128 (void *a, const void* b)
397: { mode(32t) r[CBLK_LEN >> 2], p[8][CBLK_LEN >> 2];
398: int i;
399:
400: move_block_aligned(p[0], b);
401: bsw_32(p[0], 4);
402: for(i = 0; i < 7; ++i)
403: {
404: p[i + 1][3] = (p[i][3] >> 1) | (p[i][2] << 31);
405: p[i + 1][2] = (p[i][2] >> 1) | (p[i][1] << 31);
406: p[i + 1][1] = (p[i][1] >> 1) | (p[i][0] << 31);
407: p[i + 1][0] = (p[i][0] >> 1) ^ gf_poly[p[i][3] & 1];
408: }
409:
410: memset(r, 0, CBLK_LEN);
411: for(i = 0; i < 16; ++i)
412: {
413: if(i) mul_bex8(r); /* order is always big endian here */
414:
415: if(((unsigned char*)a)[15 - i] & 0x80)
416: xor_block_aligned(r, p[0]);
417: if(((unsigned char*)a)[15 - i] & 0x40)
418: xor_block_aligned(r, p[1]);
419: if(((unsigned char*)a)[15 - i] & 0x20)
420: xor_block_aligned(r, p[2]);
421: if(((unsigned char*)a)[15 - i] & 0x10)
422: xor_block_aligned(r, p[3]);
423: if(((unsigned char*)a)[15 - i] & 0x08)
424: xor_block_aligned(r, p[4]);
425: if(((unsigned char*)a)[15 - i] & 0x04)
426: xor_block_aligned(r, p[5]);
427: if(((unsigned char*)a)[15 - i] & 0x02)
428: xor_block_aligned(r, p[6]);
429: if(((unsigned char*)a)[15 - i] & 0x01)
430: xor_block_aligned(r, p[7]);
431: }
432: bsw_32(r, 4);
433: move_block_aligned(a, r);
434: }
435:
436: #if defined( UNROLL_LOOPS )
437:
438: #define xor_8k(i) \
439: xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); \
440: xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4])
441:
442:
443: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
444: { unsigned __int32 r[CBLK_LEN >> 2];
445:
446: move_block_aligned(r, ctx->gf_t8k[0][a[0] & 15]);
447: xor_block_aligned(r, ctx->gf_t8k[1][a[0] >> 4]);
448: xor_8k( 1); xor_8k( 2); xor_8k( 3);
449: xor_8k( 4); xor_8k( 5); xor_8k( 6); xor_8k( 7);
450: xor_8k( 8); xor_8k( 9); xor_8k(10); xor_8k(11);
451: xor_8k(12); xor_8k(13); xor_8k(14); xor_8k(15);
452: move_block_aligned(a, r);
453: }
454:
455: #else
456:
457: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
458: { unsigned __int32 r[CBLK_LEN >> 2], *p;
459: int i;
460:
461: p = ctx->gf_t8k[0][a[0] & 15];
462: memcpy(r, p, CBLK_LEN);
463: p = ctx->gf_t8k[1][a[0] >> 4];
464: xor_block_aligned(r, p);
465: for(i = 1; i < CBLK_LEN; ++i)
466: {
467: xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]);
468: xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]);
469: }
470: memcpy(a, r, CBLK_LEN);
471: }
472:
473: #endif
474:
475: void compile_8k_table(unsigned __int8 *a, GfCtx8k *ctx)
476: { int i, j, k;
477:
478: memset(ctx->gf_t8k, 0, 32 * 16 * 16);
479: for(i = 0; i < 2 * CBLK_LEN; ++i)
480: {
481: if(i == 0)
482: {
483: memcpy(ctx->gf_t8k[1][8], a, CBLK_LEN);
484: for(j = 4; j > 0; j >>= 1)
485: {
486: memcpy(ctx->gf_t8k[1][j], ctx->gf_t8k[1][j + j], CBLK_LEN);
487: mul_x(ctx->gf_t8k[1][j]);
488: }
489: memcpy(ctx->gf_t8k[0][8], ctx->gf_t8k[1][1], CBLK_LEN);
490: mul_x(ctx->gf_t8k[0][8]);
491: for(j = 4; j > 0; j >>= 1)
492: {
493: memcpy(ctx->gf_t8k[0][j], ctx->gf_t8k[0][j + j], CBLK_LEN);
494: mul_x(ctx->gf_t8k[0][j]);
495: }
496: }
497: else if(i > 1)
498: for(j = 8; j > 0; j >>= 1)
499: {
500: memcpy(ctx->gf_t8k[i][j], ctx->gf_t8k[i - 2][j], CBLK_LEN);
501: mul_x8(ctx->gf_t8k[i][j]);
502: }
503:
504: for(j = 2; j < 16; j += j)
505: {
506: mode(32t) *pj = ctx->gf_t8k[i][j];
507: mode(32t) *pk = ctx->gf_t8k[i][1];
508: mode(32t) *pl = ctx->gf_t8k[i][j + 1];
509:
510: for(k = 1; k < j; ++k)
511: {
512: *pl++ = pj[0] ^ *pk++;
513: *pl++ = pj[1] ^ *pk++;
514: *pl++ = pj[2] ^ *pk++;
515: *pl++ = pj[3] ^ *pk++;
516: }
517: }
518: }
519: }
520:
521:
522: void compile_4k_table64(unsigned __int8 *a, GfCtx4k64 *ctx)
523: { int i, j, k;
524:
525: memset(ctx->gf_t4k, 0, sizeof(ctx->gf_t4k));
526: for(i = 0; i < 2 * CBLK_LEN8; ++i)
527: {
528: if(i == 0)
529: {
530: memcpy(ctx->gf_t4k[1][8], a, CBLK_LEN8);
531: for(j = 4; j > 0; j >>= 1)
532: {
533: memcpy(ctx->gf_t4k[1][j], ctx->gf_t4k[1][j + j], CBLK_LEN8);
534: mul_x64(ctx->gf_t4k[1][j]);
535: }
536: memcpy(ctx->gf_t4k[0][8], ctx->gf_t4k[1][1], CBLK_LEN8);
537: mul_x64(ctx->gf_t4k[0][8]);
538: for(j = 4; j > 0; j >>= 1)
539: {
540: memcpy(ctx->gf_t4k[0][j], ctx->gf_t4k[0][j + j], CBLK_LEN8);
541: mul_x64(ctx->gf_t4k[0][j]);
542: }
543: }
544: else if(i > 1)
545: for(j = 8; j > 0; j >>= 1)
546: {
547: memcpy(ctx->gf_t4k[i][j], ctx->gf_t4k[i - 2][j], CBLK_LEN8);
548: mul_x8_64(ctx->gf_t4k[i][j]);
549: }
550:
551: for(j = 2; j < 16; j += j)
552: {
553: mode(32t) *pj = ctx->gf_t4k[i][j];
554: mode(32t) *pk = ctx->gf_t4k[i][1];
555: mode(32t) *pl = ctx->gf_t4k[i][j + 1];
556:
557: for(k = 1; k < j; ++k)
558: {
559: *pl++ = pj[0] ^ *pk++;
560: *pl++ = pj[1] ^ *pk++;
561: *pl++ = pj[2] ^ *pk++;
562: *pl++ = pj[3] ^ *pk++;
563: }
564: }
565: }
566: }
567:
568: static int IsBitSet128 (unsigned int bit, unsigned __int8 *a)
569: {
570: return a[(127 - bit) / 8] & (0x80 >> ((127 - bit) % 8));
571: }
572:
573: static int IsBitSet64 (unsigned int bit, unsigned __int8 *a)
574: {
575: return a[(63 - bit) / 8] & (0x80 >> ((63 - bit) % 8));
576: }
577:
578: static void SetBit128 (unsigned int bit, unsigned __int8 *a)
579: {
580: a[(127 - bit) / 8] |= 0x80 >> ((127 - bit) % 8);
581: }
582:
583: static void SetBit64 (unsigned int bit, unsigned __int8 *a)
584: {
585: a[(63 - bit) / 8] |= 0x80 >> ((63 - bit) % 8);
586: }
587:
1.1.1.2 root 588: void MirrorBits128 (unsigned __int8 *a)
1.1 root 589: {
590: unsigned __int8 t[128 / 8];
591: int i;
592: memset (t,0,16);
593: for (i = 0; i < 128; i++)
594: {
595: if (IsBitSet128(i, a))
596: SetBit128 (127 - i, t);
597: }
598: memcpy (a, t, sizeof (t));
599: burn (t,sizeof (t));
600: }
601:
602: void MirrorBits64 (unsigned __int8 *a)
603: {
604: unsigned __int8 t[64 / 8];
605: int i;
606: memset (t,0,8);
607: for (i = 0; i < 64; i++)
608: {
609: if (IsBitSet64(i, a))
610: SetBit64 (63 - i, t);
611: }
612: memcpy (a, t, sizeof (t));
613: burn (t,sizeof (t));
614: }
615:
616: /* Allocate and initialize speed optimization table
617: for multiplication by 64-bit operand in MSB-first mode */
618: int Gf128Tab64Init (unsigned __int8 *a, GfCtx *ctx)
619: {
620: GfCtx8k *ctx8k;
621: unsigned __int8 am[16];
622: int i, j;
623:
624: ctx8k = (GfCtx8k *) TCalloc (sizeof (GfCtx8k));
625: if (!ctx8k)
626: return FALSE;
627:
628: memcpy (am, a, 16);
629: MirrorBits128 (am);
630: compile_8k_table (am, ctx8k);
631:
632: /* Convert 8k LSB-first table to 4k MSB-first */
633: for (i = 16; i < 32; i++)
634: {
635: for (j = 0; j < 16; j++)
636: {
637: int jm = 0;
638: jm |= (j & 0x1) << 3;
639: jm |= (j & 0x2) << 1;
640: jm |= (j & 0x4) >> 1;
641: jm |= (j & 0x8) >> 3;
642:
643: memcpy (&ctx->gf_t128[i-16][jm], (unsigned char *)&ctx8k->gf_t8k[31-i][j], 16);
644: MirrorBits128 ((unsigned char *)&ctx->gf_t128[i-16][jm]);
645: }
646: }
647:
648: burn (ctx8k ,sizeof (*ctx8k));
649: burn (am, sizeof (am));
650: TCfree (ctx8k);
651: return TRUE;
652: }
653:
654: int Gf64TabInit (unsigned __int8 *a, GfCtx *ctx)
655: {
1.1.1.4 root 656: /* Deprecated/legacy */
657:
1.1 root 658: GfCtx4k64 *ctx4k;
659: unsigned __int8 am[8];
660: int i, j;
661:
662: ctx4k = (GfCtx4k64 *) TCalloc (sizeof (GfCtx4k64));
663: if (!ctx4k)
664: return FALSE;
665:
666: memcpy (am, a, 8);
667: MirrorBits64 (am);
668: compile_4k_table64 (am, ctx4k);
669:
670: /* Convert LSB-first table to MSB-first */
671: for (i = 0; i < 16; i++)
672: {
673: for (j = 0; j < 16; j++)
674: {
675: int jm = 0;
676: jm |= (j & 0x1) << 3;
677: jm |= (j & 0x2) << 1;
678: jm |= (j & 0x4) >> 1;
679: jm |= (j & 0x8) >> 3;
680:
681: memcpy (&ctx->gf_t64[i][jm], (unsigned char *)&ctx4k->gf_t4k[15-i][j], 8);
682: MirrorBits64 ((unsigned char *)&ctx->gf_t64[i][jm]);
683: }
684: }
685:
686: burn (ctx4k,sizeof (*ctx4k));
687: burn (am, sizeof (am));
688: TCfree (ctx4k);
689: return TRUE;
690: }
691:
692: #define xor_8kt64(i) \
693: xor_block_aligned(r, ctx->gf_t128[i + i][a[i] & 15]); \
694: xor_block_aligned(r, ctx->gf_t128[i + i + 1][a[i] >> 4])
695:
696: /* Multiply a 128-bit number by a 64-bit number in the finite field GF(2^128) */
697: void Gf128MulBy64Tab (unsigned __int8 a[8], unsigned __int8 p[16], GfCtx *ctx)
698: {
699: unsigned __int32 r[CBLK_LEN >> 2];
700:
701: move_block_aligned(r, ctx->gf_t128[7*2][a[7] & 15]);
702: xor_block_aligned(r, ctx->gf_t128[7*2+1][a[7] >> 4]);
703:
704: if (*(unsigned __int16 *)a)
705: {
706: xor_8kt64(0);
707: xor_8kt64(1);
708: }
709: if (a[2])
710: {
711: xor_8kt64(2);
712: }
713: xor_8kt64(3);
714: xor_8kt64(4);
715: xor_8kt64(5);
716: xor_8kt64(6);
717:
718: move_block_aligned(p, r);
719: }
720:
721: #define xor_8k64(i) \
722: xor_block_aligned64(r, ctx->gf_t64[i + i][a[i] & 15]); \
723: xor_block_aligned64(r, ctx->gf_t64[i + i + 1][a[i] >> 4])
724:
725: /* Multiply two 64-bit numbers in the finite field GF(2^64) */
726: void Gf64MulTab (unsigned char a[8], unsigned char p[8], GfCtx *ctx)
727: {
1.1.1.4 root 728: /* Deprecated/legacy */
729:
1.1 root 730: unsigned __int32 r[CBLK_LEN8 >> 2];
731:
732: move_block_aligned64(r, ctx->gf_t64[7*2][a[7] & 15]);
733: xor_block_aligned64(r, ctx->gf_t64[7*2+1][a[7] >> 4]);
734:
735: if (*(unsigned __int16 *)a)
736: {
737: xor_8k64(0);
738: xor_8k64(1);
739: }
740: if (a[2])
741: {
742: xor_8k64(2);
743: }
744: xor_8k64(3);
745: xor_8k64(4);
746: xor_8k64(5);
747: xor_8k64(6);
748:
749: move_block_aligned64(p, r);
750: }
751:
752:
753: /* Basic algorithms for testing of optimized algorithms */
754:
1.1.1.7 root 755: static void xor128 (uint64 *a, uint64 *b)
1.1 root 756: {
757: *a++ ^= *b++;
758: *a ^= *b;
759: }
760:
761: static void shl128 (unsigned __int8 *a)
762: {
763: int i, x = 0, xx;
764: for (i = 15; i >= 0; i--)
765: {
766: xx = (a[i] & 0x80) >> 7;
1.1.1.8 root 767: a[i] = (char) ((a[i] << 1) | x);
1.1 root 768: x = xx;
769: }
770: }
771:
772: static void GfMul128Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8 *p)
773: {
774: int i;
775: unsigned __int8 la[16];
776: memcpy (la, a, 16);
777: memset (p, 0, 16);
778:
779: for (i = 0; i < 128; i++)
780: {
781: if (IsBitSet128 (i, b))
1.1.1.7 root 782: xor128 ((uint64 *)p, (uint64 *)la);
1.1 root 783:
784: if (la[0] & 0x80)
785: {
786: shl128 (la);
787: la[15] ^= 0x87;
788: }
789: else
790: {
791: shl128 (la);
792: }
793: }
794: }
795:
1.1.1.7 root 796: static void xor64 (uint64 *a, uint64 *b)
1.1 root 797: {
798: *a ^= *b;
799: }
800:
801: static void shl64 (unsigned __int8 *a)
802: {
803: int i, x = 0, xx;
804: for (i = 7; i >= 0; i--)
805: {
806: xx = (a[i] & 0x80) >> 7;
1.1.1.8 root 807: a[i] = (char) ((a[i] << 1) | x);
1.1 root 808: x = xx;
809: }
810: }
811:
812: static void GfMul64Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8* p)
813: {
1.1.1.4 root 814: /* Deprecated/legacy */
815:
1.1 root 816: int i;
817: unsigned __int8 la[8];
818: memcpy (la, a, 8);
819: memset (p, 0, 8);
820:
821: for (i = 0; i < 64; i++)
822: {
823: if (IsBitSet64 (i, b))
1.1.1.7 root 824: xor64 ((uint64 *)p, (uint64 *)la);
1.1 root 825:
826: if (la[0] & 0x80)
827: {
828: shl64 (la);
829: la[7] ^= 0x1b;
830: }
831: else
832: {
833: shl64 (la);
834: }
835: }
836: }
837:
838:
839: BOOL GfMulSelfTest ()
840: {
841: BOOL result = TRUE;
842: unsigned __int8 a[16];
843: unsigned __int8 b[16];
844: unsigned __int8 p1[16];
845: unsigned __int8 p2[16];
846: GfCtx *gfCtx = (GfCtx *) TCalloc (sizeof (GfCtx));
847: int i, j;
848:
849: if (!gfCtx)
850: return FALSE;
851:
1.1.1.4 root 852: /* GF(2^64) - deprecated/legacy */
1.1 root 853: for (i = 0; i < 0x100; i++)
854: {
855: for (j = 0; j < 8; j++)
856: {
857: a[j] = (unsigned __int8) i;
858: b[j] = a[j] ^ 0xff;
859: }
860:
861: GfMul64Basic (a, b, p1);
862:
863: Gf64TabInit (a, gfCtx);
864: Gf64MulTab (b, p2, gfCtx);
865:
866: if (memcmp (p1, p2, 8) != 0)
867: result = FALSE;
868: }
869:
870: /* GF(2^128) */
871: for (i = 0; i < 0x100; i++)
872: {
873: for (j = 0; j < 16; j++)
874: {
875: a[j] = (unsigned __int8) i;
876: b[j] = j < 8 ? 0 : a[j] ^ 0xff;
877: }
878:
879: GfMul128Basic (a, b, p1);
880:
881: Gf128Tab64Init (a, gfCtx);
882: Gf128MulBy64Tab (b + 8, p2, gfCtx);
883:
884: if (memcmp (p1, p2, 16) != 0)
885: result = FALSE;
886: }
887:
888: TCfree (gfCtx);
889: return result;
890: }
891:
892: #if defined(__cplusplus)
893: }
894: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.