|
|
1.1 root 1: /*
2: ---------------------------------------------------------------------------
3: Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
4:
5: LICENSE TERMS
6:
7: The free distribution and use of this software in both source and binary
8: form is allowed (with or without changes) provided that:
9:
10: 1. distributions of this source code include the above copyright
11: notice, this list of conditions and the following disclaimer;
12:
13: 2. distributions in binary form include the above copyright
14: notice, this list of conditions and the following disclaimer
15: in the documentation and/or other associated materials;
16:
17: 3. the copyright holder's name is not used to endorse products
18: built using this software without specific written permission.
19:
20: ALTERNATIVELY, provided that this notice is retained in full, this product
21: may be distributed under the terms of the GNU General Public License (GPL),
22: in which case the provisions of the GPL apply INSTEAD OF those given above.
23:
24: DISCLAIMER
25:
26: This software is provided 'as is' with no explicit or implied warranties
27: in respect of its properties, including, but not limited to, correctness
28: and/or fitness for purpose.
29: ---------------------------------------------------------------------------
30: Issue Date: 31/01/2004
31:
32: My thanks to John Viega and David McGrew for their support in developing
33: this code and to David for testing it on a big-endain system.
34: */
35:
36: /*
1.1.1.5 ! root 37: Portions Copyright (c) 2005 TrueCrypt Foundation
1.1.1.3 root 38:
1.1 root 39: TrueCrypt Foundation made the following changes:
40:
41: - Added multiplication in the finite field GF(2^128) optimized for
42: cases involving a 64-bit operand.
43:
44: - Added multiplication in the finite field GF(2^64).
45:
46: - Added MSB-first mode.
47:
1.1.1.2 root 48: - Added basic test algorithms.
49:
1.1 root 50: - Removed GCM.
51: */
52:
53: #ifdef LINUX_DRIVER
54: #include <linux/module.h>
55: #include <linux/string.h>
56: #else
57: #include <memory.h>
1.1.1.5 ! root 58: #include <stdlib.h>
1.1 root 59: #endif
60:
61: #include "GfMul.h"
62: #include "Tcdefs.h"
1.1.1.3 root 63: #include "Common/Endian.h"
1.1 root 64:
65: /* BUFFER_ALIGN32 or BUFFER_ALIGN64 must be defined at this point to */
66: /* enable faster operation by taking advantage of memory aligned values */
67: /* NOTE: the BUFFER_ALIGN64 option has not been tested extensively */
68:
69: #define BUFFER_ALIGN32
70: #define UNROLL_LOOPS /* define to unroll some loops */
71: #define IN_LINES /* define to use inline functions */
72: /* in place of macros */
73:
74: #define mode(x) GM_##x
75:
76: #if defined(__cplusplus)
77: extern "C"
78: {
79: #endif
80:
81: typedef unsigned __int32 mode(32t);
82: typedef unsigned __int64 mode(64t);
83:
84: #define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
85: #define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
86:
87: #if BYTE_ORDER == LITTLE_ENDIAN
88: # define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN
89: #endif
90:
91: #if BYTE_ORDER == BIG_ENDIAN
92: # define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN
93: #endif
94:
95: #ifdef _MSC_VER
96: #pragma intrinsic(memcpy)
97: #define in_line __inline
98: #else
99: #define in_line
100: #endif
101:
102: #if 0 && defined(_MSC_VER)
103: #define rotl32 _lrotl
104: #define rotr32 _lrotr
105: #else
106: #define rotl32(x,n) (((x) << n) | ((x) >> (32 - n)))
107: #define rotr32(x,n) (((x) >> n) | ((x) << (32 - n)))
108: #endif
109:
110: #if !defined(bswap_32)
111: #define bswap_32(x) (rotr32((x), 24) & 0x00ff00ff | rotr32((x), 8) & 0xff00ff00)
112: #endif
113:
114: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
115: #define SWAP_BYTES
116: #else
117: #undef SWAP_BYTES
118: #endif
119:
120: #if defined(SWAP_BYTES)
121:
122: #if defined ( IN_LINES )
123:
124: in_line void bsw_32(void * p, unsigned int n)
125: { unsigned int i = n;
126: while(i--)
127: ((mode(32t)*)p)[i] = bswap_32(((mode(32t)*)p)[i]);
128: }
129:
130: #else
131:
132: #define bsw_32(p,n) \
133: { int _i = (n); while(_i--) ((mode(32t)*)p)[_i] = bswap_32(((mode(32t)*)p)[_i]); }
134:
135: #endif
136:
137: #else
138: #define bsw_32(p,n)
139: #endif
140:
141: /* These values are used to detect long word alignment in order */
142: /* to speed up some GCM buffer operations. This facility may */
143: /* not work on some machines */
144:
145: #define lp08(x) ((unsigned char*)(x))
146: #define lp32(x) ((mode(32t)*)(x))
147: #define lp64(x) ((mode(64t)*)(x))
148:
149: #define A32_MASK 3
150: #define A64_MASK 7
151: #define aligned32(x) (!(((mode(32t))(x)) & A32_MASK))
152: #define aligned64(x) (!(((mode(32t))(x)) & A64_MASK))
153:
154: #if defined( BUFFER_ALIGN32 )
155:
156: #define ADR_MASK A32_MASK
157: #define aligned aligned32
158: #define lp lp32
159: #define lp_inc 4
160:
161: #if defined( IN_LINES )
162:
163: in_line void move_block_aligned( void *p, const void *q)
164: {
165: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1],
166: lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3];
167: }
168:
169: in_line void move_block_aligned64( void *p, const void *q)
170: {
171: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1];
172: }
173:
174: in_line void xor_block_aligned( void *p, const void *q)
175: {
176: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1],
177: lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3];
178: }
179:
180: in_line void xor_block_aligned64( void *p, const void *q)
181: {
182: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1];
183: }
184:
185: #else
186:
187: #define move_block_aligned(p,q) \
188: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], \
189: lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3]
190:
191: #define xor_block_aligned(p,q) \
192: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], \
193: lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3]
194:
195: #endif
196:
197: #elif defined( BUFFER_ALIGN64 )
198:
199: #define ADR_MASK A64_MASK
200: #define aligned aligned64
201: #define lp lp64
202: #define lp_inc 8
203:
204: #define move_block_aligned(p,q) \
205: lp64(p)[0] = lp64(q)[0], lp64(p)[1] = lp64(q)[1]
206:
207: #define xor_block_aligned(p,q) \
208: lp64(p)[0] ^= lp64(q)[0], lp64(p)[1] ^= lp64(q)[1]
209:
210: #else
211: #define aligned(x) 0
212: #endif
213:
214: #define move_block(p,q) memcpy((p), (q), BLOCK_LEN)
215:
216: #define xor_block(p,q) \
217: lp08(p)[ 0] ^= lp08(q)[ 0], lp08(p)[ 1] ^= lp08(q)[ 1], \
218: lp08(p)[ 2] ^= lp08(q)[ 2], lp08(p)[ 3] ^= lp08(q)[ 3], \
219: lp08(p)[ 4] ^= lp08(q)[ 4], lp08(p)[ 5] ^= lp08(q)[ 5], \
220: lp08(p)[ 6] ^= lp08(q)[ 6], lp08(p)[ 7] ^= lp08(q)[ 7], \
221: lp08(p)[ 8] ^= lp08(q)[ 8], lp08(p)[ 9] ^= lp08(q)[ 9], \
222: lp08(p)[10] ^= lp08(q)[10], lp08(p)[11] ^= lp08(q)[11], \
223: lp08(p)[12] ^= lp08(q)[12], lp08(p)[13] ^= lp08(q)[13], \
224: lp08(p)[14] ^= lp08(q)[14], lp08(p)[15] ^= lp08(q)[15]
225:
226:
227: #define gf_dat(q) {\
228: q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
229: q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
230: q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
231: q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
232: q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
233: q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
234: q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
235: q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
236: q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
237: q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
238: q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
239: q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
240: q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
241: q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
242: q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
243: q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
244: q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
245: q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
246: q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
247: q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
248: q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
249: q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
250: q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
251: q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
252: q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
253: q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
254: q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
255: q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
256: q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
257: q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
258: q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
259: q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) }
260:
261: /* given the value i in 0..255 as the byte overflow when a a field */
262: /* element in GHASH is multipled by x^8, this function will return */
263: /* the values that are generated in the lo 16-bit word of the field */
264: /* value by applying the modular polynomial. The values lo_byte and */
265: /* hi_byte are returned via the macro xp_fun(lo_byte, hi_byte) so */
266: /* that the values can be assembled into memory as required by a */
267: /* suitable definition of this macro operating on the table above */
268:
269: #define xp(i) xp_fun( \
270: (i & 0x80 ? 0xe1 : 0) ^ (i & 0x40 ? 0x70 : 0) ^ \
271: (i & 0x20 ? 0x38 : 0) ^ (i & 0x10 ? 0x1c : 0) ^ \
272: (i & 0x08 ? 0x0e : 0) ^ (i & 0x04 ? 0x07 : 0) ^ \
273: (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \
274: (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x80 : 0) ^ \
275: (i & 0x20 ? 0x40 : 0) ^ (i & 0x10 ? 0x20 : 0) ^ \
276: (i & 0x08 ? 0x10 : 0) ^ (i & 0x04 ? 0x08 : 0) ^ \
277: (i & 0x02 ? 0x84 : 0) ^ (i & 0x01 ? 0xc2 : 0) )
278:
279: #define xp64(i) xp_fun( \
280: (i & 0x80 ? 0xd8 : 0) ^ (i & 0x40 ? 0x6c : 0) ^ \
281: (i & 0x20 ? 0x36 : 0) ^ (i & 0x10 ? 0x1b : 0) ^ \
282: (i & 0x08 ? 0x0d : 0) ^ (i & 0x04 ? 0x06 : 0) ^ \
283: (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \
284: (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x00 : 0) ^ \
285: (i & 0x20 ? 0x00 : 0) ^ (i & 0x10 ? 0x00 : 0) ^ \
286: (i & 0x08 ? 0x80 : 0) ^ (i & 0x04 ? 0xc0 : 0) ^ \
287: (i & 0x02 ? 0x60 : 0) ^ (i & 0x01 ? 0xb0 : 0) )
288:
289: static mode(32t) gf_poly[2] = { 0, 0xe1000000 };
290: static mode(32t) gf_poly64[2] = { 0, 0xd8000000 };
291:
292: /* Multiply of a GF128 field element by x. The field element */
293: /* is held in an array of bytes in which field bits 8n..8n + 7 */
294: /* are held in byte[n], with lower indexed bits placed in the */
295: /* more numerically significant bit positions in bytes. */
296:
297: /* This function multiples a field element x, in the polynomial */
298: /* field representation. It uses 32-bit word operations to gain */
299: /* speed but compensates for machine endianess and hence works */
300: /* correctly on both styles of machine */
301:
302: in_line void mul_x(mode(32t) x[4])
303: { mode(32t) t;
304:
305: bsw_32(x, 4);
306:
307: /* at this point the filed element bits 0..127 are set out */
308: /* as follows in 32-bit words (where the most significant */
309: /* (ms) numeric bits are to the left) */
310: /* */
311: /* x[0] x[1] x[2] x[3] */
312: /* ms ls ms ls ms ls ms ls */
313: /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */
314:
315: t = gf_poly[x[3] & 1]; /* bit 127 of the element */
316: x[3] = (x[3] >> 1) | (x[2] << 31); /* shift bits up by one */
317: x[2] = (x[2] >> 1) | (x[1] << 31); /* position */
318: x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */
319: x[0] = (x[0] >> 1) ^ t; /* the field polynomial */
320: bsw_32(x, 4);
321: }
322:
323: in_line void mul_x64(mode(32t) x[2])
324: { mode(32t) t;
325:
326: bsw_32(x, 2);
327:
328: /* at this point the filed element bits 0..127 are set out */
329: /* as follows in 32-bit words (where the most significant */
330: /* (ms) numeric bits are to the left) */
331: /* */
332: /* x[0] x[1] x[2] x[3] */
333: /* ms ls ms ls ms ls ms ls */
334: /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */
335:
336: t = gf_poly64[x[1] & 1]; /* bit 127 of the element */
337: /* shift bits up by one */
338: /* position */
339: x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */
340: x[0] = (x[0] >> 1) ^ t; /* the field polynomial */
341: bsw_32(x, 2);
342: }
343:
344: /* Multiply of a GF128 field element by x^8 using 32-bit words */
345: /* for speed - machine endianess matters here */
346:
347: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
348:
349: #define xp_fun(x,y) ((mode(32t))(x)) | (((mode(32t))(y)) << 8)
350: static const unsigned __int16 gft_le[256] = gf_dat(xp);
351: static const unsigned __int16 gft_le64[256] = gf_dat(xp64);
352:
353: in_line void mul_lex8(mode(32t) x[4]) /* mutiply with long words */
354: { mode(32t) t = (x[3] >> 24); /* in little endian format */
355: x[3] = (x[3] << 8) | (x[2] >> 24);
356: x[2] = (x[2] << 8) | (x[1] >> 24);
357: x[1] = (x[1] << 8) | (x[0] >> 24);
358: x[0] = (x[0] << 8) ^ gft_le[t];
359: }
360:
361: in_line void mul_lex8_64(mode(32t) x[2]) /* mutiply with long words */
362: { mode(32t) t = (x[1] >> 24); /* in little endian format */
363: x[1] = (x[1] << 8) | (x[0] >> 24);
364: x[0] = (x[0] << 8) ^ gft_le64[t];
365: }
366:
367: #endif
368:
369: #if 1 || (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
370:
371: #undef xp_fun
372: #define xp_fun(x,y) ((mode(32t))(y)) | (((mode(32t))(x)) << 8)
373: static const unsigned __int16 gft_be[256] = gf_dat(xp);
374: static const unsigned __int16 gft_be64[256] = gf_dat(xp64);
375:
376: in_line void mul_bex8(mode(32t) x[4]) /* mutiply with long words */
377: { mode(32t) t = (x[3] & 0xff); /* in big endian format */
378: x[3] = (x[3] >> 8) | (x[2] << 24);
379: x[2] = (x[2] >> 8) | (x[1] << 24);
380: x[1] = (x[1] >> 8) | (x[0] << 24);
381: x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be[t]) << 16);
382: }
383:
384: in_line void mul_bex8_64(mode(32t) x[2]) /* mutiply with long words */
385: { mode(32t) t = (x[1] & 0xff); /* in big endian format */
386: x[1] = (x[1] >> 8) | (x[0] << 24);
387: x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be64[t]) << 16);
388: }
389:
390: #endif
391:
392: /* hence choose the correct version for the machine endianess */
393:
394: #if PLATFORM_BYTE_ORDER == BRG_BIG_ENDIAN
395: #define mul_x8 mul_bex8
396: #define mul_x8_64 mul_bex8_64
397: #else
398: #define mul_x8 mul_lex8
399: #define mul_x8_64 mul_lex8_64
400: #endif
401:
402: /* different versions of the general gf_mul function are provided */
403: /* here. Sadly none are very fast :-( */
404:
405: void GfMul128 (void *a, const void* b)
406: { mode(32t) r[CBLK_LEN >> 2], p[8][CBLK_LEN >> 2];
407: int i;
408:
409: move_block_aligned(p[0], b);
410: bsw_32(p[0], 4);
411: for(i = 0; i < 7; ++i)
412: {
413: p[i + 1][3] = (p[i][3] >> 1) | (p[i][2] << 31);
414: p[i + 1][2] = (p[i][2] >> 1) | (p[i][1] << 31);
415: p[i + 1][1] = (p[i][1] >> 1) | (p[i][0] << 31);
416: p[i + 1][0] = (p[i][0] >> 1) ^ gf_poly[p[i][3] & 1];
417: }
418:
419: memset(r, 0, CBLK_LEN);
420: for(i = 0; i < 16; ++i)
421: {
422: if(i) mul_bex8(r); /* order is always big endian here */
423:
424: if(((unsigned char*)a)[15 - i] & 0x80)
425: xor_block_aligned(r, p[0]);
426: if(((unsigned char*)a)[15 - i] & 0x40)
427: xor_block_aligned(r, p[1]);
428: if(((unsigned char*)a)[15 - i] & 0x20)
429: xor_block_aligned(r, p[2]);
430: if(((unsigned char*)a)[15 - i] & 0x10)
431: xor_block_aligned(r, p[3]);
432: if(((unsigned char*)a)[15 - i] & 0x08)
433: xor_block_aligned(r, p[4]);
434: if(((unsigned char*)a)[15 - i] & 0x04)
435: xor_block_aligned(r, p[5]);
436: if(((unsigned char*)a)[15 - i] & 0x02)
437: xor_block_aligned(r, p[6]);
438: if(((unsigned char*)a)[15 - i] & 0x01)
439: xor_block_aligned(r, p[7]);
440: }
441: bsw_32(r, 4);
442: move_block_aligned(a, r);
443: }
444:
445: #if defined( UNROLL_LOOPS )
446:
447: #define xor_8k(i) \
448: xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); \
449: xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4])
450:
451:
452: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
453: { unsigned __int32 r[CBLK_LEN >> 2];
454:
455: move_block_aligned(r, ctx->gf_t8k[0][a[0] & 15]);
456: xor_block_aligned(r, ctx->gf_t8k[1][a[0] >> 4]);
457: xor_8k( 1); xor_8k( 2); xor_8k( 3);
458: xor_8k( 4); xor_8k( 5); xor_8k( 6); xor_8k( 7);
459: xor_8k( 8); xor_8k( 9); xor_8k(10); xor_8k(11);
460: xor_8k(12); xor_8k(13); xor_8k(14); xor_8k(15);
461: move_block_aligned(a, r);
462: }
463:
464: #else
465:
466: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
467: { unsigned __int32 r[CBLK_LEN >> 2], *p;
468: int i;
469:
470: p = ctx->gf_t8k[0][a[0] & 15];
471: memcpy(r, p, CBLK_LEN);
472: p = ctx->gf_t8k[1][a[0] >> 4];
473: xor_block_aligned(r, p);
474: for(i = 1; i < CBLK_LEN; ++i)
475: {
476: xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]);
477: xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]);
478: }
479: memcpy(a, r, CBLK_LEN);
480: }
481:
482: #endif
483:
484: void compile_8k_table(unsigned __int8 *a, GfCtx8k *ctx)
485: { int i, j, k;
486:
487: memset(ctx->gf_t8k, 0, 32 * 16 * 16);
488: for(i = 0; i < 2 * CBLK_LEN; ++i)
489: {
490: if(i == 0)
491: {
492: memcpy(ctx->gf_t8k[1][8], a, CBLK_LEN);
493: for(j = 4; j > 0; j >>= 1)
494: {
495: memcpy(ctx->gf_t8k[1][j], ctx->gf_t8k[1][j + j], CBLK_LEN);
496: mul_x(ctx->gf_t8k[1][j]);
497: }
498: memcpy(ctx->gf_t8k[0][8], ctx->gf_t8k[1][1], CBLK_LEN);
499: mul_x(ctx->gf_t8k[0][8]);
500: for(j = 4; j > 0; j >>= 1)
501: {
502: memcpy(ctx->gf_t8k[0][j], ctx->gf_t8k[0][j + j], CBLK_LEN);
503: mul_x(ctx->gf_t8k[0][j]);
504: }
505: }
506: else if(i > 1)
507: for(j = 8; j > 0; j >>= 1)
508: {
509: memcpy(ctx->gf_t8k[i][j], ctx->gf_t8k[i - 2][j], CBLK_LEN);
510: mul_x8(ctx->gf_t8k[i][j]);
511: }
512:
513: for(j = 2; j < 16; j += j)
514: {
515: mode(32t) *pj = ctx->gf_t8k[i][j];
516: mode(32t) *pk = ctx->gf_t8k[i][1];
517: mode(32t) *pl = ctx->gf_t8k[i][j + 1];
518:
519: for(k = 1; k < j; ++k)
520: {
521: *pl++ = pj[0] ^ *pk++;
522: *pl++ = pj[1] ^ *pk++;
523: *pl++ = pj[2] ^ *pk++;
524: *pl++ = pj[3] ^ *pk++;
525: }
526: }
527: }
528: }
529:
530:
531: void compile_4k_table64(unsigned __int8 *a, GfCtx4k64 *ctx)
532: { int i, j, k;
533:
534: memset(ctx->gf_t4k, 0, sizeof(ctx->gf_t4k));
535: for(i = 0; i < 2 * CBLK_LEN8; ++i)
536: {
537: if(i == 0)
538: {
539: memcpy(ctx->gf_t4k[1][8], a, CBLK_LEN8);
540: for(j = 4; j > 0; j >>= 1)
541: {
542: memcpy(ctx->gf_t4k[1][j], ctx->gf_t4k[1][j + j], CBLK_LEN8);
543: mul_x64(ctx->gf_t4k[1][j]);
544: }
545: memcpy(ctx->gf_t4k[0][8], ctx->gf_t4k[1][1], CBLK_LEN8);
546: mul_x64(ctx->gf_t4k[0][8]);
547: for(j = 4; j > 0; j >>= 1)
548: {
549: memcpy(ctx->gf_t4k[0][j], ctx->gf_t4k[0][j + j], CBLK_LEN8);
550: mul_x64(ctx->gf_t4k[0][j]);
551: }
552: }
553: else if(i > 1)
554: for(j = 8; j > 0; j >>= 1)
555: {
556: memcpy(ctx->gf_t4k[i][j], ctx->gf_t4k[i - 2][j], CBLK_LEN8);
557: mul_x8_64(ctx->gf_t4k[i][j]);
558: }
559:
560: for(j = 2; j < 16; j += j)
561: {
562: mode(32t) *pj = ctx->gf_t4k[i][j];
563: mode(32t) *pk = ctx->gf_t4k[i][1];
564: mode(32t) *pl = ctx->gf_t4k[i][j + 1];
565:
566: for(k = 1; k < j; ++k)
567: {
568: *pl++ = pj[0] ^ *pk++;
569: *pl++ = pj[1] ^ *pk++;
570: *pl++ = pj[2] ^ *pk++;
571: *pl++ = pj[3] ^ *pk++;
572: }
573: }
574: }
575: }
576:
577: static int IsBitSet128 (unsigned int bit, unsigned __int8 *a)
578: {
579: return a[(127 - bit) / 8] & (0x80 >> ((127 - bit) % 8));
580: }
581:
582: static int IsBitSet64 (unsigned int bit, unsigned __int8 *a)
583: {
584: return a[(63 - bit) / 8] & (0x80 >> ((63 - bit) % 8));
585: }
586:
587: static void SetBit128 (unsigned int bit, unsigned __int8 *a)
588: {
589: a[(127 - bit) / 8] |= 0x80 >> ((127 - bit) % 8);
590: }
591:
592: static void SetBit64 (unsigned int bit, unsigned __int8 *a)
593: {
594: a[(63 - bit) / 8] |= 0x80 >> ((63 - bit) % 8);
595: }
596:
1.1.1.2 root 597: void MirrorBits128 (unsigned __int8 *a)
1.1 root 598: {
599: unsigned __int8 t[128 / 8];
600: int i;
601: memset (t,0,16);
602: for (i = 0; i < 128; i++)
603: {
604: if (IsBitSet128(i, a))
605: SetBit128 (127 - i, t);
606: }
607: memcpy (a, t, sizeof (t));
608: burn (t,sizeof (t));
609: }
610:
611: void MirrorBits64 (unsigned __int8 *a)
612: {
613: unsigned __int8 t[64 / 8];
614: int i;
615: memset (t,0,8);
616: for (i = 0; i < 64; i++)
617: {
618: if (IsBitSet64(i, a))
619: SetBit64 (63 - i, t);
620: }
621: memcpy (a, t, sizeof (t));
622: burn (t,sizeof (t));
623: }
624:
625: /* Allocate and initialize speed optimization table
626: for multiplication by 64-bit operand in MSB-first mode */
627: int Gf128Tab64Init (unsigned __int8 *a, GfCtx *ctx)
628: {
629: GfCtx8k *ctx8k;
630: unsigned __int8 am[16];
631: int i, j;
632:
633: ctx8k = (GfCtx8k *) TCalloc (sizeof (GfCtx8k));
634: if (!ctx8k)
635: return FALSE;
636:
637: memcpy (am, a, 16);
638: MirrorBits128 (am);
639: compile_8k_table (am, ctx8k);
640:
641: /* Convert 8k LSB-first table to 4k MSB-first */
642: for (i = 16; i < 32; i++)
643: {
644: for (j = 0; j < 16; j++)
645: {
646: int jm = 0;
647: jm |= (j & 0x1) << 3;
648: jm |= (j & 0x2) << 1;
649: jm |= (j & 0x4) >> 1;
650: jm |= (j & 0x8) >> 3;
651:
652: memcpy (&ctx->gf_t128[i-16][jm], (unsigned char *)&ctx8k->gf_t8k[31-i][j], 16);
653: MirrorBits128 ((unsigned char *)&ctx->gf_t128[i-16][jm]);
654: }
655: }
656:
657: burn (ctx8k ,sizeof (*ctx8k));
658: burn (am, sizeof (am));
659: TCfree (ctx8k);
660: return TRUE;
661: }
662:
663: int Gf64TabInit (unsigned __int8 *a, GfCtx *ctx)
664: {
1.1.1.4 root 665: /* Deprecated/legacy */
666:
1.1 root 667: GfCtx4k64 *ctx4k;
668: unsigned __int8 am[8];
669: int i, j;
670:
671: ctx4k = (GfCtx4k64 *) TCalloc (sizeof (GfCtx4k64));
672: if (!ctx4k)
673: return FALSE;
674:
675: memcpy (am, a, 8);
676: MirrorBits64 (am);
677: compile_4k_table64 (am, ctx4k);
678:
679: /* Convert LSB-first table to MSB-first */
680: for (i = 0; i < 16; i++)
681: {
682: for (j = 0; j < 16; j++)
683: {
684: int jm = 0;
685: jm |= (j & 0x1) << 3;
686: jm |= (j & 0x2) << 1;
687: jm |= (j & 0x4) >> 1;
688: jm |= (j & 0x8) >> 3;
689:
690: memcpy (&ctx->gf_t64[i][jm], (unsigned char *)&ctx4k->gf_t4k[15-i][j], 8);
691: MirrorBits64 ((unsigned char *)&ctx->gf_t64[i][jm]);
692: }
693: }
694:
695: burn (ctx4k,sizeof (*ctx4k));
696: burn (am, sizeof (am));
697: TCfree (ctx4k);
698: return TRUE;
699: }
700:
701: #define xor_8kt64(i) \
702: xor_block_aligned(r, ctx->gf_t128[i + i][a[i] & 15]); \
703: xor_block_aligned(r, ctx->gf_t128[i + i + 1][a[i] >> 4])
704:
705: /* Multiply a 128-bit number by a 64-bit number in the finite field GF(2^128) */
706: void Gf128MulBy64Tab (unsigned __int8 a[8], unsigned __int8 p[16], GfCtx *ctx)
707: {
708: unsigned __int32 r[CBLK_LEN >> 2];
709:
710: move_block_aligned(r, ctx->gf_t128[7*2][a[7] & 15]);
711: xor_block_aligned(r, ctx->gf_t128[7*2+1][a[7] >> 4]);
712:
713: if (*(unsigned __int16 *)a)
714: {
715: xor_8kt64(0);
716: xor_8kt64(1);
717: }
718: if (a[2])
719: {
720: xor_8kt64(2);
721: }
722: xor_8kt64(3);
723: xor_8kt64(4);
724: xor_8kt64(5);
725: xor_8kt64(6);
726:
727: move_block_aligned(p, r);
728: }
729:
730: #define xor_8k64(i) \
731: xor_block_aligned64(r, ctx->gf_t64[i + i][a[i] & 15]); \
732: xor_block_aligned64(r, ctx->gf_t64[i + i + 1][a[i] >> 4])
733:
734: /* Multiply two 64-bit numbers in the finite field GF(2^64) */
735: void Gf64MulTab (unsigned char a[8], unsigned char p[8], GfCtx *ctx)
736: {
1.1.1.4 root 737: /* Deprecated/legacy */
738:
1.1 root 739: unsigned __int32 r[CBLK_LEN8 >> 2];
740:
741: move_block_aligned64(r, ctx->gf_t64[7*2][a[7] & 15]);
742: xor_block_aligned64(r, ctx->gf_t64[7*2+1][a[7] >> 4]);
743:
744: if (*(unsigned __int16 *)a)
745: {
746: xor_8k64(0);
747: xor_8k64(1);
748: }
749: if (a[2])
750: {
751: xor_8k64(2);
752: }
753: xor_8k64(3);
754: xor_8k64(4);
755: xor_8k64(5);
756: xor_8k64(6);
757:
758: move_block_aligned64(p, r);
759: }
760:
761:
762: /* Basic algorithms for testing of optimized algorithms */
763:
764: static void xor128 (unsigned __int64 *a, unsigned __int64 *b)
765: {
766: *a++ ^= *b++;
767: *a ^= *b;
768: }
769:
770: static void shl128 (unsigned __int8 *a)
771: {
772: int i, x = 0, xx;
773: for (i = 15; i >= 0; i--)
774: {
775: xx = (a[i] & 0x80) >> 7;
776: a[i] = (a[i] << 1) | x;
777: x = xx;
778: }
779: }
780:
781: static void GfMul128Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8 *p)
782: {
783: int i;
784: unsigned __int8 la[16];
785: memcpy (la, a, 16);
786: memset (p, 0, 16);
787:
788: for (i = 0; i < 128; i++)
789: {
790: if (IsBitSet128 (i, b))
791: xor128 ((unsigned __int64 *)p, (unsigned __int64 *)la);
792:
793: if (la[0] & 0x80)
794: {
795: shl128 (la);
796: la[15] ^= 0x87;
797: }
798: else
799: {
800: shl128 (la);
801: }
802: }
803: }
804:
805: static void xor64 (unsigned __int64 *a, unsigned __int64 *b)
806: {
807: *a ^= *b;
808: }
809:
810: static void shl64 (unsigned __int8 *a)
811: {
812: int i, x = 0, xx;
813: for (i = 7; i >= 0; i--)
814: {
815: xx = (a[i] & 0x80) >> 7;
816: a[i] = (a[i] << 1) | x;
817: x = xx;
818: }
819: }
820:
821: static void GfMul64Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8* p)
822: {
1.1.1.4 root 823: /* Deprecated/legacy */
824:
1.1 root 825: int i;
826: unsigned __int8 la[8];
827: memcpy (la, a, 8);
828: memset (p, 0, 8);
829:
830: for (i = 0; i < 64; i++)
831: {
832: if (IsBitSet64 (i, b))
833: xor64 ((unsigned __int64 *)p, (unsigned __int64 *)la);
834:
835: if (la[0] & 0x80)
836: {
837: shl64 (la);
838: la[7] ^= 0x1b;
839: }
840: else
841: {
842: shl64 (la);
843: }
844: }
845: }
846:
847:
848: BOOL GfMulSelfTest ()
849: {
850: BOOL result = TRUE;
851: unsigned __int8 a[16];
852: unsigned __int8 b[16];
853: unsigned __int8 p1[16];
854: unsigned __int8 p2[16];
855: GfCtx *gfCtx = (GfCtx *) TCalloc (sizeof (GfCtx));
856: int i, j;
857:
858: if (!gfCtx)
859: return FALSE;
860:
1.1.1.4 root 861: /* GF(2^64) - deprecated/legacy */
1.1 root 862: for (i = 0; i < 0x100; i++)
863: {
864: for (j = 0; j < 8; j++)
865: {
866: a[j] = (unsigned __int8) i;
867: b[j] = a[j] ^ 0xff;
868: }
869:
870: GfMul64Basic (a, b, p1);
871:
872: Gf64TabInit (a, gfCtx);
873: Gf64MulTab (b, p2, gfCtx);
874:
875: if (memcmp (p1, p2, 8) != 0)
876: result = FALSE;
877: }
878:
879: /* GF(2^128) */
880: for (i = 0; i < 0x100; i++)
881: {
882: for (j = 0; j < 16; j++)
883: {
884: a[j] = (unsigned __int8) i;
885: b[j] = j < 8 ? 0 : a[j] ^ 0xff;
886: }
887:
888: GfMul128Basic (a, b, p1);
889:
890: Gf128Tab64Init (a, gfCtx);
891: Gf128MulBy64Tab (b + 8, p2, gfCtx);
892:
893: if (memcmp (p1, p2, 16) != 0)
894: result = FALSE;
895: }
896:
897: TCfree (gfCtx);
898: return result;
899: }
900:
901: #if defined(__cplusplus)
902: }
903: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.