|
|
1.1 ! root 1: /* ! 2: --------------------------------------------------------------------------- ! 3: Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. ! 4: ! 5: LICENSE TERMS ! 6: ! 7: The free distribution and use of this software in both source and binary ! 8: form is allowed (with or without changes) provided that: ! 9: ! 10: 1. distributions of this source code include the above copyright ! 11: notice, this list of conditions and the following disclaimer; ! 12: ! 13: 2. distributions in binary form include the above copyright ! 14: notice, this list of conditions and the following disclaimer ! 15: in the documentation and/or other associated materials; ! 16: ! 17: 3. the copyright holder's name is not used to endorse products ! 18: built using this software without specific written permission. ! 19: ! 20: ALTERNATIVELY, provided that this notice is retained in full, this product ! 21: may be distributed under the terms of the GNU General Public License (GPL), ! 22: in which case the provisions of the GPL apply INSTEAD OF those given above. ! 23: ! 24: DISCLAIMER ! 25: ! 26: This software is provided 'as is' with no explicit or implied warranties ! 27: in respect of its properties, including, but not limited to, correctness ! 28: and/or fitness for purpose. ! 29: --------------------------------------------------------------------------- ! 30: Issue Date: 31/01/2004 ! 31: ! 32: My thanks to John Viega and David McGrew for their support in developing ! 33: this code and to David for testing it on a big-endain system. ! 34: */ ! 35: ! 36: /* ! 37: TrueCrypt Foundation made the following changes: ! 38: ! 39: - Added multiplication in the finite field GF(2^128) optimized for ! 40: cases involving a 64-bit operand. ! 41: ! 42: - Added multiplication in the finite field GF(2^64). ! 43: ! 44: - Added MSB-first mode. ! 45: ! 46: - Removed GCM. ! 47: */ ! 48: ! 49: #ifdef LINUX_DRIVER ! 50: #include <linux/module.h> ! 51: #include <linux/string.h> ! 52: #else ! 53: #include <memory.h> ! 54: #endif ! 55: ! 56: #include "GfMul.h" ! 57: #include "Tcdefs.h" ! 58: #include "Endian.h" ! 59: ! 60: /* BUFFER_ALIGN32 or BUFFER_ALIGN64 must be defined at this point to */ ! 61: /* enable faster operation by taking advantage of memory aligned values */ ! 62: /* NOTE: the BUFFER_ALIGN64 option has not been tested extensively */ ! 63: ! 64: #define BUFFER_ALIGN32 ! 65: #define UNROLL_LOOPS /* define to unroll some loops */ ! 66: #define IN_LINES /* define to use inline functions */ ! 67: /* in place of macros */ ! 68: ! 69: #define mode(x) GM_##x ! 70: ! 71: #if defined(__cplusplus) ! 72: extern "C" ! 73: { ! 74: #endif ! 75: ! 76: typedef unsigned __int32 mode(32t); ! 77: typedef unsigned __int64 mode(64t); ! 78: ! 79: #define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ ! 80: #define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ ! 81: ! 82: #if BYTE_ORDER == LITTLE_ENDIAN ! 83: # define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN ! 84: #endif ! 85: ! 86: #if BYTE_ORDER == BIG_ENDIAN ! 87: # define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN ! 88: #endif ! 89: ! 90: #ifdef _MSC_VER ! 91: #pragma intrinsic(memcpy) ! 92: #define in_line __inline ! 93: #else ! 94: #define in_line ! 95: #endif ! 96: ! 97: #if 0 && defined(_MSC_VER) ! 98: #define rotl32 _lrotl ! 99: #define rotr32 _lrotr ! 100: #else ! 101: #define rotl32(x,n) (((x) << n) | ((x) >> (32 - n))) ! 102: #define rotr32(x,n) (((x) >> n) | ((x) << (32 - n))) ! 103: #endif ! 104: ! 105: #if !defined(bswap_32) ! 106: #define bswap_32(x) (rotr32((x), 24) & 0x00ff00ff | rotr32((x), 8) & 0xff00ff00) ! 107: #endif ! 108: ! 109: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) ! 110: #define SWAP_BYTES ! 111: #else ! 112: #undef SWAP_BYTES ! 113: #endif ! 114: ! 115: #if defined(SWAP_BYTES) ! 116: ! 117: #if defined ( IN_LINES ) ! 118: ! 119: in_line void bsw_32(void * p, unsigned int n) ! 120: { unsigned int i = n; ! 121: while(i--) ! 122: ((mode(32t)*)p)[i] = bswap_32(((mode(32t)*)p)[i]); ! 123: } ! 124: ! 125: #else ! 126: ! 127: #define bsw_32(p,n) \ ! 128: { int _i = (n); while(_i--) ((mode(32t)*)p)[_i] = bswap_32(((mode(32t)*)p)[_i]); } ! 129: ! 130: #endif ! 131: ! 132: #else ! 133: #define bsw_32(p,n) ! 134: #endif ! 135: ! 136: /* These values are used to detect long word alignment in order */ ! 137: /* to speed up some GCM buffer operations. This facility may */ ! 138: /* not work on some machines */ ! 139: ! 140: #define lp08(x) ((unsigned char*)(x)) ! 141: #define lp32(x) ((mode(32t)*)(x)) ! 142: #define lp64(x) ((mode(64t)*)(x)) ! 143: ! 144: #define A32_MASK 3 ! 145: #define A64_MASK 7 ! 146: #define aligned32(x) (!(((mode(32t))(x)) & A32_MASK)) ! 147: #define aligned64(x) (!(((mode(32t))(x)) & A64_MASK)) ! 148: ! 149: #if defined( BUFFER_ALIGN32 ) ! 150: ! 151: #define ADR_MASK A32_MASK ! 152: #define aligned aligned32 ! 153: #define lp lp32 ! 154: #define lp_inc 4 ! 155: ! 156: #if defined( IN_LINES ) ! 157: ! 158: in_line void move_block_aligned( void *p, const void *q) ! 159: { ! 160: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], ! 161: lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3]; ! 162: } ! 163: ! 164: in_line void move_block_aligned64( void *p, const void *q) ! 165: { ! 166: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1]; ! 167: } ! 168: ! 169: in_line void xor_block_aligned( void *p, const void *q) ! 170: { ! 171: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], ! 172: lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3]; ! 173: } ! 174: ! 175: in_line void xor_block_aligned64( void *p, const void *q) ! 176: { ! 177: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1]; ! 178: } ! 179: ! 180: #else ! 181: ! 182: #define move_block_aligned(p,q) \ ! 183: lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], \ ! 184: lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3] ! 185: ! 186: #define xor_block_aligned(p,q) \ ! 187: lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], \ ! 188: lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3] ! 189: ! 190: #endif ! 191: ! 192: #elif defined( BUFFER_ALIGN64 ) ! 193: ! 194: #define ADR_MASK A64_MASK ! 195: #define aligned aligned64 ! 196: #define lp lp64 ! 197: #define lp_inc 8 ! 198: ! 199: #define move_block_aligned(p,q) \ ! 200: lp64(p)[0] = lp64(q)[0], lp64(p)[1] = lp64(q)[1] ! 201: ! 202: #define xor_block_aligned(p,q) \ ! 203: lp64(p)[0] ^= lp64(q)[0], lp64(p)[1] ^= lp64(q)[1] ! 204: ! 205: #else ! 206: #define aligned(x) 0 ! 207: #endif ! 208: ! 209: #define move_block(p,q) memcpy((p), (q), BLOCK_LEN) ! 210: ! 211: #define xor_block(p,q) \ ! 212: lp08(p)[ 0] ^= lp08(q)[ 0], lp08(p)[ 1] ^= lp08(q)[ 1], \ ! 213: lp08(p)[ 2] ^= lp08(q)[ 2], lp08(p)[ 3] ^= lp08(q)[ 3], \ ! 214: lp08(p)[ 4] ^= lp08(q)[ 4], lp08(p)[ 5] ^= lp08(q)[ 5], \ ! 215: lp08(p)[ 6] ^= lp08(q)[ 6], lp08(p)[ 7] ^= lp08(q)[ 7], \ ! 216: lp08(p)[ 8] ^= lp08(q)[ 8], lp08(p)[ 9] ^= lp08(q)[ 9], \ ! 217: lp08(p)[10] ^= lp08(q)[10], lp08(p)[11] ^= lp08(q)[11], \ ! 218: lp08(p)[12] ^= lp08(q)[12], lp08(p)[13] ^= lp08(q)[13], \ ! 219: lp08(p)[14] ^= lp08(q)[14], lp08(p)[15] ^= lp08(q)[15] ! 220: ! 221: ! 222: #define gf_dat(q) {\ ! 223: q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\ ! 224: q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\ ! 225: q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\ ! 226: q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\ ! 227: q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\ ! 228: q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\ ! 229: q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\ ! 230: q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\ ! 231: q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\ ! 232: q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\ ! 233: q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\ ! 234: q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\ ! 235: q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\ ! 236: q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\ ! 237: q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\ ! 238: q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\ ! 239: q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\ ! 240: q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\ ! 241: q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\ ! 242: q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\ ! 243: q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\ ! 244: q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\ ! 245: q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\ ! 246: q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\ ! 247: q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\ ! 248: q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\ ! 249: q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\ ! 250: q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\ ! 251: q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\ ! 252: q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\ ! 253: q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\ ! 254: q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) } ! 255: ! 256: /* given the value i in 0..255 as the byte overflow when a a field */ ! 257: /* element in GHASH is multipled by x^8, this function will return */ ! 258: /* the values that are generated in the lo 16-bit word of the field */ ! 259: /* value by applying the modular polynomial. The values lo_byte and */ ! 260: /* hi_byte are returned via the macro xp_fun(lo_byte, hi_byte) so */ ! 261: /* that the values can be assembled into memory as required by a */ ! 262: /* suitable definition of this macro operating on the table above */ ! 263: ! 264: #define xp(i) xp_fun( \ ! 265: (i & 0x80 ? 0xe1 : 0) ^ (i & 0x40 ? 0x70 : 0) ^ \ ! 266: (i & 0x20 ? 0x38 : 0) ^ (i & 0x10 ? 0x1c : 0) ^ \ ! 267: (i & 0x08 ? 0x0e : 0) ^ (i & 0x04 ? 0x07 : 0) ^ \ ! 268: (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \ ! 269: (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x80 : 0) ^ \ ! 270: (i & 0x20 ? 0x40 : 0) ^ (i & 0x10 ? 0x20 : 0) ^ \ ! 271: (i & 0x08 ? 0x10 : 0) ^ (i & 0x04 ? 0x08 : 0) ^ \ ! 272: (i & 0x02 ? 0x84 : 0) ^ (i & 0x01 ? 0xc2 : 0) ) ! 273: ! 274: #define xp64(i) xp_fun( \ ! 275: (i & 0x80 ? 0xd8 : 0) ^ (i & 0x40 ? 0x6c : 0) ^ \ ! 276: (i & 0x20 ? 0x36 : 0) ^ (i & 0x10 ? 0x1b : 0) ^ \ ! 277: (i & 0x08 ? 0x0d : 0) ^ (i & 0x04 ? 0x06 : 0) ^ \ ! 278: (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \ ! 279: (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x00 : 0) ^ \ ! 280: (i & 0x20 ? 0x00 : 0) ^ (i & 0x10 ? 0x00 : 0) ^ \ ! 281: (i & 0x08 ? 0x80 : 0) ^ (i & 0x04 ? 0xc0 : 0) ^ \ ! 282: (i & 0x02 ? 0x60 : 0) ^ (i & 0x01 ? 0xb0 : 0) ) ! 283: ! 284: static mode(32t) gf_poly[2] = { 0, 0xe1000000 }; ! 285: static mode(32t) gf_poly64[2] = { 0, 0xd8000000 }; ! 286: ! 287: /* Multiply of a GF128 field element by x. The field element */ ! 288: /* is held in an array of bytes in which field bits 8n..8n + 7 */ ! 289: /* are held in byte[n], with lower indexed bits placed in the */ ! 290: /* more numerically significant bit positions in bytes. */ ! 291: ! 292: /* This function multiples a field element x, in the polynomial */ ! 293: /* field representation. It uses 32-bit word operations to gain */ ! 294: /* speed but compensates for machine endianess and hence works */ ! 295: /* correctly on both styles of machine */ ! 296: ! 297: in_line void mul_x(mode(32t) x[4]) ! 298: { mode(32t) t; ! 299: ! 300: bsw_32(x, 4); ! 301: ! 302: /* at this point the filed element bits 0..127 are set out */ ! 303: /* as follows in 32-bit words (where the most significant */ ! 304: /* (ms) numeric bits are to the left) */ ! 305: /* */ ! 306: /* x[0] x[1] x[2] x[3] */ ! 307: /* ms ls ms ls ms ls ms ls */ ! 308: /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */ ! 309: ! 310: t = gf_poly[x[3] & 1]; /* bit 127 of the element */ ! 311: x[3] = (x[3] >> 1) | (x[2] << 31); /* shift bits up by one */ ! 312: x[2] = (x[2] >> 1) | (x[1] << 31); /* position */ ! 313: x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */ ! 314: x[0] = (x[0] >> 1) ^ t; /* the field polynomial */ ! 315: bsw_32(x, 4); ! 316: } ! 317: ! 318: in_line void mul_x64(mode(32t) x[2]) ! 319: { mode(32t) t; ! 320: ! 321: bsw_32(x, 2); ! 322: ! 323: /* at this point the filed element bits 0..127 are set out */ ! 324: /* as follows in 32-bit words (where the most significant */ ! 325: /* (ms) numeric bits are to the left) */ ! 326: /* */ ! 327: /* x[0] x[1] x[2] x[3] */ ! 328: /* ms ls ms ls ms ls ms ls */ ! 329: /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */ ! 330: ! 331: t = gf_poly64[x[1] & 1]; /* bit 127 of the element */ ! 332: /* shift bits up by one */ ! 333: /* position */ ! 334: x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */ ! 335: x[0] = (x[0] >> 1) ^ t; /* the field polynomial */ ! 336: bsw_32(x, 2); ! 337: } ! 338: ! 339: /* Multiply of a GF128 field element by x^8 using 32-bit words */ ! 340: /* for speed - machine endianess matters here */ ! 341: ! 342: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) ! 343: ! 344: #define xp_fun(x,y) ((mode(32t))(x)) | (((mode(32t))(y)) << 8) ! 345: static const unsigned __int16 gft_le[256] = gf_dat(xp); ! 346: static const unsigned __int16 gft_le64[256] = gf_dat(xp64); ! 347: ! 348: in_line void mul_lex8(mode(32t) x[4]) /* mutiply with long words */ ! 349: { mode(32t) t = (x[3] >> 24); /* in little endian format */ ! 350: x[3] = (x[3] << 8) | (x[2] >> 24); ! 351: x[2] = (x[2] << 8) | (x[1] >> 24); ! 352: x[1] = (x[1] << 8) | (x[0] >> 24); ! 353: x[0] = (x[0] << 8) ^ gft_le[t]; ! 354: } ! 355: ! 356: in_line void mul_lex8_64(mode(32t) x[2]) /* mutiply with long words */ ! 357: { mode(32t) t = (x[1] >> 24); /* in little endian format */ ! 358: x[1] = (x[1] << 8) | (x[0] >> 24); ! 359: x[0] = (x[0] << 8) ^ gft_le64[t]; ! 360: } ! 361: ! 362: #endif ! 363: ! 364: #if 1 || (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) ! 365: ! 366: #undef xp_fun ! 367: #define xp_fun(x,y) ((mode(32t))(y)) | (((mode(32t))(x)) << 8) ! 368: static const unsigned __int16 gft_be[256] = gf_dat(xp); ! 369: static const unsigned __int16 gft_be64[256] = gf_dat(xp64); ! 370: ! 371: in_line void mul_bex8(mode(32t) x[4]) /* mutiply with long words */ ! 372: { mode(32t) t = (x[3] & 0xff); /* in big endian format */ ! 373: x[3] = (x[3] >> 8) | (x[2] << 24); ! 374: x[2] = (x[2] >> 8) | (x[1] << 24); ! 375: x[1] = (x[1] >> 8) | (x[0] << 24); ! 376: x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be[t]) << 16); ! 377: } ! 378: ! 379: in_line void mul_bex8_64(mode(32t) x[2]) /* mutiply with long words */ ! 380: { mode(32t) t = (x[1] & 0xff); /* in big endian format */ ! 381: x[1] = (x[1] >> 8) | (x[0] << 24); ! 382: x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be64[t]) << 16); ! 383: } ! 384: ! 385: #endif ! 386: ! 387: /* hence choose the correct version for the machine endianess */ ! 388: ! 389: #if PLATFORM_BYTE_ORDER == BRG_BIG_ENDIAN ! 390: #define mul_x8 mul_bex8 ! 391: #define mul_x8_64 mul_bex8_64 ! 392: #else ! 393: #define mul_x8 mul_lex8 ! 394: #define mul_x8_64 mul_lex8_64 ! 395: #endif ! 396: ! 397: /* different versions of the general gf_mul function are provided */ ! 398: /* here. Sadly none are very fast :-( */ ! 399: ! 400: void GfMul128 (void *a, const void* b) ! 401: { mode(32t) r[CBLK_LEN >> 2], p[8][CBLK_LEN >> 2]; ! 402: int i; ! 403: ! 404: move_block_aligned(p[0], b); ! 405: bsw_32(p[0], 4); ! 406: for(i = 0; i < 7; ++i) ! 407: { ! 408: p[i + 1][3] = (p[i][3] >> 1) | (p[i][2] << 31); ! 409: p[i + 1][2] = (p[i][2] >> 1) | (p[i][1] << 31); ! 410: p[i + 1][1] = (p[i][1] >> 1) | (p[i][0] << 31); ! 411: p[i + 1][0] = (p[i][0] >> 1) ^ gf_poly[p[i][3] & 1]; ! 412: } ! 413: ! 414: memset(r, 0, CBLK_LEN); ! 415: for(i = 0; i < 16; ++i) ! 416: { ! 417: if(i) mul_bex8(r); /* order is always big endian here */ ! 418: ! 419: if(((unsigned char*)a)[15 - i] & 0x80) ! 420: xor_block_aligned(r, p[0]); ! 421: if(((unsigned char*)a)[15 - i] & 0x40) ! 422: xor_block_aligned(r, p[1]); ! 423: if(((unsigned char*)a)[15 - i] & 0x20) ! 424: xor_block_aligned(r, p[2]); ! 425: if(((unsigned char*)a)[15 - i] & 0x10) ! 426: xor_block_aligned(r, p[3]); ! 427: if(((unsigned char*)a)[15 - i] & 0x08) ! 428: xor_block_aligned(r, p[4]); ! 429: if(((unsigned char*)a)[15 - i] & 0x04) ! 430: xor_block_aligned(r, p[5]); ! 431: if(((unsigned char*)a)[15 - i] & 0x02) ! 432: xor_block_aligned(r, p[6]); ! 433: if(((unsigned char*)a)[15 - i] & 0x01) ! 434: xor_block_aligned(r, p[7]); ! 435: } ! 436: bsw_32(r, 4); ! 437: move_block_aligned(a, r); ! 438: } ! 439: ! 440: #if defined( UNROLL_LOOPS ) ! 441: ! 442: #define xor_8k(i) \ ! 443: xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); \ ! 444: xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]) ! 445: ! 446: ! 447: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx) ! 448: { unsigned __int32 r[CBLK_LEN >> 2]; ! 449: ! 450: move_block_aligned(r, ctx->gf_t8k[0][a[0] & 15]); ! 451: xor_block_aligned(r, ctx->gf_t8k[1][a[0] >> 4]); ! 452: xor_8k( 1); xor_8k( 2); xor_8k( 3); ! 453: xor_8k( 4); xor_8k( 5); xor_8k( 6); xor_8k( 7); ! 454: xor_8k( 8); xor_8k( 9); xor_8k(10); xor_8k(11); ! 455: xor_8k(12); xor_8k(13); xor_8k(14); xor_8k(15); ! 456: move_block_aligned(a, r); ! 457: } ! 458: ! 459: #else ! 460: ! 461: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx) ! 462: { unsigned __int32 r[CBLK_LEN >> 2], *p; ! 463: int i; ! 464: ! 465: p = ctx->gf_t8k[0][a[0] & 15]; ! 466: memcpy(r, p, CBLK_LEN); ! 467: p = ctx->gf_t8k[1][a[0] >> 4]; ! 468: xor_block_aligned(r, p); ! 469: for(i = 1; i < CBLK_LEN; ++i) ! 470: { ! 471: xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); ! 472: xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]); ! 473: } ! 474: memcpy(a, r, CBLK_LEN); ! 475: } ! 476: ! 477: #endif ! 478: ! 479: void compile_8k_table(unsigned __int8 *a, GfCtx8k *ctx) ! 480: { int i, j, k; ! 481: ! 482: memset(ctx->gf_t8k, 0, 32 * 16 * 16); ! 483: for(i = 0; i < 2 * CBLK_LEN; ++i) ! 484: { ! 485: if(i == 0) ! 486: { ! 487: memcpy(ctx->gf_t8k[1][8], a, CBLK_LEN); ! 488: for(j = 4; j > 0; j >>= 1) ! 489: { ! 490: memcpy(ctx->gf_t8k[1][j], ctx->gf_t8k[1][j + j], CBLK_LEN); ! 491: mul_x(ctx->gf_t8k[1][j]); ! 492: } ! 493: memcpy(ctx->gf_t8k[0][8], ctx->gf_t8k[1][1], CBLK_LEN); ! 494: mul_x(ctx->gf_t8k[0][8]); ! 495: for(j = 4; j > 0; j >>= 1) ! 496: { ! 497: memcpy(ctx->gf_t8k[0][j], ctx->gf_t8k[0][j + j], CBLK_LEN); ! 498: mul_x(ctx->gf_t8k[0][j]); ! 499: } ! 500: } ! 501: else if(i > 1) ! 502: for(j = 8; j > 0; j >>= 1) ! 503: { ! 504: memcpy(ctx->gf_t8k[i][j], ctx->gf_t8k[i - 2][j], CBLK_LEN); ! 505: mul_x8(ctx->gf_t8k[i][j]); ! 506: } ! 507: ! 508: for(j = 2; j < 16; j += j) ! 509: { ! 510: mode(32t) *pj = ctx->gf_t8k[i][j]; ! 511: mode(32t) *pk = ctx->gf_t8k[i][1]; ! 512: mode(32t) *pl = ctx->gf_t8k[i][j + 1]; ! 513: ! 514: for(k = 1; k < j; ++k) ! 515: { ! 516: *pl++ = pj[0] ^ *pk++; ! 517: *pl++ = pj[1] ^ *pk++; ! 518: *pl++ = pj[2] ^ *pk++; ! 519: *pl++ = pj[3] ^ *pk++; ! 520: } ! 521: } ! 522: } ! 523: } ! 524: ! 525: ! 526: void compile_4k_table64(unsigned __int8 *a, GfCtx4k64 *ctx) ! 527: { int i, j, k; ! 528: ! 529: memset(ctx->gf_t4k, 0, sizeof(ctx->gf_t4k)); ! 530: for(i = 0; i < 2 * CBLK_LEN8; ++i) ! 531: { ! 532: if(i == 0) ! 533: { ! 534: memcpy(ctx->gf_t4k[1][8], a, CBLK_LEN8); ! 535: for(j = 4; j > 0; j >>= 1) ! 536: { ! 537: memcpy(ctx->gf_t4k[1][j], ctx->gf_t4k[1][j + j], CBLK_LEN8); ! 538: mul_x64(ctx->gf_t4k[1][j]); ! 539: } ! 540: memcpy(ctx->gf_t4k[0][8], ctx->gf_t4k[1][1], CBLK_LEN8); ! 541: mul_x64(ctx->gf_t4k[0][8]); ! 542: for(j = 4; j > 0; j >>= 1) ! 543: { ! 544: memcpy(ctx->gf_t4k[0][j], ctx->gf_t4k[0][j + j], CBLK_LEN8); ! 545: mul_x64(ctx->gf_t4k[0][j]); ! 546: } ! 547: } ! 548: else if(i > 1) ! 549: for(j = 8; j > 0; j >>= 1) ! 550: { ! 551: memcpy(ctx->gf_t4k[i][j], ctx->gf_t4k[i - 2][j], CBLK_LEN8); ! 552: mul_x8_64(ctx->gf_t4k[i][j]); ! 553: } ! 554: ! 555: for(j = 2; j < 16; j += j) ! 556: { ! 557: mode(32t) *pj = ctx->gf_t4k[i][j]; ! 558: mode(32t) *pk = ctx->gf_t4k[i][1]; ! 559: mode(32t) *pl = ctx->gf_t4k[i][j + 1]; ! 560: ! 561: for(k = 1; k < j; ++k) ! 562: { ! 563: *pl++ = pj[0] ^ *pk++; ! 564: *pl++ = pj[1] ^ *pk++; ! 565: *pl++ = pj[2] ^ *pk++; ! 566: *pl++ = pj[3] ^ *pk++; ! 567: } ! 568: } ! 569: } ! 570: } ! 571: ! 572: static int IsBitSet128 (unsigned int bit, unsigned __int8 *a) ! 573: { ! 574: return a[(127 - bit) / 8] & (0x80 >> ((127 - bit) % 8)); ! 575: } ! 576: ! 577: static int IsBitSet64 (unsigned int bit, unsigned __int8 *a) ! 578: { ! 579: return a[(63 - bit) / 8] & (0x80 >> ((63 - bit) % 8)); ! 580: } ! 581: ! 582: static void SetBit128 (unsigned int bit, unsigned __int8 *a) ! 583: { ! 584: a[(127 - bit) / 8] |= 0x80 >> ((127 - bit) % 8); ! 585: } ! 586: ! 587: static void SetBit64 (unsigned int bit, unsigned __int8 *a) ! 588: { ! 589: a[(63 - bit) / 8] |= 0x80 >> ((63 - bit) % 8); ! 590: } ! 591: ! 592: static void MirrorBits128 (unsigned __int8 *a) ! 593: { ! 594: unsigned __int8 t[128 / 8]; ! 595: int i; ! 596: memset (t,0,16); ! 597: for (i = 0; i < 128; i++) ! 598: { ! 599: if (IsBitSet128(i, a)) ! 600: SetBit128 (127 - i, t); ! 601: } ! 602: memcpy (a, t, sizeof (t)); ! 603: burn (t,sizeof (t)); ! 604: } ! 605: ! 606: void MirrorBits64 (unsigned __int8 *a) ! 607: { ! 608: unsigned __int8 t[64 / 8]; ! 609: int i; ! 610: memset (t,0,8); ! 611: for (i = 0; i < 64; i++) ! 612: { ! 613: if (IsBitSet64(i, a)) ! 614: SetBit64 (63 - i, t); ! 615: } ! 616: memcpy (a, t, sizeof (t)); ! 617: burn (t,sizeof (t)); ! 618: } ! 619: ! 620: /* Allocate and initialize speed optimization table ! 621: for multiplication by 64-bit operand in MSB-first mode */ ! 622: int Gf128Tab64Init (unsigned __int8 *a, GfCtx *ctx) ! 623: { ! 624: GfCtx8k *ctx8k; ! 625: unsigned __int8 am[16]; ! 626: int i, j; ! 627: ! 628: ctx8k = (GfCtx8k *) TCalloc (sizeof (GfCtx8k)); ! 629: if (!ctx8k) ! 630: return FALSE; ! 631: ! 632: memcpy (am, a, 16); ! 633: MirrorBits128 (am); ! 634: compile_8k_table (am, ctx8k); ! 635: ! 636: /* Convert 8k LSB-first table to 4k MSB-first */ ! 637: for (i = 16; i < 32; i++) ! 638: { ! 639: for (j = 0; j < 16; j++) ! 640: { ! 641: int jm = 0; ! 642: jm |= (j & 0x1) << 3; ! 643: jm |= (j & 0x2) << 1; ! 644: jm |= (j & 0x4) >> 1; ! 645: jm |= (j & 0x8) >> 3; ! 646: ! 647: memcpy (&ctx->gf_t128[i-16][jm], (unsigned char *)&ctx8k->gf_t8k[31-i][j], 16); ! 648: MirrorBits128 ((unsigned char *)&ctx->gf_t128[i-16][jm]); ! 649: } ! 650: } ! 651: ! 652: burn (ctx8k ,sizeof (*ctx8k)); ! 653: burn (am, sizeof (am)); ! 654: TCfree (ctx8k); ! 655: return TRUE; ! 656: } ! 657: ! 658: int Gf64TabInit (unsigned __int8 *a, GfCtx *ctx) ! 659: { ! 660: GfCtx4k64 *ctx4k; ! 661: unsigned __int8 am[8]; ! 662: int i, j; ! 663: ! 664: ctx4k = (GfCtx4k64 *) TCalloc (sizeof (GfCtx4k64)); ! 665: if (!ctx4k) ! 666: return FALSE; ! 667: ! 668: memcpy (am, a, 8); ! 669: MirrorBits64 (am); ! 670: compile_4k_table64 (am, ctx4k); ! 671: ! 672: /* Convert LSB-first table to MSB-first */ ! 673: for (i = 0; i < 16; i++) ! 674: { ! 675: for (j = 0; j < 16; j++) ! 676: { ! 677: int jm = 0; ! 678: jm |= (j & 0x1) << 3; ! 679: jm |= (j & 0x2) << 1; ! 680: jm |= (j & 0x4) >> 1; ! 681: jm |= (j & 0x8) >> 3; ! 682: ! 683: memcpy (&ctx->gf_t64[i][jm], (unsigned char *)&ctx4k->gf_t4k[15-i][j], 8); ! 684: MirrorBits64 ((unsigned char *)&ctx->gf_t64[i][jm]); ! 685: } ! 686: } ! 687: ! 688: burn (ctx4k,sizeof (*ctx4k)); ! 689: burn (am, sizeof (am)); ! 690: TCfree (ctx4k); ! 691: return TRUE; ! 692: } ! 693: ! 694: #define xor_8kt64(i) \ ! 695: xor_block_aligned(r, ctx->gf_t128[i + i][a[i] & 15]); \ ! 696: xor_block_aligned(r, ctx->gf_t128[i + i + 1][a[i] >> 4]) ! 697: ! 698: /* Multiply a 128-bit number by a 64-bit number in the finite field GF(2^128) */ ! 699: void Gf128MulBy64Tab (unsigned __int8 a[8], unsigned __int8 p[16], GfCtx *ctx) ! 700: { ! 701: unsigned __int32 r[CBLK_LEN >> 2]; ! 702: ! 703: move_block_aligned(r, ctx->gf_t128[7*2][a[7] & 15]); ! 704: xor_block_aligned(r, ctx->gf_t128[7*2+1][a[7] >> 4]); ! 705: ! 706: if (*(unsigned __int16 *)a) ! 707: { ! 708: xor_8kt64(0); ! 709: xor_8kt64(1); ! 710: } ! 711: if (a[2]) ! 712: { ! 713: xor_8kt64(2); ! 714: } ! 715: xor_8kt64(3); ! 716: xor_8kt64(4); ! 717: xor_8kt64(5); ! 718: xor_8kt64(6); ! 719: ! 720: move_block_aligned(p, r); ! 721: } ! 722: ! 723: #define xor_8k64(i) \ ! 724: xor_block_aligned64(r, ctx->gf_t64[i + i][a[i] & 15]); \ ! 725: xor_block_aligned64(r, ctx->gf_t64[i + i + 1][a[i] >> 4]) ! 726: ! 727: /* Multiply two 64-bit numbers in the finite field GF(2^64) */ ! 728: void Gf64MulTab (unsigned char a[8], unsigned char p[8], GfCtx *ctx) ! 729: { ! 730: unsigned __int32 r[CBLK_LEN8 >> 2]; ! 731: ! 732: move_block_aligned64(r, ctx->gf_t64[7*2][a[7] & 15]); ! 733: xor_block_aligned64(r, ctx->gf_t64[7*2+1][a[7] >> 4]); ! 734: ! 735: if (*(unsigned __int16 *)a) ! 736: { ! 737: xor_8k64(0); ! 738: xor_8k64(1); ! 739: } ! 740: if (a[2]) ! 741: { ! 742: xor_8k64(2); ! 743: } ! 744: xor_8k64(3); ! 745: xor_8k64(4); ! 746: xor_8k64(5); ! 747: xor_8k64(6); ! 748: ! 749: move_block_aligned64(p, r); ! 750: } ! 751: ! 752: ! 753: /* Basic algorithms for testing of optimized algorithms */ ! 754: ! 755: static void xor128 (unsigned __int64 *a, unsigned __int64 *b) ! 756: { ! 757: *a++ ^= *b++; ! 758: *a ^= *b; ! 759: } ! 760: ! 761: static void shl128 (unsigned __int8 *a) ! 762: { ! 763: int i, x = 0, xx; ! 764: for (i = 15; i >= 0; i--) ! 765: { ! 766: xx = (a[i] & 0x80) >> 7; ! 767: a[i] = (a[i] << 1) | x; ! 768: x = xx; ! 769: } ! 770: } ! 771: ! 772: static void GfMul128Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8 *p) ! 773: { ! 774: int i; ! 775: unsigned __int8 la[16]; ! 776: memcpy (la, a, 16); ! 777: memset (p, 0, 16); ! 778: ! 779: for (i = 0; i < 128; i++) ! 780: { ! 781: if (IsBitSet128 (i, b)) ! 782: xor128 ((unsigned __int64 *)p, (unsigned __int64 *)la); ! 783: ! 784: if (la[0] & 0x80) ! 785: { ! 786: shl128 (la); ! 787: la[15] ^= 0x87; ! 788: } ! 789: else ! 790: { ! 791: shl128 (la); ! 792: } ! 793: } ! 794: } ! 795: ! 796: static void xor64 (unsigned __int64 *a, unsigned __int64 *b) ! 797: { ! 798: *a ^= *b; ! 799: } ! 800: ! 801: static void shl64 (unsigned __int8 *a) ! 802: { ! 803: int i, x = 0, xx; ! 804: for (i = 7; i >= 0; i--) ! 805: { ! 806: xx = (a[i] & 0x80) >> 7; ! 807: a[i] = (a[i] << 1) | x; ! 808: x = xx; ! 809: } ! 810: } ! 811: ! 812: static void GfMul64Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8* p) ! 813: { ! 814: int i; ! 815: unsigned __int8 la[8]; ! 816: memcpy (la, a, 8); ! 817: memset (p, 0, 8); ! 818: ! 819: for (i = 0; i < 64; i++) ! 820: { ! 821: if (IsBitSet64 (i, b)) ! 822: xor64 ((unsigned __int64 *)p, (unsigned __int64 *)la); ! 823: ! 824: if (la[0] & 0x80) ! 825: { ! 826: shl64 (la); ! 827: la[7] ^= 0x1b; ! 828: } ! 829: else ! 830: { ! 831: shl64 (la); ! 832: } ! 833: } ! 834: } ! 835: ! 836: ! 837: BOOL GfMulSelfTest () ! 838: { ! 839: BOOL result = TRUE; ! 840: unsigned __int8 a[16]; ! 841: unsigned __int8 b[16]; ! 842: unsigned __int8 p1[16]; ! 843: unsigned __int8 p2[16]; ! 844: GfCtx *gfCtx = (GfCtx *) TCalloc (sizeof (GfCtx)); ! 845: int i, j; ! 846: ! 847: if (!gfCtx) ! 848: return FALSE; ! 849: ! 850: /* GF(2^64) */ ! 851: for (i = 0; i < 0x100; i++) ! 852: { ! 853: for (j = 0; j < 8; j++) ! 854: { ! 855: a[j] = (unsigned __int8) i; ! 856: b[j] = a[j] ^ 0xff; ! 857: } ! 858: ! 859: GfMul64Basic (a, b, p1); ! 860: ! 861: Gf64TabInit (a, gfCtx); ! 862: Gf64MulTab (b, p2, gfCtx); ! 863: ! 864: if (memcmp (p1, p2, 8) != 0) ! 865: result = FALSE; ! 866: } ! 867: ! 868: /* GF(2^128) */ ! 869: for (i = 0; i < 0x100; i++) ! 870: { ! 871: for (j = 0; j < 16; j++) ! 872: { ! 873: a[j] = (unsigned __int8) i; ! 874: b[j] = j < 8 ? 0 : a[j] ^ 0xff; ! 875: } ! 876: ! 877: GfMul128Basic (a, b, p1); ! 878: ! 879: Gf128Tab64Init (a, gfCtx); ! 880: Gf128MulBy64Tab (b + 8, p2, gfCtx); ! 881: ! 882: if (memcmp (p1, p2, 16) != 0) ! 883: result = FALSE; ! 884: } ! 885: ! 886: TCfree (gfCtx); ! 887: return result; ! 888: } ! 889: ! 890: #if defined(__cplusplus) ! 891: } ! 892: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.