truecrypt/common/gfmul.c - annotate

Return to gfmul.c CVS log
Up to [truecrypt] / truecrypt / common
Annotation of truecrypt/common/gfmul.c, revision 1.1

1.1     ! root        1: /*
        !             2:  ---------------------------------------------------------------------------
        !             3:  Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
        !             4: 
        !             5:  LICENSE TERMS
        !             6: 
        !             7:  The free distribution and use of this software in both source and binary
        !             8:  form is allowed (with or without changes) provided that:
        !             9: 
        !            10:    1. distributions of this source code include the above copyright
        !            11:       notice, this list of conditions and the following disclaimer;
        !            12: 
        !            13:    2. distributions in binary form include the above copyright
        !            14:       notice, this list of conditions and the following disclaimer
        !            15:       in the documentation and/or other associated materials;
        !            16: 
        !            17:    3. the copyright holder's name is not used to endorse products
        !            18:       built using this software without specific written permission.
        !            19: 
        !            20:  ALTERNATIVELY, provided that this notice is retained in full, this product
        !            21:  may be distributed under the terms of the GNU General Public License (GPL),
        !            22:  in which case the provisions of the GPL apply INSTEAD OF those given above.
        !            23: 
        !            24:  DISCLAIMER
        !            25: 
        !            26:  This software is provided 'as is' with no explicit or implied warranties
        !            27:  in respect of its properties, including, but not limited to, correctness
        !            28:  and/or fitness for purpose.
        !            29:  ---------------------------------------------------------------------------
        !            30:  Issue Date: 31/01/2004
        !            31: 
        !            32:  My thanks to John Viega and David McGrew for their support in developing 
        !            33:  this code and to David for testing it on a big-endian system.
        !            34: */
        !            35: 
        !            36: /* 
        !            37:  TrueCrypt Foundation made the following changes:
        !            38: 
        !            39:    - Added multiplication in the finite field GF(2^128) optimized for
        !            40:      cases involving a 64-bit operand.
        !            41: 
        !            42:    - Added multiplication in the finite field GF(2^64).
        !            43: 
        !            44:    - Added MSB-first mode.
        !            45: 
        !            46:    - Removed GCM.
        !            47: */
        !            48: 
        !            49: #ifdef LINUX_DRIVER
        !            50: #include <linux/module.h>
        !            51: #include <linux/string.h>
        !            52: #else
        !            53: #include <memory.h>
        !            54: #endif
        !            55: 
        !            56: #include "GfMul.h"
        !            57: #include "Tcdefs.h"
        !            58: #include "Endian.h"
        !            59: 
        !            60: /* BUFFER_ALIGN32 or BUFFER_ALIGN64 must be defined at this point to    */
        !            61: /* enable faster operation by taking advantage of memory aligned values */
        !            62: /* NOTE: the BUFFER_ALIGN64 option has not been tested extensively      */
        !            63: 
        !            64: #define BUFFER_ALIGN32
        !            65: #define UNROLL_LOOPS    /* define to unroll some loops      */
        !            66: #define IN_LINES        /* define to use inline functions   */
        !            67:                         /* in place of macros               */
        !            68: 
        !            69: #define mode(x)                        GM_##x
        !            70: 
        !            71: #if defined(__cplusplus)
        !            72: extern "C"
        !            73: {
        !            74: #endif
        !            75: 
        !            76: typedef unsigned __int32 mode(32t);
        !            77: typedef unsigned __int64 mode(64t);
        !            78: 
        !            79: #define BRG_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
        !            80: #define BRG_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
        !            81: 
        !            82: #if BYTE_ORDER == LITTLE_ENDIAN
        !            83: #  define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN
        !            84: #endif
        !            85: 
        !            86: #if BYTE_ORDER == BIG_ENDIAN
        !            87: #  define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN
        !            88: #endif
        !            89: 
        !            90: #ifdef _MSC_VER
        !            91: #pragma intrinsic(memcpy)
        !            92: #define in_line __inline
        !            93: #else
        !            94: #define in_line
        !            95: #endif
        !            96: 
        !            97: #if 0 && defined(_MSC_VER)
        !            98: #define rotl32 _lrotl
        !            99: #define rotr32 _lrotr
        !           100: #else
        !           101: #define rotl32(x,n)   (((x) << n) | ((x) >> (32 - n)))
        !           102: #define rotr32(x,n)   (((x) >> n) | ((x) << (32 - n)))
        !           103: #endif
        !           104: 
        !           105: #if !defined(bswap_32)
        !           106: #define bswap_32(x) (rotr32((x), 24) & 0x00ff00ff | rotr32((x), 8) & 0xff00ff00)
        !           107: #endif
        !           108: 
        !           109: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
        !           110: #define SWAP_BYTES
        !           111: #else
        !           112: #undef  SWAP_BYTES
        !           113: #endif
        !           114: 
        !           115: #if defined(SWAP_BYTES)
        !           116: 
        !           117: #if defined ( IN_LINES )
        !           118: 
        !           119: in_line void bsw_32(void * p, unsigned int n)
        !           120: {   unsigned int i = n;
        !           121:     while(i--)
        !           122:         ((mode(32t)*)p)[i] = bswap_32(((mode(32t)*)p)[i]);
        !           123: }
        !           124: 
        !           125: #else
        !           126: 
        !           127: #define bsw_32(p,n) \
        !           128:     { int _i = (n); while(_i--) ((mode(32t)*)p)[_i] = bswap_32(((mode(32t)*)p)[_i]); }
        !           129: 
        !           130: #endif
        !           131: 
        !           132: #else
        !           133: #define bsw_32(p,n)
        !           134: #endif
        !           135: 
        !           136: /* These values are used to detect long word alignment in order */
        !           137: /* to speed up some GCM buffer operations. This facility may    */
        !           138: /* not work on some machines                                    */
        !           139: 
        !           140: #define lp08(x)      ((unsigned char*)(x))
        !           141: #define lp32(x)      ((mode(32t)*)(x))
        !           142: #define lp64(x)      ((mode(64t)*)(x))
        !           143: 
        !           144: #define A32_MASK     3
        !           145: #define A64_MASK     7
        !           146: #define aligned32(x) (!(((mode(32t))(x)) & A32_MASK))
        !           147: #define aligned64(x) (!(((mode(32t))(x)) & A64_MASK))
        !           148: 
        !           149: #if defined( BUFFER_ALIGN32 )
        !           150: 
        !           151: #define ADR_MASK    A32_MASK
        !           152: #define aligned     aligned32
        !           153: #define lp          lp32
        !           154: #define lp_inc      4
        !           155: 
        !           156: #if defined( IN_LINES )
        !           157: 
        !           158: in_line void move_block_aligned( void *p, const void *q)
        !           159: {
        !           160:     lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1],
        !           161:     lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3];
        !           162: }
        !           163: 
        !           164: in_line void move_block_aligned64( void *p, const void *q)
        !           165: {
        !           166:     lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1];
        !           167: }
        !           168: 
        !           169: in_line void xor_block_aligned( void *p, const void *q)
        !           170: {
        !           171:     lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1],
        !           172:     lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3];
        !           173: }
        !           174: 
        !           175: in_line void xor_block_aligned64( void *p, const void *q)
        !           176: {
        !           177:     lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1];
        !           178: }
        !           179: 
        !           180: #else
        !           181: 
        !           182: #define move_block_aligned(p,q) \
        !           183:     lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], \
        !           184:     lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3]
        !           185: 
        !           186: #define xor_block_aligned(p,q) \
        !           187:     lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], \
        !           188:     lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3]
        !           189: 
        !           190: #endif
        !           191: 
        !           192: #elif defined( BUFFER_ALIGN64 )
        !           193: 
        !           194: #define ADR_MASK    A64_MASK
        !           195: #define aligned     aligned64
        !           196: #define lp          lp64
        !           197: #define lp_inc      8
        !           198: 
        !           199: #define move_block_aligned(p,q) \
        !           200:     lp64(p)[0] = lp64(q)[0], lp64(p)[1] = lp64(q)[1]
        !           201: 
        !           202: #define xor_block_aligned(p,q) \
        !           203:     lp64(p)[0] ^= lp64(q)[0], lp64(p)[1] ^= lp64(q)[1]
        !           204: 
        !           205: #else
        !           206: #define aligned(x) 0
        !           207: #endif
        !           208: 
        !           209: #define move_block(p,q) memcpy((p), (q), BLOCK_LEN)
        !           210: 
        !           211: #define xor_block(p,q) \
        !           212:     lp08(p)[ 0] ^= lp08(q)[ 0], lp08(p)[ 1] ^= lp08(q)[ 1], \
        !           213:     lp08(p)[ 2] ^= lp08(q)[ 2], lp08(p)[ 3] ^= lp08(q)[ 3], \
        !           214:     lp08(p)[ 4] ^= lp08(q)[ 4], lp08(p)[ 5] ^= lp08(q)[ 5], \
        !           215:     lp08(p)[ 6] ^= lp08(q)[ 6], lp08(p)[ 7] ^= lp08(q)[ 7], \
        !           216:     lp08(p)[ 8] ^= lp08(q)[ 8], lp08(p)[ 9] ^= lp08(q)[ 9], \
        !           217:     lp08(p)[10] ^= lp08(q)[10], lp08(p)[11] ^= lp08(q)[11], \
        !           218:     lp08(p)[12] ^= lp08(q)[12], lp08(p)[13] ^= lp08(q)[13], \
        !           219:     lp08(p)[14] ^= lp08(q)[14], lp08(p)[15] ^= lp08(q)[15]
        !           220: 
        !           221: 
        !           222: #define gf_dat(q) {\
        !           223:     q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
        !           224:     q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
        !           225:     q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
        !           226:     q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
        !           227:     q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
        !           228:     q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
        !           229:     q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
        !           230:     q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
        !           231:     q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
        !           232:     q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
        !           233:     q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
        !           234:     q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
        !           235:     q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
        !           236:     q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
        !           237:     q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
        !           238:     q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
        !           239:     q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
        !           240:     q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
        !           241:     q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
        !           242:     q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
        !           243:     q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
        !           244:     q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
        !           245:     q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
        !           246:     q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
        !           247:     q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
        !           248:     q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
        !           249:     q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
        !           250:     q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
        !           251:     q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
        !           252:     q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
        !           253:     q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
        !           254:     q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) }
        !           255: 
        !           256: /* given the value i in 0..255 as the byte overflow when a field    */
        !           257: /* element in GHASH is multiplied by x^8, this function will return */
        !           258: /* the values that are generated in the lo 16-bit word of the field */
        !           259: /* value by applying the modular polynomial. The values lo_byte and */
        !           260: /* hi_byte are returned via the macro xp_fun(lo_byte, hi_byte) so   */
        !           261: /* that the values can be assembled into memory as required by a    */
        !           262: /* suitable definition of this macro operating on the table above   */
        !           263: 
        !           264: #define xp(i) xp_fun( \
        !           265:     (i & 0x80 ? 0xe1 : 0) ^ (i & 0x40 ? 0x70 : 0) ^ \
        !           266:     (i & 0x20 ? 0x38 : 0) ^ (i & 0x10 ? 0x1c : 0) ^ \
        !           267:     (i & 0x08 ? 0x0e : 0) ^ (i & 0x04 ? 0x07 : 0) ^ \
        !           268:     (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0),  \
        !           269:     (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x80 : 0) ^ \
        !           270:     (i & 0x20 ? 0x40 : 0) ^ (i & 0x10 ? 0x20 : 0) ^ \
        !           271:     (i & 0x08 ? 0x10 : 0) ^ (i & 0x04 ? 0x08 : 0) ^ \
        !           272:     (i & 0x02 ? 0x84 : 0) ^ (i & 0x01 ? 0xc2 : 0) )
        !           273: 
        !           274: #define xp64(i) xp_fun( \
        !           275:     (i & 0x80 ? 0xd8 : 0) ^ (i & 0x40 ? 0x6c : 0) ^ \
        !           276:     (i & 0x20 ? 0x36 : 0) ^ (i & 0x10 ? 0x1b : 0) ^ \
        !           277:     (i & 0x08 ? 0x0d : 0) ^ (i & 0x04 ? 0x06 : 0) ^ \
        !           278:     (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0),  \
        !           279:     (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x00 : 0) ^ \
        !           280:     (i & 0x20 ? 0x00 : 0) ^ (i & 0x10 ? 0x00 : 0) ^ \
        !           281:     (i & 0x08 ? 0x80 : 0) ^ (i & 0x04 ? 0xc0 : 0) ^ \
        !           282:     (i & 0x02 ? 0x60 : 0) ^ (i & 0x01 ? 0xb0 : 0) )
        !           283: 
        !           284: static mode(32t) gf_poly[2] = { 0, 0xe1000000 };
        !           285: static mode(32t) gf_poly64[2] = { 0, 0xd8000000 };
        !           286: 
        !           287: /* Multiply of a GF128 field element by x.   The field element  */
        !           288: /* is held in an array of bytes in which field bits 8n..8n + 7  */
        !           289: /* are held in byte[n], with lower indexed bits placed in the   */
        !           290: /* more numerically significant bit positions in bytes.         */
        !           291: 
        !           292: /* This function multiplies a field element by x in the polynomial */
        !           293: /* field representation. It uses 32-bit word operations to gain */
        !           294: /* speed but compensates for machine endianness and hence works */
        !           295: /* correctly on both styles of machine                          */
        !           296: 
        !           297: in_line void mul_x(mode(32t) x[4])
        !           298: {   mode(32t)   t;
        !           299: 
        !           300:     bsw_32(x, 4);
        !           301: 
        !           302:     /* at this point the filed element bits 0..127 are set out  */
        !           303:     /* as follows in 32-bit words (where the most significant   */
        !           304:     /* (ms) numeric bits are to the left)                       */
        !           305:     /*                                                          */
        !           306:     /*            x[0]      x[1]      x[2]      x[3]            */
        !           307:     /*          ms    ls  ms    ls  ms    ls  ms     ls         */
        !           308:     /* field:   0 ... 31  32 .. 63  64 .. 95  96 .. 127         */
        !           309: 
        !           310:     t = gf_poly[x[3] & 1];          /* bit 127 of the element   */
        !           311:     x[3] = (x[3] >> 1) | (x[2] << 31);  /* shift bits up by one */
        !           312:     x[2] = (x[2] >> 1) | (x[1] << 31);  /* position             */
        !           313:     x[1] = (x[1] >> 1) | (x[0] << 31);  /* if bit 7 is 1 xor in */
        !           314:     x[0] = (x[0] >> 1) ^ t;             /* the field polynomial */
        !           315:     bsw_32(x, 4);
        !           316: }
        !           317: 
        !           318: in_line void mul_x64(mode(32t) x[2])
        !           319: {   mode(32t)   t;
        !           320: 
        !           321:     bsw_32(x, 2);
        !           322: 
        !           323:     /* at this point the filed element bits 0..127 are set out  */
        !           324:     /* as follows in 32-bit words (where the most significant   */
        !           325:     /* (ms) numeric bits are to the left)                       */
        !           326:     /*                                                          */
        !           327:     /*            x[0]      x[1]      x[2]      x[3]            */
        !           328:     /*          ms    ls  ms    ls  ms    ls  ms     ls         */
        !           329:     /* field:   0 ... 31  32 .. 63  64 .. 95  96 .. 127         */
        !           330: 
        !           331:     t = gf_poly64[x[1] & 1];          /* bit 127 of the element   */
        !           332:                                                                                /* shift bits up by one */
        !           333:                                                                                /* position             */
        !           334:     x[1] = (x[1] >> 1) | (x[0] << 31);  /* if bit 7 is 1 xor in */
        !           335:     x[0] = (x[0] >> 1) ^ t;             /* the field polynomial */
        !           336:     bsw_32(x, 2);
        !           337: }
        !           338: 
        !           339: /* Multiply of a GF128 field element by x^8 using 32-bit words  */
        !           340: /* for speed - machine endianness matters here                  */
        !           341: 
        !           342: #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
        !           343: 
        !           344: #define xp_fun(x,y)    ((mode(32t))(x)) | (((mode(32t))(y)) << 8)
        !           345: static const unsigned __int16 gft_le[256] = gf_dat(xp);
        !           346: static const unsigned __int16 gft_le64[256] = gf_dat(xp64);
        !           347: 
        !           348: in_line void mul_lex8(mode(32t) x[4])   /* mutiply with long words  */
        !           349: {   mode(32t)   t = (x[3] >> 24);       /* in little endian format  */
        !           350:     x[3] = (x[3] << 8) | (x[2] >> 24);
        !           351:     x[2] = (x[2] << 8) | (x[1] >> 24);
        !           352:     x[1] = (x[1] << 8) | (x[0] >> 24);
        !           353:     x[0] = (x[0] << 8) ^ gft_le[t];
        !           354: }
        !           355: 
        !           356: in_line void mul_lex8_64(mode(32t) x[2])   /* mutiply with long words  */
        !           357: {   mode(32t)   t = (x[1] >> 24);       /* in little endian format  */
        !           358:     x[1] = (x[1] << 8) | (x[0] >> 24);
        !           359:     x[0] = (x[0] << 8) ^ gft_le64[t];
        !           360: }
        !           361: 
        !           362: #endif
        !           363: 
        !           364: #if 1 || (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
        !           365: 
        !           366: #undef  xp_fun
        !           367: #define xp_fun(x,y)    ((mode(32t))(y)) | (((mode(32t))(x)) << 8)
        !           368: static const unsigned __int16 gft_be[256] = gf_dat(xp);
        !           369: static const unsigned __int16 gft_be64[256] = gf_dat(xp64);
        !           370: 
        !           371: in_line void mul_bex8(mode(32t) x[4])   /* mutiply with long words  */
        !           372: {   mode(32t)   t = (x[3] & 0xff);      /* in big endian format     */
        !           373:     x[3] = (x[3] >> 8) | (x[2] << 24);
        !           374:     x[2] = (x[2] >> 8) | (x[1] << 24);
        !           375:     x[1] = (x[1] >> 8) | (x[0] << 24);
        !           376:     x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be[t]) << 16);
        !           377: }
        !           378: 
        !           379: in_line void mul_bex8_64(mode(32t) x[2])   /* mutiply with long words  */
        !           380: {   mode(32t)   t = (x[1] & 0xff);      /* in big endian format     */
        !           381:     x[1] = (x[1] >> 8) | (x[0] << 24);
        !           382:     x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be64[t]) << 16);
        !           383: }
        !           384: 
        !           385: #endif
        !           386: 
        !           387: /* hence choose the correct version for the machine endianness      */
        !           388: 
        !           389: #if PLATFORM_BYTE_ORDER == BRG_BIG_ENDIAN
        !           390: #define mul_x8  mul_bex8
        !           391: #define mul_x8_64  mul_bex8_64
        !           392: #else
        !           393: #define mul_x8  mul_lex8
        !           394: #define mul_x8_64  mul_lex8_64
        !           395: #endif
        !           396: 
        !           397: /* different versions of the general gf_mul function are provided   */
        !           398: /* here. Sadly none are very fast :-(                               */
        !           399: 
        !           400: void GfMul128 (void *a, const void* b)
        !           401: {   mode(32t) r[CBLK_LEN >> 2], p[8][CBLK_LEN >> 2];
        !           402:     int i;
        !           403: 
        !           404:     move_block_aligned(p[0], b);
        !           405:     bsw_32(p[0], 4);
        !           406:     for(i = 0; i < 7; ++i)
        !           407:     {
        !           408:         p[i + 1][3] = (p[i][3] >> 1) | (p[i][2] << 31);
        !           409:         p[i + 1][2] = (p[i][2] >> 1) | (p[i][1] << 31);
        !           410:         p[i + 1][1] = (p[i][1] >> 1) | (p[i][0] << 31);
        !           411:         p[i + 1][0] = (p[i][0] >> 1) ^ gf_poly[p[i][3] & 1];
        !           412:     }
        !           413: 
        !           414:     memset(r, 0, CBLK_LEN);
        !           415:     for(i = 0; i < 16; ++i)
        !           416:     {
        !           417:         if(i) mul_bex8(r);  /* order is always big endian here */
        !           418: 
        !           419:         if(((unsigned char*)a)[15 - i] & 0x80)
        !           420:             xor_block_aligned(r, p[0]);
        !           421:         if(((unsigned char*)a)[15 - i] & 0x40)
        !           422:             xor_block_aligned(r, p[1]);
        !           423:         if(((unsigned char*)a)[15 - i] & 0x20)
        !           424:             xor_block_aligned(r, p[2]);
        !           425:         if(((unsigned char*)a)[15 - i] & 0x10)
        !           426:             xor_block_aligned(r, p[3]);
        !           427:         if(((unsigned char*)a)[15 - i] & 0x08)
        !           428:             xor_block_aligned(r, p[4]);
        !           429:         if(((unsigned char*)a)[15 - i] & 0x04)
        !           430:             xor_block_aligned(r, p[5]);
        !           431:         if(((unsigned char*)a)[15 - i] & 0x02)
        !           432:             xor_block_aligned(r, p[6]);
        !           433:         if(((unsigned char*)a)[15 - i] & 0x01)
        !           434:             xor_block_aligned(r, p[7]);
        !           435:     }
        !           436:     bsw_32(r, 4);
        !           437:     move_block_aligned(a, r);
        !           438: }
        !           439: 
        !           440: #if defined( UNROLL_LOOPS )
        !           441: 
        !           442: #define xor_8k(i)   \
        !           443:     xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); \
        !           444:     xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4])
        !           445: 
        !           446: 
        !           447: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
        !           448: {   unsigned __int32 r[CBLK_LEN >> 2];
        !           449: 
        !           450:     move_block_aligned(r, ctx->gf_t8k[0][a[0] & 15]);
        !           451:     xor_block_aligned(r, ctx->gf_t8k[1][a[0] >> 4]);
        !           452:                 xor_8k( 1); xor_8k( 2); xor_8k( 3);
        !           453:     xor_8k( 4); xor_8k( 5); xor_8k( 6); xor_8k( 7);
        !           454:     xor_8k( 8); xor_8k( 9); xor_8k(10); xor_8k(11);
        !           455:     xor_8k(12); xor_8k(13); xor_8k(14); xor_8k(15);
        !           456:     move_block_aligned(a, r);
        !           457: }
        !           458: 
        !           459: #else
        !           460: 
        !           461: void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
        !           462: {   unsigned __int32 r[CBLK_LEN >> 2], *p;
        !           463:     int i;
        !           464: 
        !           465:     p = ctx->gf_t8k[0][a[0] & 15];
        !           466:     memcpy(r, p, CBLK_LEN);
        !           467:     p = ctx->gf_t8k[1][a[0] >> 4];
        !           468:     xor_block_aligned(r, p);
        !           469:     for(i = 1; i < CBLK_LEN; ++i)
        !           470:     {
        !           471:         xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]);
        !           472:         xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]);
        !           473:     }
        !           474:     memcpy(a, r, CBLK_LEN);
        !           475: }
        !           476: 
        !           477: #endif
        !           478: 
        !           479: void compile_8k_table(unsigned __int8 *a, GfCtx8k *ctx)
        !           480: {   int i, j, k;
        !           481: 
        !           482:     memset(ctx->gf_t8k, 0, 32 * 16 * 16);
        !           483:     for(i = 0; i < 2 * CBLK_LEN; ++i)
        !           484:     {
        !           485:         if(i == 0)
        !           486:         {
        !           487:             memcpy(ctx->gf_t8k[1][8], a, CBLK_LEN);
        !           488:             for(j = 4; j > 0; j >>= 1)
        !           489:             {
        !           490:                 memcpy(ctx->gf_t8k[1][j], ctx->gf_t8k[1][j + j], CBLK_LEN);
        !           491:                 mul_x(ctx->gf_t8k[1][j]);
        !           492:             }
        !           493:             memcpy(ctx->gf_t8k[0][8], ctx->gf_t8k[1][1], CBLK_LEN);
        !           494:             mul_x(ctx->gf_t8k[0][8]);
        !           495:             for(j = 4; j > 0; j >>= 1)
        !           496:             {
        !           497:                 memcpy(ctx->gf_t8k[0][j], ctx->gf_t8k[0][j + j], CBLK_LEN);
        !           498:                 mul_x(ctx->gf_t8k[0][j]);
        !           499:             }
        !           500:         }
        !           501:         else if(i > 1)
        !           502:             for(j = 8; j > 0; j >>= 1)
        !           503:             {
        !           504:                 memcpy(ctx->gf_t8k[i][j], ctx->gf_t8k[i - 2][j], CBLK_LEN);
        !           505:                 mul_x8(ctx->gf_t8k[i][j]);
        !           506:             }
        !           507: 
        !           508:         for(j = 2; j < 16; j += j)
        !           509:         {
        !           510:             mode(32t) *pj = ctx->gf_t8k[i][j];
        !           511:             mode(32t) *pk = ctx->gf_t8k[i][1];
        !           512:             mode(32t) *pl = ctx->gf_t8k[i][j + 1];
        !           513: 
        !           514:             for(k = 1; k < j; ++k)
        !           515:             {
        !           516:                 *pl++ = pj[0] ^ *pk++;
        !           517:                 *pl++ = pj[1] ^ *pk++;
        !           518:                 *pl++ = pj[2] ^ *pk++;
        !           519:                 *pl++ = pj[3] ^ *pk++;
        !           520:             }
        !           521:         }
        !           522:     }
        !           523: }
        !           524: 
        !           525: 
        !           526: void compile_4k_table64(unsigned __int8 *a, GfCtx4k64 *ctx)
        !           527: {   int i, j, k;
        !           528: 
        !           529:     memset(ctx->gf_t4k, 0, sizeof(ctx->gf_t4k));
        !           530:     for(i = 0; i < 2 * CBLK_LEN8; ++i)
        !           531:     {
        !           532:         if(i == 0)
        !           533:         {
        !           534:             memcpy(ctx->gf_t4k[1][8], a, CBLK_LEN8);
        !           535:             for(j = 4; j > 0; j >>= 1)
        !           536:             {
        !           537:                 memcpy(ctx->gf_t4k[1][j], ctx->gf_t4k[1][j + j], CBLK_LEN8);
        !           538:                 mul_x64(ctx->gf_t4k[1][j]);
        !           539:             }
        !           540:             memcpy(ctx->gf_t4k[0][8], ctx->gf_t4k[1][1], CBLK_LEN8);
        !           541:             mul_x64(ctx->gf_t4k[0][8]);
        !           542:             for(j = 4; j > 0; j >>= 1)
        !           543:             {
        !           544:                 memcpy(ctx->gf_t4k[0][j], ctx->gf_t4k[0][j + j], CBLK_LEN8);
        !           545:                 mul_x64(ctx->gf_t4k[0][j]);
        !           546:             }
        !           547:         }
        !           548:         else if(i > 1)
        !           549:             for(j = 8; j > 0; j >>= 1)
        !           550:             {
        !           551:                 memcpy(ctx->gf_t4k[i][j], ctx->gf_t4k[i - 2][j], CBLK_LEN8);
        !           552:                 mul_x8_64(ctx->gf_t4k[i][j]);
        !           553:             }
        !           554: 
        !           555:         for(j = 2; j < 16; j += j)
        !           556:         {
        !           557:             mode(32t) *pj = ctx->gf_t4k[i][j];
        !           558:             mode(32t) *pk = ctx->gf_t4k[i][1];
        !           559:             mode(32t) *pl = ctx->gf_t4k[i][j + 1];
        !           560: 
        !           561:             for(k = 1; k < j; ++k)
        !           562:             {
        !           563:                 *pl++ = pj[0] ^ *pk++;
        !           564:                 *pl++ = pj[1] ^ *pk++;
        !           565:                 *pl++ = pj[2] ^ *pk++;
        !           566:                 *pl++ = pj[3] ^ *pk++;
        !           567:             }
        !           568:         }
        !           569:     }
        !           570: }
        !           571: 
        !           572: static int IsBitSet128 (unsigned int bit, unsigned __int8 *a)
        !           573: {
        !           574:        return a[(127 - bit) / 8] & (0x80 >> ((127 - bit) % 8));
        !           575: }
        !           576: 
        !           577: static int IsBitSet64 (unsigned int bit, unsigned __int8 *a)
        !           578: {
        !           579:        return a[(63 - bit) / 8] & (0x80 >> ((63 - bit) % 8));
        !           580: }
        !           581: 
        !           582: static void SetBit128 (unsigned int bit, unsigned __int8 *a)
        !           583: {
        !           584:        a[(127 - bit) / 8] |= 0x80 >> ((127 - bit) % 8);
        !           585: }
        !           586: 
        !           587: static void SetBit64 (unsigned int bit, unsigned __int8 *a)
        !           588: {
        !           589:        a[(63 - bit) / 8] |= 0x80 >> ((63 - bit) % 8);
        !           590: }
        !           591: 
        !           592: static void MirrorBits128 (unsigned __int8 *a)
        !           593: {
        !           594:        unsigned __int8 t[128 / 8];
        !           595:        int i;
        !           596:        memset (t,0,16);
        !           597:        for (i = 0; i < 128; i++)
        !           598:        {
        !           599:                if (IsBitSet128(i, a))
        !           600:                        SetBit128 (127 - i, t);
        !           601:        }
        !           602:        memcpy (a, t, sizeof (t));
        !           603:        burn (t,sizeof (t));
        !           604: }
        !           605: 
        !           606: void MirrorBits64 (unsigned __int8 *a)
        !           607: {
        !           608:        unsigned __int8 t[64 / 8];
        !           609:        int i;
        !           610:        memset (t,0,8);
        !           611:        for (i = 0; i < 64; i++)
        !           612:        {
        !           613:                if (IsBitSet64(i, a))
        !           614:                        SetBit64 (63 - i, t);
        !           615:        }
        !           616:        memcpy (a, t, sizeof (t));
        !           617:        burn (t,sizeof (t));
        !           618: }
        !           619: 
        !           620: /* Allocate and initialize speed optimization table
        !           621:    for multiplication by 64-bit operand in MSB-first mode */
        !           622: int Gf128Tab64Init (unsigned __int8 *a, GfCtx *ctx)
        !           623: {
        !           624:        GfCtx8k *ctx8k;
        !           625:        unsigned __int8 am[16];
        !           626:        int i, j;
        !           627: 
        !           628:        ctx8k = (GfCtx8k *) TCalloc (sizeof (GfCtx8k));
        !           629:        if (!ctx8k)
        !           630:                return FALSE;
        !           631: 
        !           632:        memcpy (am, a, 16);
        !           633:        MirrorBits128 (am);
        !           634:     compile_8k_table (am, ctx8k);
        !           635: 
        !           636:        /* Convert 8k LSB-first table to 4k MSB-first */
        !           637:        for (i = 16; i < 32; i++) 
        !           638:        {
        !           639:                for (j = 0; j < 16; j++) 
        !           640:                {
        !           641:                        int jm = 0;
        !           642:                        jm |= (j & 0x1) << 3;
        !           643:                        jm |= (j & 0x2) << 1;
        !           644:                        jm |= (j & 0x4) >> 1;
        !           645:                        jm |= (j & 0x8) >> 3;
        !           646: 
        !           647:                        memcpy (&ctx->gf_t128[i-16][jm], (unsigned char *)&ctx8k->gf_t8k[31-i][j], 16);
        !           648:                        MirrorBits128 ((unsigned char *)&ctx->gf_t128[i-16][jm]);
        !           649:                }
        !           650:        }
        !           651: 
        !           652:        burn (ctx8k ,sizeof (*ctx8k));
        !           653:        burn (am, sizeof (am));
        !           654:        TCfree (ctx8k);
        !           655:        return TRUE;
        !           656: }
        !           657: 
        !           658: int Gf64TabInit (unsigned __int8 *a, GfCtx *ctx)
        !           659: {
        !           660:        GfCtx4k64 *ctx4k;
        !           661:        unsigned __int8 am[8];
        !           662:        int i, j;
        !           663: 
        !           664:        ctx4k = (GfCtx4k64 *) TCalloc (sizeof (GfCtx4k64));
        !           665:        if (!ctx4k)
        !           666:                return FALSE;
        !           667: 
        !           668:        memcpy (am, a, 8);
        !           669:        MirrorBits64 (am);
        !           670:     compile_4k_table64 (am, ctx4k);
        !           671: 
        !           672:        /* Convert LSB-first table to MSB-first */
        !           673:        for (i = 0; i < 16; i++) 
        !           674:        {
        !           675:                for (j = 0; j < 16; j++) 
        !           676:                {
        !           677:                        int jm = 0;
        !           678:                        jm |= (j & 0x1) << 3;
        !           679:                        jm |= (j & 0x2) << 1;
        !           680:                        jm |= (j & 0x4) >> 1;
        !           681:                        jm |= (j & 0x8) >> 3;
        !           682: 
        !           683:                        memcpy (&ctx->gf_t64[i][jm], (unsigned char *)&ctx4k->gf_t4k[15-i][j], 8);
        !           684:                        MirrorBits64 ((unsigned char *)&ctx->gf_t64[i][jm]);
        !           685:                }
        !           686:        }
        !           687: 
        !           688:        burn (ctx4k,sizeof (*ctx4k));
        !           689:        burn (am, sizeof (am));
        !           690:        TCfree (ctx4k);
        !           691:        return TRUE;
        !           692: }
        !           693: 
        !           694: #define xor_8kt64(i)   \
        !           695:     xor_block_aligned(r, ctx->gf_t128[i + i][a[i] & 15]); \
        !           696:     xor_block_aligned(r, ctx->gf_t128[i + i + 1][a[i] >> 4])
        !           697: 
        !           698: /* Multiply a 128-bit number by a 64-bit number in the finite field GF(2^128) */
        !           699: void Gf128MulBy64Tab (unsigned __int8 a[8], unsigned __int8 p[16], GfCtx *ctx)
        !           700: {  
        !           701:        unsigned __int32 r[CBLK_LEN >> 2];
        !           702: 
        !           703:        move_block_aligned(r, ctx->gf_t128[7*2][a[7] & 15]);
        !           704:     xor_block_aligned(r,  ctx->gf_t128[7*2+1][a[7] >> 4]);
        !           705: 
        !           706:        if (*(unsigned __int16 *)a)
        !           707:        {
        !           708:                xor_8kt64(0);
        !           709:                xor_8kt64(1);
        !           710:        }
        !           711:        if (a[2])
        !           712:        {
        !           713:                xor_8kt64(2);
        !           714:        }
        !           715:        xor_8kt64(3);
        !           716:     xor_8kt64(4);
        !           717:        xor_8kt64(5);
        !           718:        xor_8kt64(6);
        !           719: 
        !           720:     move_block_aligned(p, r);
        !           721: }
        !           722: 
        !           723: #define xor_8k64(i)   \
        !           724:     xor_block_aligned64(r, ctx->gf_t64[i + i][a[i] & 15]); \
        !           725:     xor_block_aligned64(r, ctx->gf_t64[i + i + 1][a[i] >> 4])
        !           726: 
        !           727: /* Multiply two 64-bit numbers in the finite field GF(2^64) */
        !           728: void Gf64MulTab (unsigned char a[8], unsigned char p[8], GfCtx *ctx)
        !           729: {  
        !           730:        unsigned __int32 r[CBLK_LEN8 >> 2];
        !           731: 
        !           732:        move_block_aligned64(r, ctx->gf_t64[7*2][a[7] & 15]);
        !           733:     xor_block_aligned64(r,  ctx->gf_t64[7*2+1][a[7] >> 4]);
        !           734: 
        !           735:        if (*(unsigned __int16 *)a)
        !           736:        {
        !           737:                xor_8k64(0);
        !           738:                xor_8k64(1);
        !           739:        }
        !           740:        if (a[2])
        !           741:        {
        !           742:                xor_8k64(2);
        !           743:        }
        !           744:        xor_8k64(3);
        !           745:     xor_8k64(4);
        !           746:        xor_8k64(5);
        !           747:        xor_8k64(6);
        !           748: 
        !           749:     move_block_aligned64(p, r);
        !           750: }
        !           751: 
        !           752: 
        !           753: /* Basic algorithms for testing of optimized algorithms */
        !           754: 
        !           755: static void xor128 (unsigned __int64 *a, unsigned __int64 *b)
        !           756: {
        !           757:        *a++ ^= *b++;
        !           758:        *a ^= *b;
        !           759: }
        !           760: 
        !           761: static void shl128 (unsigned __int8 *a)
        !           762: {
        !           763:        int i, x = 0, xx;
        !           764:        for (i = 15; i >= 0; i--)
        !           765:        {
        !           766:                xx = (a[i] & 0x80) >> 7;
        !           767:                a[i] = (a[i] << 1) | x;
        !           768:                x = xx;
        !           769:        }
        !           770: }
        !           771: 
        !           772: static void GfMul128Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8 *p)
        !           773: {
        !           774:        int i;
        !           775:        unsigned __int8 la[16];
        !           776:        memcpy (la, a, 16);
        !           777:        memset (p, 0, 16);
        !           778: 
        !           779:        for (i = 0; i < 128; i++)
        !           780:        {
        !           781:                if (IsBitSet128 (i, b))
        !           782:                        xor128 ((unsigned __int64 *)p, (unsigned __int64 *)la);
        !           783: 
        !           784:                if (la[0] & 0x80)
        !           785:                {
        !           786:                        shl128 (la);
        !           787:                        la[15] ^= 0x87;
        !           788:                }
        !           789:                else
        !           790:                {
        !           791:                        shl128 (la);
        !           792:                }
        !           793:        }
        !           794: }
        !           795: 
        !           796: static void xor64 (unsigned __int64 *a, unsigned __int64 *b)
        !           797: {
        !           798:        *a ^= *b;
        !           799: }
        !           800: 
        !           801: static void shl64 (unsigned __int8 *a)
        !           802: {
        !           803:        int i, x = 0, xx;
        !           804:        for (i = 7; i >= 0; i--)
        !           805:        {
        !           806:                xx = (a[i] & 0x80) >> 7;
        !           807:                a[i] = (a[i] << 1) | x;
        !           808:                x = xx;
        !           809:        }
        !           810: }
        !           811: 
        !           812: static void GfMul64Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8* p)
        !           813: {
        !           814:        int i;
        !           815:        unsigned __int8 la[8];
        !           816:        memcpy (la, a, 8);
        !           817:        memset (p, 0, 8);
        !           818: 
        !           819:        for (i = 0; i < 64; i++)
        !           820:        {
        !           821:                if (IsBitSet64 (i, b))
        !           822:                        xor64 ((unsigned __int64 *)p, (unsigned __int64 *)la);
        !           823: 
        !           824:                if (la[0] & 0x80)
        !           825:                {
        !           826:                        shl64 (la);
        !           827:                        la[7] ^= 0x1b;
        !           828:                }
        !           829:                else
        !           830:                {
        !           831:                        shl64 (la);
        !           832:                }
        !           833:        }
        !           834: }
        !           835: 
        !           836: 
        !           837: BOOL GfMulSelfTest ()
        !           838: {
        !           839:        BOOL result = TRUE;
        !           840:        unsigned __int8 a[16];
        !           841:        unsigned __int8 b[16];
        !           842:        unsigned __int8 p1[16];
        !           843:        unsigned __int8 p2[16];
        !           844:        GfCtx *gfCtx = (GfCtx *) TCalloc (sizeof (GfCtx));
        !           845:        int i, j;
        !           846: 
        !           847:        if (!gfCtx)
        !           848:                return FALSE;
        !           849: 
        !           850:        /* GF(2^64) */
        !           851:        for (i = 0; i < 0x100; i++)
        !           852:        {
        !           853:                for (j = 0; j < 8; j++)
        !           854:                {
        !           855:                        a[j] = (unsigned __int8) i;
        !           856:                        b[j] = a[j] ^ 0xff;
        !           857:                }
        !           858: 
        !           859:                GfMul64Basic (a, b, p1);
        !           860:        
        !           861:                Gf64TabInit (a, gfCtx);
        !           862:                Gf64MulTab (b, p2, gfCtx);
        !           863: 
        !           864:                if (memcmp (p1, p2, 8) != 0)
        !           865:                        result = FALSE;
        !           866:        }
        !           867: 
        !           868:        /* GF(2^128) */
        !           869:        for (i = 0; i < 0x100; i++)
        !           870:        {
        !           871:                for (j = 0; j < 16; j++)
        !           872:                {
        !           873:                        a[j] = (unsigned __int8) i;
        !           874:                        b[j] = j < 8 ? 0 : a[j] ^ 0xff;
        !           875:                }
        !           876: 
        !           877:                GfMul128Basic (a, b, p1);
        !           878:        
        !           879:                Gf128Tab64Init (a, gfCtx);
        !           880:                Gf128MulBy64Tab (b + 8, p2, gfCtx);
        !           881: 
        !           882:                if (memcmp (p1, p2, 16) != 0)
        !           883:                        result = FALSE;
        !           884:        }
        !           885: 
        !           886:        TCfree (gfCtx);
        !           887:        return result;
        !           888: }
        !           889: 
        !           890: #if defined(__cplusplus)
        !           891: }
        !           892: #endif
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.