File:  [Qemu by Fabrice Bellard] / qemu / target-i386 / ops_sse.h
Revision 1.1.1.8 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 19:31:10 2018 UTC (3 years, 3 months ago) by root
Branches: qemu, MAIN
CVS tags: qemu1001, HEAD
qemu 1.0.1

    1: /*
    2:  *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
    3:  *
    4:  *  Copyright (c) 2005 Fabrice Bellard
    5:  *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
    6:  *
    7:  * This library is free software; you can redistribute it and/or
    8:  * modify it under the terms of the GNU Lesser General Public
    9:  * License as published by the Free Software Foundation; either
   10:  * version 2 of the License, or (at your option) any later version.
   11:  *
   12:  * This library is distributed in the hope that it will be useful,
   13:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   14:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   15:  * Lesser General Public License for more details.
   16:  *
   17:  * You should have received a copy of the GNU Lesser General Public
   18:  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
   19:  */
   20: #if SHIFT == 0
   21: #define Reg MMXReg
   22: #define XMM_ONLY(...)
   23: #define B(n) MMX_B(n)
   24: #define W(n) MMX_W(n)
   25: #define L(n) MMX_L(n)
   26: #define Q(n) q
   27: #define SUFFIX _mmx
   28: #else
   29: #define Reg XMMReg
   30: #define XMM_ONLY(...) __VA_ARGS__
   31: #define B(n) XMM_B(n)
   32: #define W(n) XMM_W(n)
   33: #define L(n) XMM_L(n)
   34: #define Q(n) XMM_Q(n)
   35: #define SUFFIX _xmm
   36: #endif
   37: 
   38: void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
   39: {
   40:     int shift;
   41: 
   42:     if (s->Q(0) > 15) {
   43:         d->Q(0) = 0;
   44: #if SHIFT == 1
   45:         d->Q(1) = 0;
   46: #endif
   47:     } else {
   48:         shift = s->B(0);
   49:         d->W(0) >>= shift;
   50:         d->W(1) >>= shift;
   51:         d->W(2) >>= shift;
   52:         d->W(3) >>= shift;
   53: #if SHIFT == 1
   54:         d->W(4) >>= shift;
   55:         d->W(5) >>= shift;
   56:         d->W(6) >>= shift;
   57:         d->W(7) >>= shift;
   58: #endif
   59:     }
   60: }
   61: 
   62: void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
   63: {
   64:     int shift;
   65: 
   66:     if (s->Q(0) > 15) {
   67:         shift = 15;
   68:     } else {
   69:         shift = s->B(0);
   70:     }
   71:     d->W(0) = (int16_t)d->W(0) >> shift;
   72:     d->W(1) = (int16_t)d->W(1) >> shift;
   73:     d->W(2) = (int16_t)d->W(2) >> shift;
   74:     d->W(3) = (int16_t)d->W(3) >> shift;
   75: #if SHIFT == 1
   76:     d->W(4) = (int16_t)d->W(4) >> shift;
   77:     d->W(5) = (int16_t)d->W(5) >> shift;
   78:     d->W(6) = (int16_t)d->W(6) >> shift;
   79:     d->W(7) = (int16_t)d->W(7) >> shift;
   80: #endif
   81: }
   82: 
   83: void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)
   84: {
   85:     int shift;
   86: 
   87:     if (s->Q(0) > 15) {
   88:         d->Q(0) = 0;
   89: #if SHIFT == 1
   90:         d->Q(1) = 0;
   91: #endif
   92:     } else {
   93:         shift = s->B(0);
   94:         d->W(0) <<= shift;
   95:         d->W(1) <<= shift;
   96:         d->W(2) <<= shift;
   97:         d->W(3) <<= shift;
   98: #if SHIFT == 1
   99:         d->W(4) <<= shift;
  100:         d->W(5) <<= shift;
  101:         d->W(6) <<= shift;
  102:         d->W(7) <<= shift;
  103: #endif
  104:     }
  105: }
  106: 
  107: void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
  108: {
  109:     int shift;
  110: 
  111:     if (s->Q(0) > 31) {
  112:         d->Q(0) = 0;
  113: #if SHIFT == 1
  114:         d->Q(1) = 0;
  115: #endif
  116:     } else {
  117:         shift = s->B(0);
  118:         d->L(0) >>= shift;
  119:         d->L(1) >>= shift;
  120: #if SHIFT == 1
  121:         d->L(2) >>= shift;
  122:         d->L(3) >>= shift;
  123: #endif
  124:     }
  125: }
  126: 
  127: void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
  128: {
  129:     int shift;
  130: 
  131:     if (s->Q(0) > 31) {
  132:         shift = 31;
  133:     } else {
  134:         shift = s->B(0);
  135:     }
  136:     d->L(0) = (int32_t)d->L(0) >> shift;
  137:     d->L(1) = (int32_t)d->L(1) >> shift;
  138: #if SHIFT == 1
  139:     d->L(2) = (int32_t)d->L(2) >> shift;
  140:     d->L(3) = (int32_t)d->L(3) >> shift;
  141: #endif
  142: }
  143: 
  144: void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)
  145: {
  146:     int shift;
  147: 
  148:     if (s->Q(0) > 31) {
  149:         d->Q(0) = 0;
  150: #if SHIFT == 1
  151:         d->Q(1) = 0;
  152: #endif
  153:     } else {
  154:         shift = s->B(0);
  155:         d->L(0) <<= shift;
  156:         d->L(1) <<= shift;
  157: #if SHIFT == 1
  158:         d->L(2) <<= shift;
  159:         d->L(3) <<= shift;
  160: #endif
  161:     }
  162: }
  163: 
  164: void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)
  165: {
  166:     int shift;
  167: 
  168:     if (s->Q(0) > 63) {
  169:         d->Q(0) = 0;
  170: #if SHIFT == 1
  171:         d->Q(1) = 0;
  172: #endif
  173:     } else {
  174:         shift = s->B(0);
  175:         d->Q(0) >>= shift;
  176: #if SHIFT == 1
  177:         d->Q(1) >>= shift;
  178: #endif
  179:     }
  180: }
  181: 
  182: void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
  183: {
  184:     int shift;
  185: 
  186:     if (s->Q(0) > 63) {
  187:         d->Q(0) = 0;
  188: #if SHIFT == 1
  189:         d->Q(1) = 0;
  190: #endif
  191:     } else {
  192:         shift = s->B(0);
  193:         d->Q(0) <<= shift;
  194: #if SHIFT == 1
  195:         d->Q(1) <<= shift;
  196: #endif
  197:     }
  198: }
  199: 
  200: #if SHIFT == 1
  201: void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
  202: {
  203:     int shift, i;
  204: 
  205:     shift = s->L(0);
  206:     if (shift > 16)
  207:         shift = 16;
  208:     for(i = 0; i < 16 - shift; i++)
  209:         d->B(i) = d->B(i + shift);
  210:     for(i = 16 - shift; i < 16; i++)
  211:         d->B(i) = 0;
  212: }
  213: 
  214: void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
  215: {
  216:     int shift, i;
  217: 
  218:     shift = s->L(0);
  219:     if (shift > 16)
  220:         shift = 16;
  221:     for(i = 15; i >= shift; i--)
  222:         d->B(i) = d->B(i - shift);
  223:     for(i = 0; i < shift; i++)
  224:         d->B(i) = 0;
  225: }
  226: #endif
  227: 
  228: #define SSE_HELPER_B(name, F)\
  229: void glue(name, SUFFIX) (Reg *d, Reg *s)\
  230: {\
  231:     d->B(0) = F(d->B(0), s->B(0));\
  232:     d->B(1) = F(d->B(1), s->B(1));\
  233:     d->B(2) = F(d->B(2), s->B(2));\
  234:     d->B(3) = F(d->B(3), s->B(3));\
  235:     d->B(4) = F(d->B(4), s->B(4));\
  236:     d->B(5) = F(d->B(5), s->B(5));\
  237:     d->B(6) = F(d->B(6), s->B(6));\
  238:     d->B(7) = F(d->B(7), s->B(7));\
  239:     XMM_ONLY(\
  240:     d->B(8) = F(d->B(8), s->B(8));\
  241:     d->B(9) = F(d->B(9), s->B(9));\
  242:     d->B(10) = F(d->B(10), s->B(10));\
  243:     d->B(11) = F(d->B(11), s->B(11));\
  244:     d->B(12) = F(d->B(12), s->B(12));\
  245:     d->B(13) = F(d->B(13), s->B(13));\
  246:     d->B(14) = F(d->B(14), s->B(14));\
  247:     d->B(15) = F(d->B(15), s->B(15));\
  248:     )\
  249: }
  250: 
  251: #define SSE_HELPER_W(name, F)\
  252: void glue(name, SUFFIX) (Reg *d, Reg *s)\
  253: {\
  254:     d->W(0) = F(d->W(0), s->W(0));\
  255:     d->W(1) = F(d->W(1), s->W(1));\
  256:     d->W(2) = F(d->W(2), s->W(2));\
  257:     d->W(3) = F(d->W(3), s->W(3));\
  258:     XMM_ONLY(\
  259:     d->W(4) = F(d->W(4), s->W(4));\
  260:     d->W(5) = F(d->W(5), s->W(5));\
  261:     d->W(6) = F(d->W(6), s->W(6));\
  262:     d->W(7) = F(d->W(7), s->W(7));\
  263:     )\
  264: }
  265: 
  266: #define SSE_HELPER_L(name, F)\
  267: void glue(name, SUFFIX) (Reg *d, Reg *s)\
  268: {\
  269:     d->L(0) = F(d->L(0), s->L(0));\
  270:     d->L(1) = F(d->L(1), s->L(1));\
  271:     XMM_ONLY(\
  272:     d->L(2) = F(d->L(2), s->L(2));\
  273:     d->L(3) = F(d->L(3), s->L(3));\
  274:     )\
  275: }
  276: 
  277: #define SSE_HELPER_Q(name, F)\
  278: void glue(name, SUFFIX) (Reg *d, Reg *s)\
  279: {\
  280:     d->Q(0) = F(d->Q(0), s->Q(0));\
  281:     XMM_ONLY(\
  282:     d->Q(1) = F(d->Q(1), s->Q(1));\
  283:     )\
  284: }
  285: 
  286: #if SHIFT == 0
  287: static inline int satub(int x)
  288: {
  289:     if (x < 0)
  290:         return 0;
  291:     else if (x > 255)
  292:         return 255;
  293:     else
  294:         return x;
  295: }
  296: 
  297: static inline int satuw(int x)
  298: {
  299:     if (x < 0)
  300:         return 0;
  301:     else if (x > 65535)
  302:         return 65535;
  303:     else
  304:         return x;
  305: }
  306: 
  307: static inline int satsb(int x)
  308: {
  309:     if (x < -128)
  310:         return -128;
  311:     else if (x > 127)
  312:         return 127;
  313:     else
  314:         return x;
  315: }
  316: 
  317: static inline int satsw(int x)
  318: {
  319:     if (x < -32768)
  320:         return -32768;
  321:     else if (x > 32767)
  322:         return 32767;
  323:     else
  324:         return x;
  325: }
  326: 
  327: #define FADD(a, b) ((a) + (b))
  328: #define FADDUB(a, b) satub((a) + (b))
  329: #define FADDUW(a, b) satuw((a) + (b))
  330: #define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
  331: #define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))
  332: 
  333: #define FSUB(a, b) ((a) - (b))
  334: #define FSUBUB(a, b) satub((a) - (b))
  335: #define FSUBUW(a, b) satuw((a) - (b))
  336: #define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
  337: #define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
  338: #define FMINUB(a, b) ((a) < (b)) ? (a) : (b)
  339: #define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b)
  340: #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
  341: #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
  342: 
  343: #define FAND(a, b) (a) & (b)
  344: #define FANDN(a, b) ((~(a)) & (b))
  345: #define FOR(a, b) (a) | (b)
  346: #define FXOR(a, b) (a) ^ (b)
  347: 
  348: #define FCMPGTB(a, b) (int8_t)(a) > (int8_t)(b) ? -1 : 0
  349: #define FCMPGTW(a, b) (int16_t)(a) > (int16_t)(b) ? -1 : 0
  350: #define FCMPGTL(a, b) (int32_t)(a) > (int32_t)(b) ? -1 : 0
  351: #define FCMPEQ(a, b) (a) == (b) ? -1 : 0
  352: 
  353: #define FMULLW(a, b) (a) * (b)
  354: #define FMULHRW(a, b) ((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16
  355: #define FMULHUW(a, b) (a) * (b) >> 16
  356: #define FMULHW(a, b) (int16_t)(a) * (int16_t)(b) >> 16
  357: 
  358: #define FAVG(a, b) ((a) + (b) + 1) >> 1
  359: #endif
  360: 
  361: SSE_HELPER_B(helper_paddb, FADD)
  362: SSE_HELPER_W(helper_paddw, FADD)
  363: SSE_HELPER_L(helper_paddl, FADD)
  364: SSE_HELPER_Q(helper_paddq, FADD)
  365: 
  366: SSE_HELPER_B(helper_psubb, FSUB)
  367: SSE_HELPER_W(helper_psubw, FSUB)
  368: SSE_HELPER_L(helper_psubl, FSUB)
  369: SSE_HELPER_Q(helper_psubq, FSUB)
  370: 
  371: SSE_HELPER_B(helper_paddusb, FADDUB)
  372: SSE_HELPER_B(helper_paddsb, FADDSB)
  373: SSE_HELPER_B(helper_psubusb, FSUBUB)
  374: SSE_HELPER_B(helper_psubsb, FSUBSB)
  375: 
  376: SSE_HELPER_W(helper_paddusw, FADDUW)
  377: SSE_HELPER_W(helper_paddsw, FADDSW)
  378: SSE_HELPER_W(helper_psubusw, FSUBUW)
  379: SSE_HELPER_W(helper_psubsw, FSUBSW)
  380: 
  381: SSE_HELPER_B(helper_pminub, FMINUB)
  382: SSE_HELPER_B(helper_pmaxub, FMAXUB)
  383: 
  384: SSE_HELPER_W(helper_pminsw, FMINSW)
  385: SSE_HELPER_W(helper_pmaxsw, FMAXSW)
  386: 
  387: SSE_HELPER_Q(helper_pand, FAND)
  388: SSE_HELPER_Q(helper_pandn, FANDN)
  389: SSE_HELPER_Q(helper_por, FOR)
  390: SSE_HELPER_Q(helper_pxor, FXOR)
  391: 
  392: SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
  393: SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
  394: SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
  395: 
  396: SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
  397: SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
  398: SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
  399: 
  400: SSE_HELPER_W(helper_pmullw, FMULLW)
  401: #if SHIFT == 0
  402: SSE_HELPER_W(helper_pmulhrw, FMULHRW)
  403: #endif
  404: SSE_HELPER_W(helper_pmulhuw, FMULHUW)
  405: SSE_HELPER_W(helper_pmulhw, FMULHW)
  406: 
  407: SSE_HELPER_B(helper_pavgb, FAVG)
  408: SSE_HELPER_W(helper_pavgw, FAVG)
  409: 
  410: void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s)
  411: {
  412:     d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
  413: #if SHIFT == 1
  414:     d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
  415: #endif
  416: }
  417: 
  418: void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s)
  419: {
  420:     int i;
  421: 
  422:     for(i = 0; i < (2 << SHIFT); i++) {
  423:         d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) +
  424:             (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1);
  425:     }
  426: }
  427: 
  428: #if SHIFT == 0
  429: static inline int abs1(int a)
  430: {
  431:     if (a < 0)
  432:         return -a;
  433:     else
  434:         return a;
  435: }
  436: #endif
  437: void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s)
  438: {
  439:     unsigned int val;
  440: 
  441:     val = 0;
  442:     val += abs1(d->B(0) - s->B(0));
  443:     val += abs1(d->B(1) - s->B(1));
  444:     val += abs1(d->B(2) - s->B(2));
  445:     val += abs1(d->B(3) - s->B(3));
  446:     val += abs1(d->B(4) - s->B(4));
  447:     val += abs1(d->B(5) - s->B(5));
  448:     val += abs1(d->B(6) - s->B(6));
  449:     val += abs1(d->B(7) - s->B(7));
  450:     d->Q(0) = val;
  451: #if SHIFT == 1
  452:     val = 0;
  453:     val += abs1(d->B(8) - s->B(8));
  454:     val += abs1(d->B(9) - s->B(9));
  455:     val += abs1(d->B(10) - s->B(10));
  456:     val += abs1(d->B(11) - s->B(11));
  457:     val += abs1(d->B(12) - s->B(12));
  458:     val += abs1(d->B(13) - s->B(13));
  459:     val += abs1(d->B(14) - s->B(14));
  460:     val += abs1(d->B(15) - s->B(15));
  461:     d->Q(1) = val;
  462: #endif
  463: }
  464: 
  465: void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0)
  466: {
  467:     int i;
  468:     for(i = 0; i < (8 << SHIFT); i++) {
  469:         if (s->B(i) & 0x80)
  470:             stb(a0 + i, d->B(i));
  471:     }
  472: }
  473: 
  474: void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val)
  475: {
  476:     d->L(0) = val;
  477:     d->L(1) = 0;
  478: #if SHIFT == 1
  479:     d->Q(1) = 0;
  480: #endif
  481: }
  482: 
  483: #ifdef TARGET_X86_64
  484: void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val)
  485: {
  486:     d->Q(0) = val;
  487: #if SHIFT == 1
  488:     d->Q(1) = 0;
  489: #endif
  490: }
  491: #endif
  492: 
  493: #if SHIFT == 0
  494: void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order)
  495: {
  496:     Reg r;
  497:     r.W(0) = s->W(order & 3);
  498:     r.W(1) = s->W((order >> 2) & 3);
  499:     r.W(2) = s->W((order >> 4) & 3);
  500:     r.W(3) = s->W((order >> 6) & 3);
  501:     *d = r;
  502: }
  503: #else
  504: void helper_shufps(Reg *d, Reg *s, int order)
  505: {
  506:     Reg r;
  507:     r.L(0) = d->L(order & 3);
  508:     r.L(1) = d->L((order >> 2) & 3);
  509:     r.L(2) = s->L((order >> 4) & 3);
  510:     r.L(3) = s->L((order >> 6) & 3);
  511:     *d = r;
  512: }
  513: 
  514: void helper_shufpd(Reg *d, Reg *s, int order)
  515: {
  516:     Reg r;
  517:     r.Q(0) = d->Q(order & 1);
  518:     r.Q(1) = s->Q((order >> 1) & 1);
  519:     *d = r;
  520: }
  521: 
  522: void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order)
  523: {
  524:     Reg r;
  525:     r.L(0) = s->L(order & 3);
  526:     r.L(1) = s->L((order >> 2) & 3);
  527:     r.L(2) = s->L((order >> 4) & 3);
  528:     r.L(3) = s->L((order >> 6) & 3);
  529:     *d = r;
  530: }
  531: 
  532: void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order)
  533: {
  534:     Reg r;
  535:     r.W(0) = s->W(order & 3);
  536:     r.W(1) = s->W((order >> 2) & 3);
  537:     r.W(2) = s->W((order >> 4) & 3);
  538:     r.W(3) = s->W((order >> 6) & 3);
  539:     r.Q(1) = s->Q(1);
  540:     *d = r;
  541: }
  542: 
  543: void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order)
  544: {
  545:     Reg r;
  546:     r.Q(0) = s->Q(0);
  547:     r.W(4) = s->W(4 + (order & 3));
  548:     r.W(5) = s->W(4 + ((order >> 2) & 3));
  549:     r.W(6) = s->W(4 + ((order >> 4) & 3));
  550:     r.W(7) = s->W(4 + ((order >> 6) & 3));
  551:     *d = r;
  552: }
  553: #endif
  554: 
  555: #if SHIFT == 1
  556: /* FPU ops */
  557: /* XXX: not accurate */
  558: 
  559: #define SSE_HELPER_S(name, F)\
  560: void helper_ ## name ## ps (Reg *d, Reg *s)\
  561: {\
  562:     d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
  563:     d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
  564:     d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
  565:     d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
  566: }\
  567: \
  568: void helper_ ## name ## ss (Reg *d, Reg *s)\
  569: {\
  570:     d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
  571: }\
  572: void helper_ ## name ## pd (Reg *d, Reg *s)\
  573: {\
  574:     d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
  575:     d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
  576: }\
  577: \
  578: void helper_ ## name ## sd (Reg *d, Reg *s)\
  579: {\
  580:     d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
  581: }
  582: 
  583: #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
  584: #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
  585: #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
  586: #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
  587: #define FPU_MIN(size, a, b) (a) < (b) ? (a) : (b)
  588: #define FPU_MAX(size, a, b) (a) > (b) ? (a) : (b)
  589: #define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
  590: 
  591: SSE_HELPER_S(add, FPU_ADD)
  592: SSE_HELPER_S(sub, FPU_SUB)
  593: SSE_HELPER_S(mul, FPU_MUL)
  594: SSE_HELPER_S(div, FPU_DIV)
  595: SSE_HELPER_S(min, FPU_MIN)
  596: SSE_HELPER_S(max, FPU_MAX)
  597: SSE_HELPER_S(sqrt, FPU_SQRT)
  598: 
  599: 
  600: /* float to float conversions */
  601: void helper_cvtps2pd(Reg *d, Reg *s)
  602: {
  603:     float32 s0, s1;
  604:     s0 = s->XMM_S(0);
  605:     s1 = s->XMM_S(1);
  606:     d->XMM_D(0) = float32_to_float64(s0, &env->sse_status);
  607:     d->XMM_D(1) = float32_to_float64(s1, &env->sse_status);
  608: }
  609: 
  610: void helper_cvtpd2ps(Reg *d, Reg *s)
  611: {
  612:     d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
  613:     d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status);
  614:     d->Q(1) = 0;
  615: }
  616: 
  617: void helper_cvtss2sd(Reg *d, Reg *s)
  618: {
  619:     d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status);
  620: }
  621: 
  622: void helper_cvtsd2ss(Reg *d, Reg *s)
  623: {
  624:     d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
  625: }
  626: 
  627: /* integer to float */
  628: void helper_cvtdq2ps(Reg *d, Reg *s)
  629: {
  630:     d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status);
  631:     d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status);
  632:     d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status);
  633:     d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status);
  634: }
  635: 
  636: void helper_cvtdq2pd(Reg *d, Reg *s)
  637: {
  638:     int32_t l0, l1;
  639:     l0 = (int32_t)s->XMM_L(0);
  640:     l1 = (int32_t)s->XMM_L(1);
  641:     d->XMM_D(0) = int32_to_float64(l0, &env->sse_status);
  642:     d->XMM_D(1) = int32_to_float64(l1, &env->sse_status);
  643: }
  644: 
  645: void helper_cvtpi2ps(XMMReg *d, MMXReg *s)
  646: {
  647:     d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
  648:     d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
  649: }
  650: 
  651: void helper_cvtpi2pd(XMMReg *d, MMXReg *s)
  652: {
  653:     d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
  654:     d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
  655: }
  656: 
  657: void helper_cvtsi2ss(XMMReg *d, uint32_t val)
  658: {
  659:     d->XMM_S(0) = int32_to_float32(val, &env->sse_status);
  660: }
  661: 
  662: void helper_cvtsi2sd(XMMReg *d, uint32_t val)
  663: {
  664:     d->XMM_D(0) = int32_to_float64(val, &env->sse_status);
  665: }
  666: 
  667: #ifdef TARGET_X86_64
  668: void helper_cvtsq2ss(XMMReg *d, uint64_t val)
  669: {
  670:     d->XMM_S(0) = int64_to_float32(val, &env->sse_status);
  671: }
  672: 
  673: void helper_cvtsq2sd(XMMReg *d, uint64_t val)
  674: {
  675:     d->XMM_D(0) = int64_to_float64(val, &env->sse_status);
  676: }
  677: #endif
  678: 
  679: /* float to integer */
  680: void helper_cvtps2dq(XMMReg *d, XMMReg *s)
  681: {
  682:     d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
  683:     d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
  684:     d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status);
  685:     d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status);
  686: }
  687: 
  688: void helper_cvtpd2dq(XMMReg *d, XMMReg *s)
  689: {
  690:     d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
  691:     d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
  692:     d->XMM_Q(1) = 0;
  693: }
  694: 
  695: void helper_cvtps2pi(MMXReg *d, XMMReg *s)
  696: {
  697:     d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
  698:     d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
  699: }
  700: 
  701: void helper_cvtpd2pi(MMXReg *d, XMMReg *s)
  702: {
  703:     d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
  704:     d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
  705: }
  706: 
  707: int32_t helper_cvtss2si(XMMReg *s)
  708: {
  709:     return float32_to_int32(s->XMM_S(0), &env->sse_status);
  710: }
  711: 
  712: int32_t helper_cvtsd2si(XMMReg *s)
  713: {
  714:     return float64_to_int32(s->XMM_D(0), &env->sse_status);
  715: }
  716: 
  717: #ifdef TARGET_X86_64
  718: int64_t helper_cvtss2sq(XMMReg *s)
  719: {
  720:     return float32_to_int64(s->XMM_S(0), &env->sse_status);
  721: }
  722: 
  723: int64_t helper_cvtsd2sq(XMMReg *s)
  724: {
  725:     return float64_to_int64(s->XMM_D(0), &env->sse_status);
  726: }
  727: #endif
  728: 
  729: /* float to integer truncated */
  730: void helper_cvttps2dq(XMMReg *d, XMMReg *s)
  731: {
  732:     d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
  733:     d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
  734:     d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status);
  735:     d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status);
  736: }
  737: 
  738: void helper_cvttpd2dq(XMMReg *d, XMMReg *s)
  739: {
  740:     d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
  741:     d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
  742:     d->XMM_Q(1) = 0;
  743: }
  744: 
  745: void helper_cvttps2pi(MMXReg *d, XMMReg *s)
  746: {
  747:     d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
  748:     d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
  749: }
  750: 
  751: void helper_cvttpd2pi(MMXReg *d, XMMReg *s)
  752: {
  753:     d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
  754:     d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
  755: }
  756: 
  757: int32_t helper_cvttss2si(XMMReg *s)
  758: {
  759:     return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
  760: }
  761: 
  762: int32_t helper_cvttsd2si(XMMReg *s)
  763: {
  764:     return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
  765: }
  766: 
  767: #ifdef TARGET_X86_64
  768: int64_t helper_cvttss2sq(XMMReg *s)
  769: {
  770:     return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status);
  771: }
  772: 
  773: int64_t helper_cvttsd2sq(XMMReg *s)
  774: {
  775:     return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status);
  776: }
  777: #endif
  778: 
  779: void helper_rsqrtps(XMMReg *d, XMMReg *s)
  780: {
  781:     d->XMM_S(0) = float32_div(float32_one,
  782:                               float32_sqrt(s->XMM_S(0), &env->sse_status),
  783:                               &env->sse_status);
  784:     d->XMM_S(1) = float32_div(float32_one,
  785:                               float32_sqrt(s->XMM_S(1), &env->sse_status),
  786:                               &env->sse_status);
  787:     d->XMM_S(2) = float32_div(float32_one,
  788:                               float32_sqrt(s->XMM_S(2), &env->sse_status),
  789:                               &env->sse_status);
  790:     d->XMM_S(3) = float32_div(float32_one,
  791:                               float32_sqrt(s->XMM_S(3), &env->sse_status),
  792:                               &env->sse_status);
  793: }
  794: 
  795: void helper_rsqrtss(XMMReg *d, XMMReg *s)
  796: {
  797:     d->XMM_S(0) = float32_div(float32_one,
  798:                               float32_sqrt(s->XMM_S(0), &env->sse_status),
  799:                               &env->sse_status);
  800: }
  801: 
  802: void helper_rcpps(XMMReg *d, XMMReg *s)
  803: {
  804:     d->XMM_S(0) = float32_div(float32_one, s->XMM_S(0), &env->sse_status);
  805:     d->XMM_S(1) = float32_div(float32_one, s->XMM_S(1), &env->sse_status);
  806:     d->XMM_S(2) = float32_div(float32_one, s->XMM_S(2), &env->sse_status);
  807:     d->XMM_S(3) = float32_div(float32_one, s->XMM_S(3), &env->sse_status);
  808: }
  809: 
  810: void helper_rcpss(XMMReg *d, XMMReg *s)
  811: {
  812:     d->XMM_S(0) = float32_div(float32_one, s->XMM_S(0), &env->sse_status);
  813: }
  814: 
  815: static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
  816: {
  817:     uint64_t mask;
  818: 
  819:     if (len == 0) {
  820:         mask = ~0LL;
  821:     } else {
  822:         mask = (1ULL << len) - 1;
  823:     }
  824:     return (src >> shift) & mask;
  825: }
  826: 
  827: void helper_extrq_r(XMMReg *d, XMMReg *s)
  828: {
  829:     d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), s->XMM_B(1), s->XMM_B(0));
  830: }
  831: 
  832: void helper_extrq_i(XMMReg *d, int index, int length)
  833: {
  834:     d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), index, length);
  835: }
  836: 
  837: static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
  838: {
  839:     uint64_t mask;
  840: 
  841:     if (len == 0) {
  842:         mask = ~0ULL;
  843:     } else {
  844:         mask = (1ULL << len) - 1;
  845:     }
  846:     return (src & ~(mask << shift)) | ((src & mask) << shift);
  847: }
  848: 
  849: void helper_insertq_r(XMMReg *d, XMMReg *s)
  850: {
  851:     d->XMM_Q(0) = helper_insertq(s->XMM_Q(0), s->XMM_B(9), s->XMM_B(8));
  852: }
  853: 
  854: void helper_insertq_i(XMMReg *d, int index, int length)
  855: {
  856:     d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), index, length);
  857: }
  858: 
  859: void helper_haddps(XMMReg *d, XMMReg *s)
  860: {
  861:     XMMReg r;
  862:     r.XMM_S(0) = float32_add(d->XMM_S(0), d->XMM_S(1), &env->sse_status);
  863:     r.XMM_S(1) = float32_add(d->XMM_S(2), d->XMM_S(3), &env->sse_status);
  864:     r.XMM_S(2) = float32_add(s->XMM_S(0), s->XMM_S(1), &env->sse_status);
  865:     r.XMM_S(3) = float32_add(s->XMM_S(2), s->XMM_S(3), &env->sse_status);
  866:     *d = r;
  867: }
  868: 
  869: void helper_haddpd(XMMReg *d, XMMReg *s)
  870: {
  871:     XMMReg r;
  872:     r.XMM_D(0) = float64_add(d->XMM_D(0), d->XMM_D(1), &env->sse_status);
  873:     r.XMM_D(1) = float64_add(s->XMM_D(0), s->XMM_D(1), &env->sse_status);
  874:     *d = r;
  875: }
  876: 
  877: void helper_hsubps(XMMReg *d, XMMReg *s)
  878: {
  879:     XMMReg r;
  880:     r.XMM_S(0) = float32_sub(d->XMM_S(0), d->XMM_S(1), &env->sse_status);
  881:     r.XMM_S(1) = float32_sub(d->XMM_S(2), d->XMM_S(3), &env->sse_status);
  882:     r.XMM_S(2) = float32_sub(s->XMM_S(0), s->XMM_S(1), &env->sse_status);
  883:     r.XMM_S(3) = float32_sub(s->XMM_S(2), s->XMM_S(3), &env->sse_status);
  884:     *d = r;
  885: }
  886: 
  887: void helper_hsubpd(XMMReg *d, XMMReg *s)
  888: {
  889:     XMMReg r;
  890:     r.XMM_D(0) = float64_sub(d->XMM_D(0), d->XMM_D(1), &env->sse_status);
  891:     r.XMM_D(1) = float64_sub(s->XMM_D(0), s->XMM_D(1), &env->sse_status);
  892:     *d = r;
  893: }
  894: 
  895: void helper_addsubps(XMMReg *d, XMMReg *s)
  896: {
  897:     d->XMM_S(0) = float32_sub(d->XMM_S(0), s->XMM_S(0), &env->sse_status);
  898:     d->XMM_S(1) = float32_add(d->XMM_S(1), s->XMM_S(1), &env->sse_status);
  899:     d->XMM_S(2) = float32_sub(d->XMM_S(2), s->XMM_S(2), &env->sse_status);
  900:     d->XMM_S(3) = float32_add(d->XMM_S(3), s->XMM_S(3), &env->sse_status);
  901: }
  902: 
  903: void helper_addsubpd(XMMReg *d, XMMReg *s)
  904: {
  905:     d->XMM_D(0) = float64_sub(d->XMM_D(0), s->XMM_D(0), &env->sse_status);
  906:     d->XMM_D(1) = float64_add(d->XMM_D(1), s->XMM_D(1), &env->sse_status);
  907: }
  908: 
  909: /* XXX: unordered */
  910: #define SSE_HELPER_CMP(name, F)\
  911: void helper_ ## name ## ps (Reg *d, Reg *s)\
  912: {\
  913:     d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
  914:     d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
  915:     d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
  916:     d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
  917: }\
  918: \
  919: void helper_ ## name ## ss (Reg *d, Reg *s)\
  920: {\
  921:     d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
  922: }\
  923: void helper_ ## name ## pd (Reg *d, Reg *s)\
  924: {\
  925:     d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
  926:     d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
  927: }\
  928: \
  929: void helper_ ## name ## sd (Reg *d, Reg *s)\
  930: {\
  931:     d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
  932: }
  933: 
  934: #define FPU_CMPEQ(size, a, b) float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0
  935: #define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0
  936: #define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0
  937: #define FPU_CMPUNORD(size, a, b) float ## size ## _unordered_quiet(a, b, &env->sse_status) ? - 1 : 0
  938: #define FPU_CMPNEQ(size, a, b) float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1
  939: #define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1
  940: #define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1
  941: #define FPU_CMPORD(size, a, b) float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 0 : -1
  942: 
  943: SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
  944: SSE_HELPER_CMP(cmplt, FPU_CMPLT)
  945: SSE_HELPER_CMP(cmple, FPU_CMPLE)
  946: SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
  947: SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
  948: SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
  949: SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
  950: SSE_HELPER_CMP(cmpord, FPU_CMPORD)
  951: 
  952: static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
  953: 
  954: void helper_ucomiss(Reg *d, Reg *s)
  955: {
  956:     int ret;
  957:     float32 s0, s1;
  958: 
  959:     s0 = d->XMM_S(0);
  960:     s1 = s->XMM_S(0);
  961:     ret = float32_compare_quiet(s0, s1, &env->sse_status);
  962:     CC_SRC = comis_eflags[ret + 1];
  963: }
  964: 
  965: void helper_comiss(Reg *d, Reg *s)
  966: {
  967:     int ret;
  968:     float32 s0, s1;
  969: 
  970:     s0 = d->XMM_S(0);
  971:     s1 = s->XMM_S(0);
  972:     ret = float32_compare(s0, s1, &env->sse_status);
  973:     CC_SRC = comis_eflags[ret + 1];
  974: }
  975: 
  976: void helper_ucomisd(Reg *d, Reg *s)
  977: {
  978:     int ret;
  979:     float64 d0, d1;
  980: 
  981:     d0 = d->XMM_D(0);
  982:     d1 = s->XMM_D(0);
  983:     ret = float64_compare_quiet(d0, d1, &env->sse_status);
  984:     CC_SRC = comis_eflags[ret + 1];
  985: }
  986: 
  987: void helper_comisd(Reg *d, Reg *s)
  988: {
  989:     int ret;
  990:     float64 d0, d1;
  991: 
  992:     d0 = d->XMM_D(0);
  993:     d1 = s->XMM_D(0);
  994:     ret = float64_compare(d0, d1, &env->sse_status);
  995:     CC_SRC = comis_eflags[ret + 1];
  996: }
  997: 
  998: uint32_t helper_movmskps(Reg *s)
  999: {
 1000:     int b0, b1, b2, b3;
 1001:     b0 = s->XMM_L(0) >> 31;
 1002:     b1 = s->XMM_L(1) >> 31;
 1003:     b2 = s->XMM_L(2) >> 31;
 1004:     b3 = s->XMM_L(3) >> 31;
 1005:     return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
 1006: }
 1007: 
 1008: uint32_t helper_movmskpd(Reg *s)
 1009: {
 1010:     int b0, b1;
 1011:     b0 = s->XMM_L(1) >> 31;
 1012:     b1 = s->XMM_L(3) >> 31;
 1013:     return b0 | (b1 << 1);
 1014: }
 1015: 
 1016: #endif
 1017: 
 1018: uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
 1019: {
 1020:     uint32_t val;
 1021:     val = 0;
 1022:     val |= (s->B(0) >> 7);
 1023:     val |= (s->B(1) >> 6) & 0x02;
 1024:     val |= (s->B(2) >> 5) & 0x04;
 1025:     val |= (s->B(3) >> 4) & 0x08;
 1026:     val |= (s->B(4) >> 3) & 0x10;
 1027:     val |= (s->B(5) >> 2) & 0x20;
 1028:     val |= (s->B(6) >> 1) & 0x40;
 1029:     val |= (s->B(7)) & 0x80;
 1030: #if SHIFT == 1
 1031:     val |= (s->B(8) << 1) & 0x0100;
 1032:     val |= (s->B(9) << 2) & 0x0200;
 1033:     val |= (s->B(10) << 3) & 0x0400;
 1034:     val |= (s->B(11) << 4) & 0x0800;
 1035:     val |= (s->B(12) << 5) & 0x1000;
 1036:     val |= (s->B(13) << 6) & 0x2000;
 1037:     val |= (s->B(14) << 7) & 0x4000;
 1038:     val |= (s->B(15) << 8) & 0x8000;
 1039: #endif
 1040:     return val;
 1041: }
 1042: 
 1043: void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s)
 1044: {
 1045:     Reg r;
 1046: 
 1047:     r.B(0) = satsb((int16_t)d->W(0));
 1048:     r.B(1) = satsb((int16_t)d->W(1));
 1049:     r.B(2) = satsb((int16_t)d->W(2));
 1050:     r.B(3) = satsb((int16_t)d->W(3));
 1051: #if SHIFT == 1
 1052:     r.B(4) = satsb((int16_t)d->W(4));
 1053:     r.B(5) = satsb((int16_t)d->W(5));
 1054:     r.B(6) = satsb((int16_t)d->W(6));
 1055:     r.B(7) = satsb((int16_t)d->W(7));
 1056: #endif
 1057:     r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
 1058:     r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
 1059:     r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
 1060:     r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
 1061: #if SHIFT == 1
 1062:     r.B(12) = satsb((int16_t)s->W(4));
 1063:     r.B(13) = satsb((int16_t)s->W(5));
 1064:     r.B(14) = satsb((int16_t)s->W(6));
 1065:     r.B(15) = satsb((int16_t)s->W(7));
 1066: #endif
 1067:     *d = r;
 1068: }
 1069: 
 1070: void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s)
 1071: {
 1072:     Reg r;
 1073: 
 1074:     r.B(0) = satub((int16_t)d->W(0));
 1075:     r.B(1) = satub((int16_t)d->W(1));
 1076:     r.B(2) = satub((int16_t)d->W(2));
 1077:     r.B(3) = satub((int16_t)d->W(3));
 1078: #if SHIFT == 1
 1079:     r.B(4) = satub((int16_t)d->W(4));
 1080:     r.B(5) = satub((int16_t)d->W(5));
 1081:     r.B(6) = satub((int16_t)d->W(6));
 1082:     r.B(7) = satub((int16_t)d->W(7));
 1083: #endif
 1084:     r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
 1085:     r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
 1086:     r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
 1087:     r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
 1088: #if SHIFT == 1
 1089:     r.B(12) = satub((int16_t)s->W(4));
 1090:     r.B(13) = satub((int16_t)s->W(5));
 1091:     r.B(14) = satub((int16_t)s->W(6));
 1092:     r.B(15) = satub((int16_t)s->W(7));
 1093: #endif
 1094:     *d = r;
 1095: }
 1096: 
 1097: void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s)
 1098: {
 1099:     Reg r;
 1100: 
 1101:     r.W(0) = satsw(d->L(0));
 1102:     r.W(1) = satsw(d->L(1));
 1103: #if SHIFT == 1
 1104:     r.W(2) = satsw(d->L(2));
 1105:     r.W(3) = satsw(d->L(3));
 1106: #endif
 1107:     r.W((2 << SHIFT) + 0) = satsw(s->L(0));
 1108:     r.W((2 << SHIFT) + 1) = satsw(s->L(1));
 1109: #if SHIFT == 1
 1110:     r.W(6) = satsw(s->L(2));
 1111:     r.W(7) = satsw(s->L(3));
 1112: #endif
 1113:     *d = r;
 1114: }
 1115: 
 1116: #define UNPCK_OP(base_name, base)                               \
 1117:                                                                 \
 1118: void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s)   \
 1119: {                                                               \
 1120:     Reg r;                                              \
 1121:                                                                 \
 1122:     r.B(0) = d->B((base << (SHIFT + 2)) + 0);                   \
 1123:     r.B(1) = s->B((base << (SHIFT + 2)) + 0);                   \
 1124:     r.B(2) = d->B((base << (SHIFT + 2)) + 1);                   \
 1125:     r.B(3) = s->B((base << (SHIFT + 2)) + 1);                   \
 1126:     r.B(4) = d->B((base << (SHIFT + 2)) + 2);                   \
 1127:     r.B(5) = s->B((base << (SHIFT + 2)) + 2);                   \
 1128:     r.B(6) = d->B((base << (SHIFT + 2)) + 3);                   \
 1129:     r.B(7) = s->B((base << (SHIFT + 2)) + 3);                   \
 1130: XMM_ONLY(                                                       \
 1131:     r.B(8) = d->B((base << (SHIFT + 2)) + 4);                   \
 1132:     r.B(9) = s->B((base << (SHIFT + 2)) + 4);                   \
 1133:     r.B(10) = d->B((base << (SHIFT + 2)) + 5);                  \
 1134:     r.B(11) = s->B((base << (SHIFT + 2)) + 5);                  \
 1135:     r.B(12) = d->B((base << (SHIFT + 2)) + 6);                  \
 1136:     r.B(13) = s->B((base << (SHIFT + 2)) + 6);                  \
 1137:     r.B(14) = d->B((base << (SHIFT + 2)) + 7);                  \
 1138:     r.B(15) = s->B((base << (SHIFT + 2)) + 7);                  \
 1139: )                                                               \
 1140:     *d = r;                                                     \
 1141: }                                                               \
 1142:                                                                 \
 1143: void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s)   \
 1144: {                                                               \
 1145:     Reg r;                                              \
 1146:                                                                 \
 1147:     r.W(0) = d->W((base << (SHIFT + 1)) + 0);                   \
 1148:     r.W(1) = s->W((base << (SHIFT + 1)) + 0);                   \
 1149:     r.W(2) = d->W((base << (SHIFT + 1)) + 1);                   \
 1150:     r.W(3) = s->W((base << (SHIFT + 1)) + 1);                   \
 1151: XMM_ONLY(                                                       \
 1152:     r.W(4) = d->W((base << (SHIFT + 1)) + 2);                   \
 1153:     r.W(5) = s->W((base << (SHIFT + 1)) + 2);                   \
 1154:     r.W(6) = d->W((base << (SHIFT + 1)) + 3);                   \
 1155:     r.W(7) = s->W((base << (SHIFT + 1)) + 3);                   \
 1156: )                                                               \
 1157:     *d = r;                                                     \
 1158: }                                                               \
 1159:                                                                 \
 1160: void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s)   \
 1161: {                                                               \
 1162:     Reg r;                                              \
 1163:                                                                 \
 1164:     r.L(0) = d->L((base << SHIFT) + 0);                         \
 1165:     r.L(1) = s->L((base << SHIFT) + 0);                         \
 1166: XMM_ONLY(                                                       \
 1167:     r.L(2) = d->L((base << SHIFT) + 1);                         \
 1168:     r.L(3) = s->L((base << SHIFT) + 1);                         \
 1169: )                                                               \
 1170:     *d = r;                                                     \
 1171: }                                                               \
 1172:                                                                 \
 1173: XMM_ONLY(                                                       \
 1174: void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s)  \
 1175: {                                                               \
 1176:     Reg r;                                              \
 1177:                                                                 \
 1178:     r.Q(0) = d->Q(base);                                        \
 1179:     r.Q(1) = s->Q(base);                                        \
 1180:     *d = r;                                                     \
 1181: }                                                               \
 1182: )
 1183: 
 1184: UNPCK_OP(l, 0)
 1185: UNPCK_OP(h, 1)
 1186: 
 1187: /* 3DNow! float ops */
 1188: #if SHIFT == 0
 1189: void helper_pi2fd(MMXReg *d, MMXReg *s)
 1190: {
 1191:     d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
 1192:     d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
 1193: }
 1194: 
 1195: void helper_pi2fw(MMXReg *d, MMXReg *s)
 1196: {
 1197:     d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
 1198:     d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
 1199: }
 1200: 
 1201: void helper_pf2id(MMXReg *d, MMXReg *s)
 1202: {
 1203:     d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
 1204:     d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
 1205: }
 1206: 
 1207: void helper_pf2iw(MMXReg *d, MMXReg *s)
 1208: {
 1209:     d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status));
 1210:     d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status));
 1211: }
 1212: 
 1213: void helper_pfacc(MMXReg *d, MMXReg *s)
 1214: {
 1215:     MMXReg r;
 1216:     r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
 1217:     r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
 1218:     *d = r;
 1219: }
 1220: 
 1221: void helper_pfadd(MMXReg *d, MMXReg *s)
 1222: {
 1223:     d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
 1224:     d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
 1225: }
 1226: 
 1227: void helper_pfcmpeq(MMXReg *d, MMXReg *s)
 1228: {
 1229:     d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0), &env->mmx_status) ? -1 : 0;
 1230:     d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1), &env->mmx_status) ? -1 : 0;
 1231: }
 1232: 
 1233: void helper_pfcmpge(MMXReg *d, MMXReg *s)
 1234: {
 1235:     d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
 1236:     d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
 1237: }
 1238: 
 1239: void helper_pfcmpgt(MMXReg *d, MMXReg *s)
 1240: {
 1241:     d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
 1242:     d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
 1243: }
 1244: 
 1245: void helper_pfmax(MMXReg *d, MMXReg *s)
 1246: {
 1247:     if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status))
 1248:         d->MMX_S(0) = s->MMX_S(0);
 1249:     if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status))
 1250:         d->MMX_S(1) = s->MMX_S(1);
 1251: }
 1252: 
 1253: void helper_pfmin(MMXReg *d, MMXReg *s)
 1254: {
 1255:     if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status))
 1256:         d->MMX_S(0) = s->MMX_S(0);
 1257:     if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status))
 1258:         d->MMX_S(1) = s->MMX_S(1);
 1259: }
 1260: 
 1261: void helper_pfmul(MMXReg *d, MMXReg *s)
 1262: {
 1263:     d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
 1264:     d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
 1265: }
 1266: 
 1267: void helper_pfnacc(MMXReg *d, MMXReg *s)
 1268: {
 1269:     MMXReg r;
 1270:     r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
 1271:     r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
 1272:     *d = r;
 1273: }
 1274: 
 1275: void helper_pfpnacc(MMXReg *d, MMXReg *s)
 1276: {
 1277:     MMXReg r;
 1278:     r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
 1279:     r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
 1280:     *d = r;
 1281: }
 1282: 
 1283: void helper_pfrcp(MMXReg *d, MMXReg *s)
 1284: {
 1285:     d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
 1286:     d->MMX_S(1) = d->MMX_S(0);
 1287: }
 1288: 
 1289: void helper_pfrsqrt(MMXReg *d, MMXReg *s)
 1290: {
 1291:     d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
 1292:     d->MMX_S(1) = float32_div(float32_one,
 1293:                               float32_sqrt(d->MMX_S(1), &env->mmx_status),
 1294:                               &env->mmx_status);
 1295:     d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
 1296:     d->MMX_L(0) = d->MMX_L(1);
 1297: }
 1298: 
 1299: void helper_pfsub(MMXReg *d, MMXReg *s)
 1300: {
 1301:     d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
 1302:     d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
 1303: }
 1304: 
 1305: void helper_pfsubr(MMXReg *d, MMXReg *s)
 1306: {
 1307:     d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
 1308:     d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
 1309: }
 1310: 
 1311: void helper_pswapd(MMXReg *d, MMXReg *s)
 1312: {
 1313:     MMXReg r;
 1314:     r.MMX_L(0) = s->MMX_L(1);
 1315:     r.MMX_L(1) = s->MMX_L(0);
 1316:     *d = r;
 1317: }
 1318: #endif
 1319: 
 1320: /* SSSE3 op helpers */
 1321: void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s)
 1322: {
 1323:     int i;
 1324:     Reg r;
 1325: 
 1326:     for (i = 0; i < (8 << SHIFT); i++)
 1327:         r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
 1328: 
 1329:     *d = r;
 1330: }
 1331: 
 1332: void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s)
 1333: {
 1334:     d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
 1335:     d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
 1336:     XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
 1337:     XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
 1338:     d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
 1339:     d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
 1340:     XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
 1341:     XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
 1342: }
 1343: 
 1344: void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s)
 1345: {
 1346:     d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
 1347:     XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
 1348:     d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
 1349:     XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
 1350: }
 1351: 
 1352: void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s)
 1353: {
 1354:     d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
 1355:     d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
 1356:     XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
 1357:     XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
 1358:     d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
 1359:     d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
 1360:     XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
 1361:     XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
 1362: }
 1363: 
 1364: void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s)
 1365: {
 1366:     d->W(0) = satsw((int8_t)s->B( 0) * (uint8_t)d->B( 0) +
 1367:                     (int8_t)s->B( 1) * (uint8_t)d->B( 1));
 1368:     d->W(1) = satsw((int8_t)s->B( 2) * (uint8_t)d->B( 2) +
 1369:                     (int8_t)s->B( 3) * (uint8_t)d->B( 3));
 1370:     d->W(2) = satsw((int8_t)s->B( 4) * (uint8_t)d->B( 4) +
 1371:                     (int8_t)s->B( 5) * (uint8_t)d->B( 5));
 1372:     d->W(3) = satsw((int8_t)s->B( 6) * (uint8_t)d->B( 6) +
 1373:                     (int8_t)s->B( 7) * (uint8_t)d->B( 7));
 1374: #if SHIFT == 1
 1375:     d->W(4) = satsw((int8_t)s->B( 8) * (uint8_t)d->B( 8) +
 1376:                     (int8_t)s->B( 9) * (uint8_t)d->B( 9));
 1377:     d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
 1378:                     (int8_t)s->B(11) * (uint8_t)d->B(11));
 1379:     d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
 1380:                     (int8_t)s->B(13) * (uint8_t)d->B(13));
 1381:     d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
 1382:                     (int8_t)s->B(15) * (uint8_t)d->B(15));
 1383: #endif
 1384: }
 1385: 
 1386: void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s)
 1387: {
 1388:     d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
 1389:     d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
 1390:     XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
 1391:     XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
 1392:     d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
 1393:     d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
 1394:     XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
 1395:     XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
 1396: }
 1397: 
 1398: void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s)
 1399: {
 1400:     d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
 1401:     XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
 1402:     d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
 1403:     XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
 1404: }
 1405: 
 1406: void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s)
 1407: {
 1408:     d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
 1409:     d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
 1410:     XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
 1411:     XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
 1412:     d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
 1413:     d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
 1414:     XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
 1415:     XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
 1416: }
 1417: 
 1418: #define FABSB(_, x) x > INT8_MAX  ? -(int8_t ) x : x
 1419: #define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x
 1420: #define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x
/* Instantiate the pabsb/pabsw/pabsd helpers from the FABS* element ops. */
SSE_HELPER_B(helper_pabsb, FABSB)
SSE_HELPER_W(helper_pabsw, FABSW)
SSE_HELPER_L(helper_pabsd, FABSL)
 1424: 
 1425: #define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15
/* Instantiate the pmulhrsw helper over 16-bit elements. */
SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
 1427: 
 1428: #define FSIGNB(d, s) s <= INT8_MAX  ? s ? d : 0 : -(int8_t ) d
 1429: #define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d
 1430: #define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d
/* Instantiate the psignb/psignw/psignd helpers from the FSIGN* element ops. */
SSE_HELPER_B(helper_psignb, FSIGNB)
SSE_HELPER_W(helper_psignw, FSIGNW)
SSE_HELPER_L(helper_psignd, FSIGNL)
 1434: 
/*
 * PALIGNR: treat d:s as one double-width value (d above s), shift it right
 * by `shift` bytes and store the low register-width part of the result in d.
 * The result is built in a temporary before d is overwritten.
 */
void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
{
    Reg r;

    /* XXX could be checked during translation */
    /* A shift of at least twice the register width leaves nothing: zero. */
    if (shift >= (16 << SHIFT)) {
        r.Q(0) = 0;
        XMM_ONLY(r.Q(1) = 0);
    } else {
        /* Convert the byte count to a bit count. */
        shift <<= 3;
/*
 * SHR(v, i): v >> i for 0 < i < 64, v << -i for -64 < i <= 0, and 0 when
 * |i| >= 64, so every C shift stays within the 64-bit operand width.
 */
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
        r.Q(0) = SHR(s->Q(0), shift -   0) |
                 SHR(d->Q(0), shift -  64);
#else
        /* Each output quadword ORs together the contribution of all four
           input quadwords, each shifted relative to its own bit offset. */
        r.Q(0) = SHR(s->Q(0), shift -   0) |
                 SHR(s->Q(1), shift -  64) |
                 SHR(d->Q(0), shift - 128) |
                 SHR(d->Q(1), shift - 192);
        r.Q(1) = SHR(s->Q(0), shift +  64) |
                 SHR(s->Q(1), shift -   0) |
                 SHR(d->Q(0), shift -  64) |
                 SHR(d->Q(1), shift - 128);
#endif
#undef SHR
    }

    *d = r;
}
 1464: 
/* Register 0 of the XMM file; read as the implicit mask operand by the
   helpers generated with SSE_HELPER_V. */
#define XMM0 env->xmm_regs[0]
 1466: 
 1467: #if SHIFT == 1
 1468: #define SSE_HELPER_V(name, elem, num, F)\
 1469: void glue(name, SUFFIX) (Reg *d, Reg *s)\
 1470: {\
 1471:     d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));\
 1472:     d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));\
 1473:     if (num > 2) {\
 1474:         d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));\
 1475:         d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));\
 1476:         if (num > 4) {\
 1477:             d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));\
 1478:             d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));\
 1479:             d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));\
 1480:             d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));\
 1481:             if (num > 8) {\
 1482:                 d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8));\
 1483:                 d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9));\
 1484:                 d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10));\
 1485:                 d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11));\
 1486:                 d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12));\
 1487:                 d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13));\
 1488:                 d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14));\
 1489:                 d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15));\
 1490:             }\
 1491:         }\
 1492:     }\
 1493: }
 1494: 
 1495: #define SSE_HELPER_I(name, elem, num, F)\
 1496: void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
 1497: {\
 1498:     d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));\
 1499:     d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));\
 1500:     if (num > 2) {\
 1501:         d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));\
 1502:         d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));\
 1503:         if (num > 4) {\
 1504:             d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1));\
 1505:             d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1));\
 1506:             d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1));\
 1507:             d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1));\
 1508:             if (num > 8) {\
 1509:                 d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1));\
 1510:                 d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1));\
 1511:                 d->elem(10) = F(d->elem(10), s->elem(10), ((imm >> 10) & 1));\
 1512:                 d->elem(11) = F(d->elem(11), s->elem(11), ((imm >> 11) & 1));\
 1513:                 d->elem(12) = F(d->elem(12), s->elem(12), ((imm >> 12) & 1));\
 1514:                 d->elem(13) = F(d->elem(13), s->elem(13), ((imm >> 13) & 1));\
 1515:                 d->elem(14) = F(d->elem(14), s->elem(14), ((imm >> 14) & 1));\
 1516:                 d->elem(15) = F(d->elem(15), s->elem(15), ((imm >> 15) & 1));\
 1517:             }\
 1518:         }\
 1519:     }\
 1520: }
 1521: 
 1522: /* SSE4.1 op helpers */
 1523: #define FBLENDVB(d, s, m) (m & 0x80) ? s : d
 1524: #define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d
 1525: #define FBLENDVPD(d, s, m) (m & 0x8000000000000000LL) ? s : d
/* Variable blends over 16 bytes, 4 doublewords and 2 quadwords; the mask
   comes from XMM0 via SSE_HELPER_V. */
SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
 1529: 
 1530: void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s)
 1531: {
 1532:     uint64_t zf = (s->Q(0) &  d->Q(0)) | (s->Q(1) &  d->Q(1));
 1533:     uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
 1534: 
 1535:     CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
 1536: }
 1537: 
 1538: #define SSE_HELPER_F(name, elem, num, F)\
 1539: void glue(name, SUFFIX) (Reg *d, Reg *s)\
 1540: {\
 1541:     d->elem(0) = F(0);\
 1542:     d->elem(1) = F(1);\
 1543:     if (num > 2) {\
 1544:         d->elem(2) = F(2);\
 1545:         d->elem(3) = F(3);\
 1546:         if (num > 4) {\
 1547:             d->elem(4) = F(4);\
 1548:             d->elem(5) = F(5);\
 1549:             d->elem(6) = F(6);\
 1550:             d->elem(7) = F(7);\
 1551:         }\
 1552:     }\
 1553: }
 1554: 
/*
 * Widening moves.  The pmovsx* forms cast the source element to a signed
 * type before the store (sign extension); the pmovzx* forms store the
 * unsigned source element directly (zero extension).
 */
SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
 1567: 
 1568: void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s)
 1569: {
 1570:     d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0);
 1571:     d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2);
 1572: }
 1573: 
 1574: #define FCMPEQQ(d, s) d == s ? -1 : 0
/* Instantiate the pcmpeqq helper over 64-bit elements. */
SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
 1576: 
 1577: void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s)
 1578: {
 1579:     d->W(0) = satuw((int32_t) d->L(0));
 1580:     d->W(1) = satuw((int32_t) d->L(1));
 1581:     d->W(2) = satuw((int32_t) d->L(2));
 1582:     d->W(3) = satuw((int32_t) d->L(3));
 1583:     d->W(4) = satuw((int32_t) s->L(0));
 1584:     d->W(5) = satuw((int32_t) s->L(1));
 1585:     d->W(6) = satuw((int32_t) s->L(2));
 1586:     d->W(7) = satuw((int32_t) s->L(3));
 1587: }
 1588: 
 1589: #define FMINSB(d, s) MIN((int8_t) d, (int8_t) s)
 1590: #define FMINSD(d, s) MIN((int32_t) d, (int32_t) s)
 1591: #define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s)
 1592: #define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s)
/*
 * Min/max helpers.  The signed forms go through the casting FMIN/FMAX
 * element ops; the unsigned forms apply MIN/MAX to the elements directly.
 */
SSE_HELPER_B(helper_pminsb, FMINSB)
SSE_HELPER_L(helper_pminsd, FMINSD)
SSE_HELPER_W(helper_pminuw, MIN)
SSE_HELPER_L(helper_pminud, MIN)
SSE_HELPER_B(helper_pmaxsb, FMAXSB)
SSE_HELPER_L(helper_pmaxsd, FMAXSD)
SSE_HELPER_W(helper_pmaxuw, MAX)
SSE_HELPER_L(helper_pmaxud, MAX)
 1601: 
 1602: #define FMULLD(d, s) (int32_t) d * (int32_t) s
/* Instantiate the pmulld helper over 32-bit elements. */
SSE_HELPER_L(helper_pmulld, FMULLD)
 1604: 
 1605: void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s)
 1606: {
 1607:     int idx = 0;
 1608: 
 1609:     if (s->W(1) < s->W(idx))
 1610:         idx = 1;
 1611:     if (s->W(2) < s->W(idx))
 1612:         idx = 2;
 1613:     if (s->W(3) < s->W(idx))
 1614:         idx = 3;
 1615:     if (s->W(4) < s->W(idx))
 1616:         idx = 4;
 1617:     if (s->W(5) < s->W(idx))
 1618:         idx = 5;
 1619:     if (s->W(6) < s->W(idx))
 1620:         idx = 6;
 1621:     if (s->W(7) < s->W(idx))
 1622:         idx = 7;
 1623: 
 1624:     d->Q(1) = 0;
 1625:     d->L(1) = 0;
 1626:     d->W(1) = idx;
 1627:     d->W(0) = s->W(idx);
 1628: }
 1629: 
 1630: void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
 1631: {
 1632:     signed char prev_rounding_mode;
 1633: 
 1634:     prev_rounding_mode = env->sse_status.float_rounding_mode;
 1635:     if (!(mode & (1 << 2)))
 1636:         switch (mode & 3) {
 1637:         case 0:
 1638:             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
 1639:             break;
 1640:         case 1:
 1641:             set_float_rounding_mode(float_round_down, &env->sse_status);
 1642:             break;
 1643:         case 2:
 1644:             set_float_rounding_mode(float_round_up, &env->sse_status);
 1645:             break;
 1646:         case 3:
 1647:             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
 1648:             break;
 1649:         }
 1650: 
 1651:     d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
 1652:     d->L(1) = float64_round_to_int(s->L(1), &env->sse_status);
 1653:     d->L(2) = float64_round_to_int(s->L(2), &env->sse_status);
 1654:     d->L(3) = float64_round_to_int(s->L(3), &env->sse_status);
 1655: 
 1656: #if 0 /* TODO */
 1657:     if (mode & (1 << 3))
 1658:         set_float_exception_flags(
 1659:                         get_float_exception_flags(&env->sse_status) &
 1660:                         ~float_flag_inexact,
 1661:                         &env->sse_status);
 1662: #endif
 1663:     env->sse_status.float_rounding_mode = prev_rounding_mode;
 1664: }
 1665: 
 1666: void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
 1667: {
 1668:     signed char prev_rounding_mode;
 1669: 
 1670:     prev_rounding_mode = env->sse_status.float_rounding_mode;
 1671:     if (!(mode & (1 << 2)))
 1672:         switch (mode & 3) {
 1673:         case 0:
 1674:             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
 1675:             break;
 1676:         case 1:
 1677:             set_float_rounding_mode(float_round_down, &env->sse_status);
 1678:             break;
 1679:         case 2:
 1680:             set_float_rounding_mode(float_round_up, &env->sse_status);
 1681:             break;
 1682:         case 3:
 1683:             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
 1684:             break;
 1685:         }
 1686: 
 1687:     d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
 1688:     d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status);
 1689: 
 1690: #if 0 /* TODO */
 1691:     if (mode & (1 << 3))
 1692:         set_float_exception_flags(
 1693:                         get_float_exception_flags(&env->sse_status) &
 1694:                         ~float_flag_inexact,
 1695:                         &env->sse_status);
 1696: #endif
 1697:     env->sse_status.float_rounding_mode = prev_rounding_mode;
 1698: }
 1699: 
 1700: void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
 1701: {
 1702:     signed char prev_rounding_mode;
 1703: 
 1704:     prev_rounding_mode = env->sse_status.float_rounding_mode;
 1705:     if (!(mode & (1 << 2)))
 1706:         switch (mode & 3) {
 1707:         case 0:
 1708:             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
 1709:             break;
 1710:         case 1:
 1711:             set_float_rounding_mode(float_round_down, &env->sse_status);
 1712:             break;
 1713:         case 2:
 1714:             set_float_rounding_mode(float_round_up, &env->sse_status);
 1715:             break;
 1716:         case 3:
 1717:             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
 1718:             break;
 1719:         }
 1720: 
 1721:     d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
 1722: 
 1723: #if 0 /* TODO */
 1724:     if (mode & (1 << 3))
 1725:         set_float_exception_flags(
 1726:                         get_float_exception_flags(&env->sse_status) &
 1727:                         ~float_flag_inexact,
 1728:                         &env->sse_status);
 1729: #endif
 1730:     env->sse_status.float_rounding_mode = prev_rounding_mode;
 1731: }
 1732: 
 1733: void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
 1734: {
 1735:     signed char prev_rounding_mode;
 1736: 
 1737:     prev_rounding_mode = env->sse_status.float_rounding_mode;
 1738:     if (!(mode & (1 << 2)))
 1739:         switch (mode & 3) {
 1740:         case 0:
 1741:             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
 1742:             break;
 1743:         case 1:
 1744:             set_float_rounding_mode(float_round_down, &env->sse_status);
 1745:             break;
 1746:         case 2:
 1747:             set_float_rounding_mode(float_round_up, &env->sse_status);
 1748:             break;
 1749:         case 3:
 1750:             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
 1751:             break;
 1752:         }
 1753: 
 1754:     d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
 1755: 
 1756: #if 0 /* TODO */
 1757:     if (mode & (1 << 3))
 1758:         set_float_exception_flags(
 1759:                         get_float_exception_flags(&env->sse_status) &
 1760:                         ~float_flag_inexact,
 1761:                         &env->sse_status);
 1762: #endif
 1763:     env->sse_status.float_rounding_mode = prev_rounding_mode;
 1764: }
 1765: 
 1766: #define FBLENDP(d, s, m) m ? s : d
 1767: SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
 1768: SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
 1769: SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
 1770: 
 1771: void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
 1772: {
 1773:     float32 iresult = 0 /*float32_zero*/;
 1774: 
 1775:     if (mask & (1 << 4))
 1776:         iresult = float32_add(iresult,
 1777:                         float32_mul(d->L(0), s->L(0), &env->sse_status),
 1778:                         &env->sse_status);
 1779:     if (mask & (1 << 5))
 1780:         iresult = float32_add(iresult,
 1781:                         float32_mul(d->L(1), s->L(1), &env->sse_status),
 1782:                         &env->sse_status);
 1783:     if (mask & (1 << 6))
 1784:         iresult = float32_add(iresult,
 1785:                         float32_mul(d->L(2), s->L(2), &env->sse_status),
 1786:                         &env->sse_status);
 1787:     if (mask & (1 << 7))
 1788:         iresult = float32_add(iresult,
 1789:                         float32_mul(d->L(3), s->L(3), &env->sse_status),
 1790:                         &env->sse_status);
 1791:     d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/;
 1792:     d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/;
 1793:     d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/;
 1794:     d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/;
 1795: }
 1796: 
 1797: void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
 1798: {
 1799:     float64 iresult = 0 /*float64_zero*/;
 1800: 
 1801:     if (mask & (1 << 4))
 1802:         iresult = float64_add(iresult,
 1803:                         float64_mul(d->Q(0), s->Q(0), &env->sse_status),
 1804:                         &env->sse_status);
 1805:     if (mask & (1 << 5))
 1806:         iresult = float64_add(iresult,
 1807:                         float64_mul(d->Q(1), s->Q(1), &env->sse_status),
 1808:                         &env->sse_status);
 1809:     d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/;
 1810:     d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/;
 1811: }
 1812: 
 1813: void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset)
 1814: {
 1815:     int s0 = (offset & 3) << 2;
 1816:     int d0 = (offset & 4) << 0;
 1817:     int i;
 1818:     Reg r;
 1819: 
 1820:     for (i = 0; i < 8; i++, d0++) {
 1821:         r.W(i) = 0;
 1822:         r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
 1823:         r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
 1824:         r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
 1825:         r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
 1826:     }
 1827: 
 1828:     *d = r;
 1829: }
 1830: 
 1831: /* SSE4.2 op helpers */
 1832: /* it's unclear whether signed or unsigned */
 1833: #define FCMPGTQ(d, s) d > s ? -1 : 0
 1834: SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
 1835: 
 1836: static inline int pcmp_elen(int reg, uint32_t ctrl)
 1837: {
 1838:     int val;
 1839: 
 1840:     /* Presence of REX.W is indicated by a bit higher than 7 set */
 1841:     if (ctrl >> 8)
 1842:         val = abs1((int64_t) env->regs[reg]);
 1843:     else
 1844:         val = abs1((int32_t) env->regs[reg]);
 1845: 
 1846:     if (ctrl & 1) {
 1847:         if (val > 8)
 1848:             return 8;
 1849:     } else
 1850:         if (val > 16)
 1851:             return 16;
 1852: 
 1853:     return val;
 1854: }
 1855: 
 1856: static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
 1857: {
 1858:     int val = 0;
 1859: 
 1860:     if (ctrl & 1) {
 1861:         while (val < 8 && r->W(val))
 1862:             val++;
 1863:     } else
 1864:         while (val < 16 && r->B(val))
 1865:             val++;
 1866: 
 1867:     return val;
 1868: }
 1869: 
 1870: static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
 1871: {
 1872:     switch ((ctrl >> 0) & 3) {
 1873:     case 0:
 1874:         return r->B(i);
 1875:     case 1:
 1876:         return r->W(i);
 1877:     case 2:
 1878:         return (int8_t) r->B(i);
 1879:     case 3:
 1880:     default:
 1881:         return (int16_t) r->W(i);
 1882:     }
 1883: }
 1884: 
 1885: static inline unsigned pcmpxstrx(Reg *d, Reg *s,
 1886:                 int8_t ctrl, int valids, int validd)
 1887: {
 1888:     unsigned int res = 0;
 1889:     int v;
 1890:     int j, i;
 1891:     int upper = (ctrl & 1) ? 7 : 15;
 1892: 
 1893:     valids--;
 1894:     validd--;
 1895: 
 1896:     CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
 1897: 
 1898:     switch ((ctrl >> 2) & 3) {
 1899:     case 0:
 1900:         for (j = valids; j >= 0; j--) {
 1901:             res <<= 1;
 1902:             v = pcmp_val(s, ctrl, j);
 1903:             for (i = validd; i >= 0; i--)
 1904:                 res |= (v == pcmp_val(d, ctrl, i));
 1905:         }
 1906:         break;
 1907:     case 1:
 1908:         for (j = valids; j >= 0; j--) {
 1909:             res <<= 1;
 1910:             v = pcmp_val(s, ctrl, j);
 1911:             for (i = ((validd - 1) | 1); i >= 0; i -= 2)
 1912:                 res |= (pcmp_val(d, ctrl, i - 0) <= v &&
 1913:                         pcmp_val(d, ctrl, i - 1) >= v);
 1914:         }
 1915:         break;
 1916:     case 2:
 1917:         res = (2 << (upper - MAX(valids, validd))) - 1;
 1918:         res <<= MAX(valids, validd) - MIN(valids, validd);
 1919:         for (i = MIN(valids, validd); i >= 0; i--) {
 1920:             res <<= 1;
 1921:             v = pcmp_val(s, ctrl, i);
 1922:             res |= (v == pcmp_val(d, ctrl, i));
 1923:         }
 1924:         break;
 1925:     case 3:
 1926:         for (j = valids - validd; j >= 0; j--) {
 1927:             res <<= 1;
 1928:             res |= 1;
 1929:             for (i = MIN(upper - j, validd); i >= 0; i--)
 1930:                 res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
 1931:         }
 1932:         break;
 1933:     }
 1934: 
 1935:     switch ((ctrl >> 4) & 3) {
 1936:     case 1:
 1937:         res ^= (2 << upper) - 1;
 1938:         break;
 1939:     case 3:
 1940:         res ^= (2 << valids) - 1;
 1941:         break;
 1942:     }
 1943: 
 1944:     if (res)
 1945:        CC_SRC |= CC_C;
 1946:     if (res & 1)
 1947:        CC_SRC |= CC_O;
 1948: 
 1949:     return res;
 1950: }
 1951: 
 1952: static inline int rffs1(unsigned int val)
 1953: {
 1954:     int ret = 1, hi;
 1955: 
 1956:     for (hi = sizeof(val) * 4; hi; hi /= 2)
 1957:         if (val >> hi) {
 1958:             val >>= hi;
 1959:             ret += hi;
 1960:         }
 1961: 
 1962:     return ret;
 1963: }
 1964: 
 1965: static inline int ffs1(unsigned int val)
 1966: {
 1967:     int ret = 1, hi;
 1968: 
 1969:     for (hi = sizeof(val) * 4; hi; hi /= 2)
 1970:         if (val << hi) {
 1971:             val <<= hi;
 1972:             ret += hi;
 1973:         }
 1974: 
 1975:     return ret;
 1976: }
 1977: 
 1978: void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
 1979: {
 1980:     unsigned int res = pcmpxstrx(d, s, ctrl,
 1981:                     pcmp_elen(R_EDX, ctrl),
 1982:                     pcmp_elen(R_EAX, ctrl));
 1983: 
 1984:     if (res)
 1985:         env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
 1986:     else
 1987:         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
 1988: }
 1989: 
 1990: void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
 1991: {
 1992:     int i;
 1993:     unsigned int res = pcmpxstrx(d, s, ctrl,
 1994:                     pcmp_elen(R_EDX, ctrl),
 1995:                     pcmp_elen(R_EAX, ctrl));
 1996: 
 1997:     if ((ctrl >> 6) & 1) {
 1998:         if (ctrl & 1)
 1999:             for (i = 0; i < 8; i++, res >>= 1) {
 2000:                 d->W(i) = (res & 1) ? ~0 : 0;
 2001:             }
 2002:         else
 2003:             for (i = 0; i < 16; i++, res >>= 1) {
 2004:                 d->B(i) = (res & 1) ? ~0 : 0;
 2005:             }
 2006:     } else {
 2007:         d->Q(1) = 0;
 2008:         d->Q(0) = res;
 2009:     }
 2010: }
 2011: 
 2012: void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
 2013: {
 2014:     unsigned int res = pcmpxstrx(d, s, ctrl,
 2015:                     pcmp_ilen(s, ctrl),
 2016:                     pcmp_ilen(d, ctrl));
 2017: 
 2018:     if (res)
 2019:         env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
 2020:     else
 2021:         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
 2022: }
 2023: 
 2024: void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
 2025: {
 2026:     int i;
 2027:     unsigned int res = pcmpxstrx(d, s, ctrl,
 2028:                     pcmp_ilen(s, ctrl),
 2029:                     pcmp_ilen(d, ctrl));
 2030: 
 2031:     if ((ctrl >> 6) & 1) {
 2032:         if (ctrl & 1)
 2033:             for (i = 0; i < 8; i++, res >>= 1) {
 2034:                 d->W(i) = (res & 1) ? ~0 : 0;
 2035:             }
 2036:         else
 2037:             for (i = 0; i < 16; i++, res >>= 1) {
 2038:                 d->B(i) = (res & 1) ? ~0 : 0;
 2039:             }
 2040:     } else {
 2041:         d->Q(1) = 0;
 2042:         d->Q(0) = res;
 2043:     }
 2044: }
 2045: 
 2046: #define CRCPOLY        0x1edc6f41
 2047: #define CRCPOLY_BITREV 0x82f63b78
 2048: target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
 2049: {
 2050:     target_ulong crc = (msg & ((target_ulong) -1 >>
 2051:                             (TARGET_LONG_BITS - len))) ^ crc1;
 2052: 
 2053:     while (len--)
 2054:         crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
 2055: 
 2056:     return crc;
 2057: }
 2058: 
 2059: #define POPMASK(i)     ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
 2060: #define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i))
 2061: target_ulong helper_popcnt(target_ulong n, uint32_t type)
 2062: {
 2063:     CC_SRC = n ? 0 : CC_Z;
 2064: 
 2065:     n = POPCOUNT(n, 0);
 2066:     n = POPCOUNT(n, 1);
 2067:     n = POPCOUNT(n, 2);
 2068:     n = POPCOUNT(n, 3);
 2069:     if (type == 1)
 2070:         return n & 0xff;
 2071: 
 2072:     n = POPCOUNT(n, 4);
 2073: #ifndef TARGET_X86_64
 2074:     return n;
 2075: #else
 2076:     if (type == 2)
 2077:         return n & 0xff;
 2078: 
 2079:     return POPCOUNT(n, 5);
 2080: #endif
 2081: }
 2082: #endif
 2083: 
 2084: #undef SHIFT
 2085: #undef XMM_ONLY
 2086: #undef Reg
 2087: #undef B
 2088: #undef W
 2089: #undef L
 2090: #undef Q
 2091: #undef SUFFIX

unix.superglobalmegacorp.com