|
|
1.1 root 1: /*
2: * VIS op helpers
3: *
4: * Copyright (c) 2003-2005 Fabrice Bellard
5: *
6: * This library is free software; you can redistribute it and/or
7: * modify it under the terms of the GNU Lesser General Public
8: * License as published by the Free Software Foundation; either
9: * version 2 of the License, or (at your option) any later version.
10: *
11: * This library is distributed in the hope that it will be useful,
12: * but WITHOUT ANY WARRANTY; without even the implied warranty of
13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14: * Lesser General Public License for more details.
15: *
16: * You should have received a copy of the GNU Lesser General Public
17: * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18: */
19:
20: #include "cpu.h"
21: #include "helper.h"
22:
/* All-ones mask covering the low NBITS bits. */
#define VIS_FIELD_MASK(NBITS) ((1ULL << (NBITS)) - 1)

/*
 * Extract bits FROM..TO (inclusive) of X using non-native bit order,
 * i.e. bit 0 is the most significant bit of a 64-bit value.
 */
#define GET_FIELD(X, FROM, TO)                                          \
    (((X) >> (63 - (TO))) & VIS_FIELD_MASK((TO) - (FROM) + 1))

/*
 * Extract bits FROM..TO (inclusive) of X using the order in the manuals,
 * i.e. bit 0 is 2^0.
 */
#define GET_FIELD_SP(X, FROM, TO)                                       \
    GET_FIELD(X, 63 - (TO), 63 - (FROM))
30:
31: target_ulong helper_array8(target_ulong pixel_addr, target_ulong cubesize)
32: {
33: return (GET_FIELD_SP(pixel_addr, 60, 63) << (17 + 2 * cubesize)) |
34: (GET_FIELD_SP(pixel_addr, 39, 39 + cubesize - 1) << (17 + cubesize)) |
35: (GET_FIELD_SP(pixel_addr, 17 + cubesize - 1, 17) << 17) |
36: (GET_FIELD_SP(pixel_addr, 56, 59) << 13) |
37: (GET_FIELD_SP(pixel_addr, 35, 38) << 9) |
38: (GET_FIELD_SP(pixel_addr, 13, 16) << 5) |
39: (((pixel_addr >> 55) & 1) << 4) |
40: (GET_FIELD_SP(pixel_addr, 33, 34) << 2) |
41: GET_FIELD_SP(pixel_addr, 11, 12);
42: }
43:
/*
 * Lane accessors for the packed-value unions.
 *
 * Lane n always denotes the n-th least significant element of the value.
 * On big-endian hosts the array index therefore has to be mirrored, while
 * on little-endian hosts it is used as is.  `last` is the highest valid
 * index of the array being accessed.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define VIS_LANE(last, n) ((last) - (n))
#else
#define VIS_LANE(last, n) (n)
#endif

/* Accessors for a 64-bit value. */
#define VIS_B64(n)  b[VIS_LANE(7, n)]
#define VIS_W64(n)  w[VIS_LANE(3, n)]
#define VIS_SW64(n) sw[VIS_LANE(3, n)]
#define VIS_L64(n)  l[VIS_LANE(1, n)]

/* Accessors for a 32-bit value. */
#define VIS_B32(n)  b[VIS_LANE(3, n)]
#define VIS_W32(n)  w[VIS_LANE(1, n)]
59:
/*
 * A 64-bit operand viewed as packed lanes of several widths.  All members
 * alias the same storage.  Array indices follow host memory order, so the
 * VIS_*64() accessor macros are used to address lanes independently of
 * host endianness.
 */
typedef union {
    uint8_t b[8];       /* eight 8-bit lanes */
    uint16_t w[4];      /* four unsigned 16-bit lanes */
    int16_t sw[4];      /* the same four 16-bit lanes, read as signed */
    uint32_t l[2];      /* two 32-bit lanes */
    uint64_t ll;        /* the whole value */
    float64 d;          /* the same bits typed as float64 */
} VIS64;

/*
 * A 32-bit operand viewed as packed lanes; the counterpart of VIS64 that
 * is addressed through the VIS_*32() accessor macros.
 */
typedef union {
    uint8_t b[4];       /* four 8-bit lanes */
    uint16_t w[2];      /* two 16-bit lanes */
    uint32_t l;         /* the whole value */
    float32 f;          /* the same bits typed as float32 */
} VIS32;
75:
/*
 * Interleave the four low bytes of src1 with the four low bytes of src2.
 *
 * For each byte position i in 0..3, byte i of src1 lands in result byte
 * 2*i + 1 and byte i of src2 lands in result byte 2*i.  The upper 32 bits
 * of both inputs do not contribute to the result.
 */
uint64_t helper_fpmerge(uint64_t src1, uint64_t src2)
{
    uint64_t merged = 0;
    unsigned i;

    for (i = 0; i < 4; i++) {
        uint64_t odd = (src1 >> (8 * i)) & 0xff;
        uint64_t even = (src2 >> (8 * i)) & 0xff;

        merged |= ((odd << 8) | even) << (16 * i);
    }

    return merged;
}
95:
/*
 * Lane-wise multiply of unsigned bytes by signed 16-bit values.
 *
 * Lane i (0..3) multiplies byte i of src1 by the signed 16-bit lane i of
 * src2.  The product is rounded to nearest at bit 8 and its bits 23..8
 * become 16-bit lane i of the result.
 */
uint64_t helper_fmul8x16(uint64_t src1, uint64_t src2)
{
    uint64_t result = 0;
    unsigned lane;

    for (lane = 0; lane < 4; lane++) {
        int32_t coeff = (int16_t)(src2 >> (16 * lane));
        int32_t pixel = (uint8_t)(src1 >> (8 * lane));
        uint32_t product = (uint32_t)(coeff * pixel);

        /* Round up when the discarded low byte is at least one half. */
        if ((product & 0xff) > 0x7f) {
            product += 0x100;
        }

        result |= (uint64_t)(uint16_t)(product >> 8) << (16 * lane);
    }

    return result;
}
119:
/*
 * Multiply the four low bytes of src1 by one shared signed 16-bit factor.
 *
 * The factor is 16-bit lane 1 of src2 (bits 31..16).  Lane i (0..3) of the
 * result holds bits 23..8 of (factor * byte i of src1), rounded to nearest
 * at bit 8.
 *
 * The factor is read once, before any result lane is produced, so that
 * all four lanes are scaled by the same value.  Reading it back from the
 * storage the results are written to would pick up an already computed
 * result once lane 1 has been stored.
 */
uint64_t helper_fmul8x16al(uint64_t src1, uint64_t src2)
{
    const int32_t factor = (int16_t)(src2 >> 16);
    uint64_t result = 0;
    unsigned lane;

    for (lane = 0; lane < 4; lane++) {
        int32_t pixel = (uint8_t)(src1 >> (8 * lane));
        uint32_t product = (uint32_t)(factor * pixel);

        /* Round up when the discarded low byte is at least one half. */
        if ((product & 0xff) > 0x7f) {
            product += 0x100;
        }

        result |= (uint64_t)(uint16_t)(product >> 8) << (16 * lane);
    }

    return result;
}
143:
/*
 * Multiply the four low bytes of src1 by one shared signed 16-bit factor.
 *
 * The factor is 16-bit lane 0 of src2 (bits 15..0).  Lane i (0..3) of the
 * result holds bits 23..8 of (factor * byte i of src1), rounded to nearest
 * at bit 8.
 *
 * The factor is read once, before any result lane is produced, so that
 * all four lanes are scaled by the same value.  Reading it back from the
 * storage the results are written to would pick up the result of lane 0
 * for every following lane.
 */
uint64_t helper_fmul8x16au(uint64_t src1, uint64_t src2)
{
    const int32_t factor = (int16_t)src2;
    uint64_t result = 0;
    unsigned lane;

    for (lane = 0; lane < 4; lane++) {
        int32_t pixel = (uint8_t)(src1 >> (8 * lane));
        uint32_t product = (uint32_t)(factor * pixel);

        /* Round up when the discarded low byte is at least one half. */
        if ((product & 0xff) > 0x7f) {
            product += 0x100;
        }

        result |= (uint64_t)(uint16_t)(product >> 8) << (16 * lane);
    }

    return result;
}
167:
/*
 * Lane-wise multiply using the signed upper byte of each src1 lane.
 *
 * Lane i (0..3) multiplies the signed 16-bit lane i of src2 by the signed
 * upper 8 bits of 16-bit lane i of src1.  The product is rounded to
 * nearest at bit 8 and its bits 23..8 become lane i of the result.
 */
uint64_t helper_fmul8sux16(uint64_t src1, uint64_t src2)
{
    uint64_t result = 0;
    unsigned lane;

    for (lane = 0; lane < 4; lane++) {
        int32_t coeff = (int16_t)(src2 >> (16 * lane));
        /* Arithmetic shift keeps the sign of the upper byte. */
        int32_t upper = (int32_t)(int16_t)(src1 >> (16 * lane)) >> 8;
        uint32_t product = (uint32_t)(coeff * upper);

        /* Round up when the discarded low byte is at least one half. */
        if ((product & 0xff) > 0x7f) {
            product += 0x100;
        }

        result |= (uint64_t)(uint16_t)(product >> 8) << (16 * lane);
    }

    return result;
}
191:
/*
 * Lane-wise multiply using the unsigned lower byte of each src1 lane.
 *
 * Lane i (0..3) multiplies the signed 16-bit lane i of src2 by the
 * unsigned low 8 bits of 16-bit lane i of src1.  The product is rounded to
 * nearest at bit 8 and its bits 23..8 become lane i of the result.
 */
uint64_t helper_fmul8ulx16(uint64_t src1, uint64_t src2)
{
    uint64_t result = 0;
    unsigned lane;

    for (lane = 0; lane < 4; lane++) {
        int32_t coeff = (int16_t)(src2 >> (16 * lane));
        uint32_t lower = (uint8_t)(src1 >> (16 * lane));
        /* Unsigned arithmetic: wraps modulo 2^32 for negative coeff. */
        uint32_t product = (uint32_t)coeff * lower;

        /* Round up when the discarded low byte is at least one half. */
        if ((product & 0xff) > 0x7f) {
            product += 0x100;
        }

        result |= (uint64_t)(uint16_t)(product >> 8) << (16 * lane);
    }

    return result;
}
215:
/*
 * Widening multiply using the signed upper byte of each src1 lane.
 *
 * For i in 0..1, the signed 16-bit lane i of src2 is multiplied by the
 * signed upper 8 bits of 16-bit lane i of src1, and the product shifted
 * left by 8 bits becomes 32-bit lane i of the result.  Only the low
 * 32 bits of each input are used.
 *
 * No rounding is applied: the full product is kept, so there is nothing
 * to round.  Together with helper_fmuld8ulx16() this yields the two
 * partial products of a 16x16 multiply:
 *   (hi * 256 + lo) * c == ((hi * c) << 8) + (lo * c)
 */
uint64_t helper_fmuld8sux16(uint64_t src1, uint64_t src2)
{
    uint64_t result = 0;
    unsigned lane;

    for (lane = 0; lane < 2; lane++) {
        int32_t coeff = (int16_t)(src2 >> (16 * lane));
        /* Arithmetic shift keeps the sign of the upper byte. */
        int32_t upper = (int32_t)(int16_t)(src1 >> (16 * lane)) >> 8;
        /* Shift as unsigned so negative products are well defined. */
        uint32_t wide = (uint32_t)(coeff * upper) << 8;

        result |= (uint64_t)wide << (32 * lane);
    }

    return result;
}
238:
/*
 * Widening multiply using the unsigned lower byte of each src1 lane.
 *
 * For i in 0..1, the signed 16-bit lane i of src2 is multiplied by the
 * unsigned low 8 bits of 16-bit lane i of src1, and the sign-extended
 * product becomes 32-bit lane i of the result.  Only the low 32 bits of
 * each input are used.
 *
 * No rounding is applied: the full product is kept, so adding a rounding
 * increment would simply corrupt it.
 */
uint64_t helper_fmuld8ulx16(uint64_t src1, uint64_t src2)
{
    uint64_t result = 0;
    unsigned lane;

    for (lane = 0; lane < 2; lane++) {
        int32_t coeff = (int16_t)(src2 >> (16 * lane));
        int32_t lower = (uint8_t)(src1 >> (16 * lane));
        uint32_t wide = (uint32_t)(coeff * lower);

        result |= (uint64_t)wide << (32 * lane);
    }

    return result;
}
261:
/*
 * Expand four bytes into four 16-bit lanes.
 *
 * Byte i (0..3) of the low 32 bits of src1 is shifted left by 4 bits and
 * stored as 16-bit lane i of the result.  Every result lane is written,
 * so src2 has no influence on the returned value.
 */
uint64_t helper_fexpand(uint64_t src1, uint64_t src2)
{
    const uint32_t packed = (uint32_t)src1;
    uint64_t expanded = 0;
    unsigned lane;

    (void)src2;

    for (lane = 0; lane < 4; lane++) {
        uint64_t pixel = (packed >> (8 * lane)) & 0xff;

        expanded |= (pixel << 4) << (16 * lane);
    }

    return expanded;
}
276:
/*
 * Generate the four partitioned variants of a binary lane operation F:
 *
 *   name##16   four 16-bit lanes of a 64-bit value
 *   name##16s  two 16-bit lanes of a 32-bit value
 *   name##32   two 32-bit lanes of a 64-bit value
 *   name##32s  a single 32-bit value
 *
 * In every variant the result lane is F(lane of src2, lane of src1),
 * truncated to the lane width.
 */
#define VIS_HELPER(name, F)                                             \
    uint64_t name##16(uint64_t src1, uint64_t src2)                     \
    {                                                                   \
        uint64_t out = 0;                                               \
        unsigned shift;                                                 \
                                                                        \
        for (shift = 0; shift < 64; shift += 16) {                      \
            uint16_t lhs = (uint16_t)(src2 >> shift);                   \
            uint16_t rhs = (uint16_t)(src1 >> shift);                   \
                                                                        \
            out |= (uint64_t)(uint16_t)(F(lhs, rhs)) << shift;          \
        }                                                               \
                                                                        \
        return out;                                                     \
    }                                                                   \
                                                                        \
    uint32_t name##16s(uint32_t src1, uint32_t src2)                    \
    {                                                                   \
        uint32_t out = 0;                                               \
        unsigned shift;                                                 \
                                                                        \
        for (shift = 0; shift < 32; shift += 16) {                      \
            uint16_t lhs = (uint16_t)(src2 >> shift);                   \
            uint16_t rhs = (uint16_t)(src1 >> shift);                   \
                                                                        \
            out |= (uint32_t)(uint16_t)(F(lhs, rhs)) << shift;          \
        }                                                               \
                                                                        \
        return out;                                                     \
    }                                                                   \
                                                                        \
    uint64_t name##32(uint64_t src1, uint64_t src2)                     \
    {                                                                   \
        uint64_t out = 0;                                               \
        unsigned shift;                                                 \
                                                                        \
        for (shift = 0; shift < 64; shift += 32) {                      \
            uint32_t lhs = (uint32_t)(src2 >> shift);                   \
            uint32_t rhs = (uint32_t)(src1 >> shift);                   \
                                                                        \
            out |= (uint64_t)(uint32_t)(F(lhs, rhs)) << shift;          \
        }                                                               \
                                                                        \
        return out;                                                     \
    }                                                                   \
                                                                        \
    uint32_t name##32s(uint32_t src1, uint32_t src2)                    \
    {                                                                   \
        return (uint32_t)(F(src2, src1));                               \
    }

/* Lane operations: the first argument comes from src2, the second from src1. */
#define FADD(a, b) ((a) + (b))
#define FSUB(a, b) ((a) - (b))

VIS_HELPER(helper_fpadd, FADD)
VIS_HELPER(helper_fpsub, FSUB)
335:
/*
 * Generate the partitioned compare helpers for predicate F:
 *
 *   name##16  compares the four 16-bit lanes of src1 and src2
 *   name##32  compares the two 32-bit lanes of src1 and src2
 *
 * Bit i of the result is set when F(lane i of src1, lane i of src2) holds;
 * all other result bits are zero.
 *
 * The lanes are compared as signed fixed-point values.  Comparing them as
 * unsigned would give the wrong answer for the ordering predicates
 * whenever exactly one of the two lanes is negative.
 */
#define VIS_CMPHELPER(name, F)                                          \
    uint64_t name##16(uint64_t src1, uint64_t src2)                     \
    {                                                                   \
        uint64_t bits = 0;                                              \
        unsigned lane;                                                  \
                                                                        \
        for (lane = 0; lane < 4; lane++) {                              \
            int16_t lhs = (int16_t)(src1 >> (16 * lane));               \
            int16_t rhs = (int16_t)(src2 >> (16 * lane));               \
                                                                        \
            if (F(lhs, rhs)) {                                          \
                bits |= 1ULL << lane;                                   \
            }                                                           \
        }                                                               \
                                                                        \
        return bits;                                                    \
    }                                                                   \
                                                                        \
    uint64_t name##32(uint64_t src1, uint64_t src2)                     \
    {                                                                   \
        uint64_t bits = 0;                                              \
        unsigned lane;                                                  \
                                                                        \
        for (lane = 0; lane < 2; lane++) {                              \
            int32_t lhs = (int32_t)(src1 >> (32 * lane));               \
            int32_t rhs = (int32_t)(src2 >> (32 * lane));               \
                                                                        \
            if (F(lhs, rhs)) {                                          \
                bits |= 1ULL << lane;                                   \
            }                                                           \
        }                                                               \
                                                                        \
        return bits;                                                    \
    }

/* Lane predicates: the first argument comes from src1, the second from src2. */
#define FCMPGT(a, b) ((a) > (b))
#define FCMPEQ(a, b) ((a) == (b))
#define FCMPLE(a, b) ((a) <= (b))
#define FCMPNE(a, b) ((a) != (b))

VIS_CMPHELPER(helper_fcmpgt, FCMPGT)
VIS_CMPHELPER(helper_fcmpeq, FCMPEQ)
VIS_CMPHELPER(helper_fcmple, FCMPLE)
VIS_CMPHELPER(helper_fcmpne, FCMPNE)
376:
/*
 * Accumulate the sum of absolute byte differences.
 *
 * Returns sum plus |a - b| for each of the eight corresponding unsigned
 * bytes a of src1 and b of src2.
 */
uint64_t helper_pdist(uint64_t sum, uint64_t src1, uint64_t src2)
{
    int shift;

    /* Walk the byte lanes from the most to the least significant one. */
    for (shift = 56; shift >= 0; shift -= 8) {
        int a = (src1 >> shift) & 0xff;
        int b = (src2 >> shift) & 0xff;

        sum += (a < b) ? b - a : a - b;
    }

    return sum;
}
397:
/*
 * Pack four signed 16-bit lanes of rs2 into four unsigned bytes.
 *
 * The scale factor is taken from bits 6..3 of gsr.  Each lane is scaled
 * by 2^scale, shifted right by 7 bits and clamped to 0..255; the result
 * for lane i is stored in byte i of the return value.
 */
uint32_t helper_fpack16(uint64_t gsr, uint64_t rs2)
{
    const int scale = (gsr >> 3) & 0xf;
    uint32_t ret = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        int16_t src = (int16_t)(rs2 >> (lane * 16));
        /*
         * Scale by multiplying: left-shifting a negative value is
         * undefined behaviour in C.  |src| * 2^15 always fits in 32 bits.
         */
        int32_t scaled = src * (1 << scale);
        int32_t from_fixed = scaled >> 7;
        uint32_t val;

        if (from_fixed < 0) {
            val = 0;
        } else if (from_fixed > 255) {
            val = 255;
        } else {
            val = from_fixed;
        }

        ret |= val << (8 * lane);
    }

    return ret;
}
418:
/*
 * Pack two signed 32-bit lanes of rs2 into bytes and merge them with rs1.
 *
 * The scale factor is taken from bits 7..3 of gsr.  rs1 is shifted left
 * by 8 bits and the low byte of each 32-bit half is cleared.  Each 32-bit
 * lane of rs2 is then scaled by 2^scale, shifted right by 23 bits,
 * clamped to 0..255 and placed in that cleared low byte.
 */
uint64_t helper_fpack32(uint64_t gsr, uint64_t rs1, uint64_t rs2)
{
    const int scale = (gsr >> 3) & 0x1f;
    uint64_t ret = (rs1 << 8) & ~(0x000000ff000000ffULL);
    int word;

    for (word = 0; word < 2; word++) {
        int32_t src = (int32_t)(rs2 >> (word * 32));
        /*
         * Scale by multiplying in 64 bits: left-shifting a negative value
         * is undefined behaviour in C.  |src| * 2^31 always fits.
         */
        int64_t scaled = (int64_t)src * ((int64_t)1 << scale);
        int64_t from_fixed = scaled >> 23;
        uint64_t val;

        if (from_fixed < 0) {
            val = 0;
        } else if (from_fixed > 255) {
            val = 255;
        } else {
            val = from_fixed;
        }

        ret |= val << (32 * word);
    }

    return ret;
}
440:
/*
 * Pack two signed 32-bit lanes of rs2 into two signed 16-bit lanes.
 *
 * The scale factor is taken from bits 7..3 of gsr.  Each lane is scaled
 * by 2^scale, shifted right by 16 bits and saturated to -32768..32767;
 * the result for lane i is stored in 16-bit lane i of the return value.
 */
uint32_t helper_fpackfix(uint64_t gsr, uint64_t rs2)
{
    const int scale = (gsr >> 3) & 0x1f;
    uint32_t ret = 0;
    int word;

    for (word = 0; word < 2; word++) {
        int32_t src = (int32_t)(rs2 >> (word * 32));
        /*
         * The scaling must be carried out in 64 bits.  Done in 32 bits it
         * overflows for large inputs, which loses the bits that decide
         * whether (and in which direction) the result saturates.
         * Multiplying also avoids left-shifting a negative value.
         */
        int64_t scaled = (int64_t)src * ((int64_t)1 << scale);
        int64_t from_fixed = scaled >> 16;
        int64_t clamped;

        if (from_fixed < -32768) {
            clamped = -32768;
        } else if (from_fixed > 32767) {
            clamped = 32767;
        } else {
            clamped = from_fixed;
        }

        ret |= ((uint32_t)clamped & 0xffff) << (word * 16);
    }

    return ret;
}
461:
/*
 * Select eight bytes out of the 16-byte concatenation src1:src2.
 *
 * The upper 32 bits of gsr hold eight 4-bit selectors.  A selector value
 * of 0 names the most significant byte of src1, 7 the least significant
 * byte of src1, 8 the most significant byte of src2 and 15 the least
 * significant byte of src2.  The selector in bits 31..28 of that mask
 * fills result byte 0 (the least significant byte), the next selector
 * fills byte 1, and so on up to the selector in bits 3..0 for byte 7.
 *
 * The return type is spelled uint64_t like every other helper in this
 * file rather than relying on a separate `uint64` typedef.
 */
uint64_t helper_bshuffle(uint64_t gsr, uint64_t src1, uint64_t src2)
{
    const uint32_t mask = gsr >> 32;
    uint64_t result = 0;
    unsigned i;

    for (i = 0; i < 8; ++i) {
        unsigned sel = (mask >> (28 - i * 4)) & 0xf;
        /* Selectors 0..7 index src1, 8..15 index src2, MSB first. */
        uint64_t src = (sel & 8) ? src2 : src1;
        uint64_t byte = (src >> (56 - 8 * (sel & 7))) & 0xff;

        result |= byte << (8 * i);
    }

    return result;
}
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.