
/*---------------------------------------------------------------*/
/*--- begin                                host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"               // LIKELY, UNLIKELY
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
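
/* Lane 0 is the least significant lane throughout this file:
   mk32x2(hi, lo) puts 'hi' in bits 63:32 and 'lo' in bits 31:0, so
   for example mk32x2(0xAABBCCDD, 0x11223344) is 0xAABBCCDD11223344ULL
   and sel32x2_1 of that value gives back 0xAABBCCDD.  The 16x4 and
   8x8 helpers below follow the same convention: selNxM_K extracts
   lane K, counting from the least significant end. */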


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}

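/* As index8x8, but yield zero if bit 7 of ix is set (this is the
   behaviour needed by h_generic_calc_PermOrZero8x8 below).  The
   zeroing mask is built branchlessly: flipping bit 7 and then
   arithmetically shifting right by 7 gives 0xFF when the selector is
   in range and 0x00 when bit 7 was set (this relies on gcc's
   arithmetic right shift of signed values). */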
static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
   Char zeroingMask = (Char)ix;
   zeroingMask ^= 0x80;
   zeroingMask >>= 7;
   ix &= 7;
   return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
}


/* Scalar helpers. */

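/* The qaddNS/qsubNS helpers compute the exact result in a wider type
   and then clamp it to the lane's representable signed range; the
   qaddNU/qsubNU variants clamp to [0 .. 2^N-1].  For example,
   qadd16S(30000, 10000) is 32767 and qsub8U(5, 9) is 0. */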
static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

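/* The haddN*/hsubN* helpers below compute halving sums and
   differences: the exact result is formed in a wider type and then
   shifted right by one, so no intermediate overflow can occur.  The
   signed variants round towards minus infinity, assuming arithmetic
   right shift of signed values (as gcc provides), while avg8U/avg16U
   above are the rounding-up unsigned averages. */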
static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */
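
/* Each of these follows the same pattern: split both operands into
   lanes with the sel* helpers, apply the scalar operation to each
   pair of lanes, and reassemble the result with the matching mk*
   helper.  For instance, h_generic_calc_Add16x4(xx, yy) adds the
   four 16-bit lanes of xx and yy independently, with any carry out
   of a lane simply discarded. */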

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

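/* For both the saturating and the truncating narrowing operations
   below, the narrowed lanes of aa occupy the high half of the result
   and the narrowed lanes of bb occupy the low half. */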
ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* ------------ Permutation ------------ */

ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             indexOrZero8x8(aa, sel8x8_7(bb)),
             indexOrZero8x8(aa, sel8x8_6(bb)),
             indexOrZero8x8(aa, sel8x8_5(bb)),
             indexOrZero8x8(aa, sel8x8_4(bb)),
             indexOrZero8x8(aa, sel8x8_3(bb)),
             indexOrZero8x8(aa, sel8x8_2(bb)),
             indexOrZero8x8(aa, sel8x8_1(bb)),
             indexOrZero8x8(aa, sel8x8_0(bb))
          );
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

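/* Collect the most significant bit of each of the eight 8-bit lanes
   into an 8-bit result, with the MSB of the highest lane ending up in
   bit 7 of the result (the same idea as the x86 pmovmskb operation). */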
UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

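/* Sum of the absolute differences of the four corresponding 8-bit
   lanes of xx and yy, returned as a single scalar UInt. */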
UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}


/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions  */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                      */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

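/* In dpb_to_bcd the ten input DPD bits are named p..y (p being the
   most significant) and the twelve output BCD bits are a..m (with l
   unused); the boolean equations below are the corresponding decode
   (DPD-to-BCD) equations -- see the Power ISA Appendix B reference
   in bcd_to_dpb below. */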
static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
       | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
       | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
           | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
           | PUT(k, 1) | PUT(m, 0);
   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal
      (DPD) value.  The boolean equations to calculate the value of
      each of the DPD bits are given in Appendix B of Book 1: Power
      ISA User Instruction Set.  The bits of the BCD value are
      [abcdefghijkm]; the bits of the DPD value are [pqrstuvwxy].
      The boolean logic equations in pseudo C code are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
       | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
       | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
           | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

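/* h_calc_DPBtoBCD expands a 50-bit DPD value (5 declets of 10 bits)
   into the corresponding 60-bit BCD value (15 digits of 4 bits), one
   declet at a time, most significant first; h_calc_BCDtoDPB performs
   the inverse compression. */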
ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
#undef NOT
#undef GET
#undef PUT


/* ----------------------------------------------------- */
/* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */
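
/* For example: udiv32(x, 0) and sdiv32(x, 0) both yield 0,
   sdiv32(-7, 2) yields -3 (rounding towards zero), and
   sdiv32(0x80000000, -1) yields 0x80000000, since the true result is
   not representable. */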

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}


/*---------------------------------------------------------------*/
/*--- end                                  host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/