
/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */
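
/* Editorial usage sketch (not part of the library): each helper below
   treats a ULong as a vector of lanes, pulls the lanes apart with the
   sel* functions, operates on them as scalars, and reassembles the
   result with the mk* functions.  For example, assuming only the
   declarations from host_generic_simd64.h:

      ULong xx = 0x0000000200000003ULL;           // lanes [2, 3]
      ULong yy = 0x0000000500000007ULL;           // lanes [5, 7]
      ULong rr = h_generic_calc_Add32x2(xx, yy);  // lanes [7, 10]
*/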

#include "libvex_basictypes.h"
#include "main_util.h"              // LIKELY, UNLIKELY
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}

static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
   Char zeroingMask = (Char)ix;
   zeroingMask ^= 0x80;
   zeroingMask >>= 7;
   ix &= 7;
   return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
}
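
/* Editorial gloss on indexOrZero8x8: bit 7 of ix doubles as a "zero
   the lane" flag.  XORing with 0x80 and arithmetically shifting the
   (signed) Char right by 7 sign-extends that flag into a byte-wide
   mask: ix < 0x80 gives zeroingMask == 0xFF (keep the selected byte),
   while ix >= 0x80 gives zeroingMask == 0x00 (force the result to
   zero). */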


/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
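
/* For instance (editorial examples): qadd8S(100, 100) saturates to
   127 rather than wrapping to -56, and qadd8U(200, 100) saturates to
   0xFF rather than wrapping to 44. */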

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
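
/* Editorial example of the wrap-vs-clamp distinction: for 8-bit lanes
   holding 0xFF and 0x01, h_generic_calc_Add8x8 produces 0x00 in that
   lane (wraparound), h_generic_calc_QAdd8Ux8 produces 0xFF (unsigned
   saturation), and h_generic_calc_QAdd8Sx8 treats the same bits as
   -1 + 1 and produces 0x00. */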

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}
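
/* Editorial example: qnarrow16Sto8U interprets each 16-bit lane as
   signed and clamps it into 0..255, so QNarrowBin16Sto8Ux8 maps a
   lane holding 0xFF80 (-128) to 0x00 and a lane holding 0x0100 (256)
   to 0xFF -- the kind of behaviour x86 'packuswb' has. */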

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* ------------ Permutation ------------ */

ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             indexOrZero8x8(aa, sel8x8_7(bb)),
             indexOrZero8x8(aa, sel8x8_6(bb)),
             indexOrZero8x8(aa, sel8x8_5(bb)),
             indexOrZero8x8(aa, sel8x8_4(bb)),
             indexOrZero8x8(aa, sel8x8_3(bb)),
             indexOrZero8x8(aa, sel8x8_2(bb)),
             indexOrZero8x8(aa, sel8x8_1(bb)),
             indexOrZero8x8(aa, sel8x8_0(bb))
          );
}
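
/* Editorial note on the permute semantics: result lane i is the byte
   of aa selected by the low three bits of bb's lane i.  For
   PermOrZero8x8, bit 7 of the selector additionally forces the result
   lane to zero, via indexOrZero8x8 above. */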

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
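
/* Editorial example: h_generic_calc_GetMSBs8x8(0x8000800080008000ULL)
   == 0xAA, since the sign bits of lanes 7, 5, 3 and 1 are set.  Lane
   i maps to result bit i, the same ordering as x86 'pmovmskb'. */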

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w0 ) {
   return (((UInt)w1) << 16) | ((UInt)w0);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
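
/* Editorial example: h_generic_calc_Sad8Ux4(0x01020304, 0x04030201)
   == 8, the sum of the per-lane absolute differences 3+1+1+3. */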

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}


/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 10-bit densely packed decimal (DPD) chunk to BCD. */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
            | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
            | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
            | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
            | PUT(k, 1) | PUT(m, 0);
   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal
    (DPD) value.  The boolean equations to calculate the value of each
    of the DPD bits are given in Appendix B of Book 1: Power ISA User
    Instruction Set.  The bits for the DPD number are [abcdefghijkm].
    The bits for the BCD value are [pqrstuvwxy].  The boolean logic
    equations in pseudo C code are:
    */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
            | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
            | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
            | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}
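
/* Worked example (editorial): the three BCD digits 0x123 pack to DPD
   0x0A3 and unpack again, i.e. bcd_to_dpb(0x123) == 0x0A3 and
   dpb_to_bcd(0x0A3) == 0x123.  When every digit is in 0..7 the low
   three bits of each digit pass through unchanged (with v == 0);
   digits 8 and 9 engage the special cases in the equations above. */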

ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
#undef NOT
#undef GET
#undef PUT



/* ----------------------------------------------------- */
/* Signed and unsigned integer division, behaving like the
   ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}
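
/* Illustrative corner cases (editorial): these helpers never trap.
   h_calc_udiv32_w_arm_semantics(5, 0) == 0, and
   h_calc_sdiv32_w_arm_semantics(0x80000000, -1) == (Int)0x80000000,
   since +2^31 is not representable in an Int -- the results the ARM
   UDIV/SDIV instructions produce. */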


/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/
