/* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
   Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
22 
23 #ifndef _SI2VMX_H_
24 #define _SI2VMX_H_	1
25 
26 #ifndef __SPU__
27 
28 #include <stdlib.h>
29 #include <vec_types.h>
30 
31 
32 /* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
33  * Users can override the action by defining it prior to including this
34  * header file.
35  */
36 #ifndef SPU_HALT_ACTION
37 #define SPU_HALT_ACTION		abort()
38 #endif
39 
40 /* Specify a default stop action for the spu_stop intrinsic.
41  * Users can override the action by defining it prior to including this
42  * header file.
43  */
44 #ifndef SPU_STOP_ACTION
45 #define SPU_STOP_ACTION		abort()
46 #endif
47 
48 
49 /* Specify a default action for unsupported intrinsic.
50  * Users can override the action by defining it prior to including this
51  * header file.
52  */
53 #ifndef SPU_UNSUPPORTED_ACTION
54 #define SPU_UNSUPPORTED_ACTION	abort()
55 #endif
56 
57 
58 /* Casting intrinsics - from scalar to quadword
59  */
60 
si_from_uchar(unsigned char c)61 static __inline qword si_from_uchar(unsigned char c) {
62   union {
63     qword q;
64     unsigned char c[16];
65   } x;
66   x.c[3] = c;
67   return (x.q);
68 }
69 
si_from_char(signed char c)70 static __inline qword si_from_char(signed char c) {
71   union {
72     qword q;
73     signed char c[16];
74   } x;
75   x.c[3] = c;
76   return (x.q);
77 }
78 
si_from_ushort(unsigned short s)79 static __inline qword si_from_ushort(unsigned short s) {
80   union {
81     qword q;
82     unsigned short s[8];
83   } x;
84   x.s[1] = s;
85   return (x.q);
86 }
87 
si_from_short(short s)88 static __inline qword si_from_short(short s) {
89   union {
90     qword q;
91     short s[8];
92   } x;
93   x.s[1] = s;
94   return (x.q);
95 }
96 
97 
si_from_uint(unsigned int i)98 static __inline qword si_from_uint(unsigned int i) {
99   union {
100     qword q;
101     unsigned int i[4];
102   } x;
103   x.i[0] = i;
104   return (x.q);
105 }
106 
si_from_int(int i)107 static __inline qword si_from_int(int i) {
108   union {
109     qword q;
110     int i[4];
111   } x;
112   x.i[0] = i;
113   return (x.q);
114 }
115 
si_from_ullong(unsigned long long l)116 static __inline qword si_from_ullong(unsigned long long l) {
117   union {
118     qword q;
119     unsigned long long l[2];
120   } x;
121   x.l[0] = l;
122   return (x.q);
123 }
124 
si_from_llong(long long l)125 static __inline qword si_from_llong(long long l) {
126   union {
127     qword q;
128     long long l[2];
129   } x;
130   x.l[0] = l;
131   return (x.q);
132 }
133 
si_from_float(float f)134 static __inline qword si_from_float(float f) {
135   union {
136     qword q;
137     float f[4];
138   } x;
139   x.f[0] = f;
140   return (x.q);
141 }
142 
si_from_double(double d)143 static __inline qword si_from_double(double d) {
144   union {
145     qword q;
146     double d[2];
147   } x;
148   x.d[0] = d;
149   return (x.q);
150 }
151 
si_from_ptr(void * ptr)152 static __inline qword si_from_ptr(void *ptr) {
153   union {
154     qword q;
155     void *p;
156   } x;
157   x.p = ptr;
158   return (x.q);
159 }
160 
161 
162 /* Casting intrinsics - from quadword to scalar
163  */
si_to_uchar(qword q)164 static __inline unsigned char si_to_uchar(qword q) {
165   union {
166     qword q;
167     unsigned char c[16];
168   } x;
169   x.q = q;
170   return (x.c[3]);
171 }
172 
si_to_char(qword q)173 static __inline signed char si_to_char(qword q) {
174   union {
175     qword q;
176     signed char c[16];
177   } x;
178   x.q = q;
179   return (x.c[3]);
180 }
181 
si_to_ushort(qword q)182 static __inline unsigned short si_to_ushort(qword q) {
183   union {
184     qword q;
185     unsigned short s[8];
186   } x;
187   x.q = q;
188   return (x.s[1]);
189 }
190 
si_to_short(qword q)191 static __inline short si_to_short(qword q) {
192   union {
193     qword q;
194     short s[8];
195   } x;
196   x.q = q;
197   return (x.s[1]);
198 }
199 
si_to_uint(qword q)200 static __inline unsigned int si_to_uint(qword q) {
201   union {
202     qword q;
203     unsigned int i[4];
204   } x;
205   x.q = q;
206   return (x.i[0]);
207 }
208 
si_to_int(qword q)209 static __inline int si_to_int(qword q) {
210   union {
211     qword q;
212     int i[4];
213   } x;
214   x.q = q;
215   return (x.i[0]);
216 }
217 
si_to_ullong(qword q)218 static __inline unsigned long long si_to_ullong(qword q) {
219   union {
220     qword q;
221     unsigned long long l[2];
222   } x;
223   x.q = q;
224   return (x.l[0]);
225 }
226 
si_to_llong(qword q)227 static __inline long long si_to_llong(qword q) {
228   union {
229     qword q;
230     long long l[2];
231   } x;
232   x.q = q;
233   return (x.l[0]);
234 }
235 
si_to_float(qword q)236 static __inline float si_to_float(qword q) {
237   union {
238     qword q;
239     float f[4];
240   } x;
241   x.q = q;
242   return (x.f[0]);
243 }
244 
si_to_double(qword q)245 static __inline double si_to_double(qword q) {
246   union {
247     qword q;
248     double d[2];
249   } x;
250   x.q = q;
251   return (x.d[0]);
252 }
253 
si_to_ptr(qword q)254 static __inline void * si_to_ptr(qword q) {
255   union {
256     qword q;
257     void *p;
258   } x;
259   x.q = q;
260   return (x.p);
261 }
262 
263 
264 /* Absolute difference
265  */
si_absdb(qword a,qword b)266 static __inline qword si_absdb(qword a, qword b)
267 {
268   vec_uchar16 ac, bc, dc;
269 
270   ac = (vec_uchar16)(a);
271   bc = (vec_uchar16)(b);
272   dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
273 
274   return ((qword)(dc));
275 }
276 
277 /* Add intrinsics
278  */
279 #define si_a(_a, _b)		((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))
280 
281 #define si_ah(_a, _b)		((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
282 
si_ai(qword a,int b)283 static __inline qword si_ai(qword a, int b)
284 {
285   return ((qword)(vec_add((vec_int4)(a),
286 			  vec_splat((vec_int4)(si_from_int(b)), 0))));
287 }
288 
289 
si_ahi(qword a,short b)290 static __inline qword si_ahi(qword a, short b)
291 {
292   return ((qword)(vec_add((vec_short8)(a),
293 			  vec_splat((vec_short8)(si_from_short(b)), 1))));
294 }
295 
296 
297 #define si_fa(_a, _b)	((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
298 
299 
si_dfa(qword a,qword b)300 static __inline qword si_dfa(qword a, qword b)
301 {
302   union {
303     vec_double2 v;
304     double d[2];
305   } ad, bd, dd;
306 
307   ad.v = (vec_double2)(a);
308   bd.v = (vec_double2)(b);
309   dd.d[0] = ad.d[0] + bd.d[0];
310   dd.d[1] = ad.d[1] + bd.d[1];
311 
312   return ((qword)(dd.v));
313 }
314 
/* Add word extended: a + b + (c & 1), i.e. add with carry-in taken
 * from the least significant bit of each word of _c.
 */
#define si_addx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), 	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))


/* Bit-wise AND
 */
#define si_and(_a, _b)		((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
324 
325 
si_andbi(qword a,signed char b)326 static __inline qword si_andbi(qword a, signed char b)
327 {
328   return ((qword)(vec_and((vec_char16)(a),
329 			  vec_splat((vec_char16)(si_from_char(b)), 3))));
330 }
331 
si_andhi(qword a,signed short b)332 static __inline qword si_andhi(qword a, signed short b)
333 {
334   return ((qword)(vec_and((vec_short8)(a),
335 			  vec_splat((vec_short8)(si_from_short(b)), 1))));
336 }
337 
338 
si_andi(qword a,signed int b)339 static __inline qword si_andi(qword a, signed int b)
340 {
341   return ((qword)(vec_and((vec_int4)(a),
342 			  vec_splat((vec_int4)(si_from_int(b)), 0))));
343 }
344 
345 
/* Bit-wise AND with complement
 */
#define si_andc(_a, _b)		((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))


/* Average byte vectors
 */
#define si_avgb(_a, _b)		((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))


/* Branch indirect and set link on external data.  These depend on SPU
 * channel/event facilities with no VMX equivalent, so they expand to
 * nothing.
 */
#define si_bisled(_func)	/* not mappable */
#define si_bisledd(_func)	/* not mappable */
#define si_bislede(_func)	/* not mappable */


/* Borrow generate: 1 in each word where subtracting _b from _a does
 * not borrow.  si_bgx additionally folds in the borrow-in from _c.
 */
#define si_bg(_a, _b)		((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))

#define si_bgx(_a, _b, _c)	((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)),		\
							vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), 	\
								(vec_uint4)(_c))), vec_splat_u32(1))))
370 
371 /* Compare absolute equal
372  */
si_fcmeq(qword a,qword b)373 static __inline qword si_fcmeq(qword a, qword b)
374 {
375   vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
376 
377   return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
378 				  vec_andc((vec_float4)(b), msb))));
379 }
380 
si_dfcmeq(qword a,qword b)381 static __inline qword si_dfcmeq(qword a, qword b)
382 {
383   vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
384   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
385   vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};
386 
387   vec_uint4 biteq;
388   vec_uint4 aabs;
389   vec_uint4 babs;
390   vec_uint4 a_gt;
391   vec_uint4 ahi_inf;
392   vec_uint4 anan;
393   vec_uint4 result;
394 
395   union {
396     vec_uchar16 v;
397     int i[4];
398   } x;
399 
400   /* Shift 4 bytes  */
401   x.i[3] = 4 << 3;
402 
403   /*  Mask out sign bits */
404   aabs = vec_and((vec_uint4)a,sign_mask);
405   babs = vec_and((vec_uint4)b,sign_mask);
406 
407   /*  A)  Check for bit equality, store in high word */
408   biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
409   biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
410 
411   /*
412       B)  Check if a is NaN, store in high word
413 
414       B1) If the high word is greater than max_exp (indicates a NaN)
415       B2) If the low word is greater than 0
416   */
417   a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
418 
419   /*  B3) Check if the high word is equal to the inf exponent */
420   ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
421 
422   /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
423   anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
424 
425   /*  result = A and not B  */
426   result = vec_andc(biteq, anan);
427 
428   /*  Promote high words to 64 bits and return  */
429   return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
430 }
431 
432 
433 /* Compare absolute greater than
434  */
si_fcmgt(qword a,qword b)435 static __inline qword si_fcmgt(qword a, qword b)
436 {
437   vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
438 
439   return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
440 				  vec_andc((vec_float4)(b), msb))));
441 }
442 
si_dfcmgt(qword a,qword b)443 static __inline qword si_dfcmgt(qword a, qword b)
444 {
445   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
446   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
447   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
448 
449   union {
450     vec_uchar16 v;
451     int i[4];
452   } x;
453 
454   /* Shift 4 bytes  */
455   x.i[3] = 4 << 3;
456 
457   // absolute value of a,b
458   vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
459   vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
460 
461   // check if a is nan
462   vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
463   vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
464   a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
465   a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
466 
467   // check if b is nan
468   vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
469   vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
470   b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
471   b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
472 
473   // A) Check if the exponents are different
474   vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);
475 
476   // B) Check if high word equal, and low word greater
477   vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
478   vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
479   vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
480 
481   //  If either A or B is true, return true (unless NaNs detected)
482   vec_uint4 r = vec_or(gt_hi, eqgt);
483 
484   // splat the high words of the comparison step
485   r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
486 
487   // correct for NaNs in input
488   return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
489 }
490 
491 
492 /* Compare equal
493  */
si_ceqb(qword a,qword b)494 static __inline qword si_ceqb(qword a, qword b)
495 {
496   return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
497 }
498 
si_ceqh(qword a,qword b)499 static __inline qword si_ceqh(qword a, qword b)
500 {
501   return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
502 }
503 
si_ceq(qword a,qword b)504 static __inline qword si_ceq(qword a, qword b)
505 {
506   return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
507 }
508 
si_fceq(qword a,qword b)509 static __inline qword si_fceq(qword a, qword b)
510 {
511   return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
512 }
513 
si_ceqbi(qword a,signed char b)514 static __inline qword si_ceqbi(qword a, signed char b)
515 {
516   return ((qword)(vec_cmpeq((vec_char16)(a),
517 			    vec_splat((vec_char16)(si_from_char(b)), 3))));
518 }
519 
si_ceqhi(qword a,signed short b)520 static __inline qword si_ceqhi(qword a, signed short b)
521 {
522   return ((qword)(vec_cmpeq((vec_short8)(a),
523 			  vec_splat((vec_short8)(si_from_short(b)), 1))));
524 }
525 
si_ceqi(qword a,signed int b)526 static __inline qword si_ceqi(qword a, signed int b)
527 {
528   return ((qword)(vec_cmpeq((vec_int4)(a),
529 			  vec_splat((vec_int4)(si_from_int(b)), 0))));
530 }
531 
si_dfceq(qword a,qword b)532 static __inline qword si_dfceq(qword a, qword b)
533 {
534   vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
535   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
536   vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};
537 
538   vec_uint4 biteq;
539   vec_uint4 aabs;
540   vec_uint4 babs;
541   vec_uint4 a_gt;
542   vec_uint4 ahi_inf;
543   vec_uint4 anan;
544   vec_uint4 iszero;
545   vec_uint4 result;
546 
547   union {
548     vec_uchar16 v;
549     int i[4];
550   } x;
551 
552   /* Shift 4 bytes  */
553   x.i[3] = 4 << 3;
554 
555   /*  A)  Check for bit equality, store in high word */
556   biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
557   biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
558 
559   /*  Mask out sign bits */
560   aabs = vec_and((vec_uint4)a,sign_mask);
561   babs = vec_and((vec_uint4)b,sign_mask);
562 
563   /*
564       B)  Check if a is NaN, store in high word
565 
566       B1) If the high word is greater than max_exp (indicates a NaN)
567       B2) If the low word is greater than 0
568   */
569   a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
570 
571   /*  B3) Check if the high word is equal to the inf exponent */
572   ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
573 
574   /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
575   anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
576 
577   /*  C)  Check for 0 = -0 special case */
578   iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
579   iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
580 
581   /*  result = (A or C) and not B  */
582   result = vec_or(biteq,iszero);
583   result = vec_andc(result, anan);
584 
585   /*  Promote high words to 64 bits and return  */
586   return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
587 }
588 
589 
590 /* Compare greater than
591  */
si_cgtb(qword a,qword b)592 static __inline qword si_cgtb(qword a, qword b)
593 {
594   return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
595 }
596 
si_cgth(qword a,qword b)597 static __inline qword si_cgth(qword a, qword b)
598 {
599   return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
600 }
601 
si_cgt(qword a,qword b)602 static __inline qword si_cgt(qword a, qword b)
603 {
604   return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
605 }
606 
si_clgtb(qword a,qword b)607 static __inline qword si_clgtb(qword a, qword b)
608 {
609   return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
610 }
611 
si_clgth(qword a,qword b)612 static __inline qword si_clgth(qword a, qword b)
613 {
614   return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
615 }
616 
si_clgt(qword a,qword b)617 static __inline qword si_clgt(qword a, qword b)
618 {
619   return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
620 }
621 
si_fcgt(qword a,qword b)622 static __inline qword si_fcgt(qword a, qword b)
623 {
624   return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
625 }
626 
si_dfcgt(qword a,qword b)627 static __inline qword si_dfcgt(qword a, qword b)
628 {
629   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
630   vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
631   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
632   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
633 
634   union {
635     vec_uchar16 v;
636     int i[4];
637   } x;
638 
639   /* Shift 4 bytes  */
640   x.i[3] = 4 << 3;
641 
642   // absolute value of a,b
643   vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
644   vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
645 
646   // check if a is nan
647   vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
648   vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
649   a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
650   a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
651 
652   // check if b is nan
653   vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
654   vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
655   b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
656   b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
657 
658   // sign of a
659   vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
660   asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);
661 
662   // sign of b
663   vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
664   bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);
665 
666   // negative a
667   vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
668   vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
669   abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
670   vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));
671 
672   // pick the one we want
673   vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);
674 
675   // negative b
676   vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
677   bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
678   vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));
679 
680   // pick the one we want
681   vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);
682 
683   // A) Check if the exponents are different
684   vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);
685 
686   // B) Check if high word equal, and low word greater
687   vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
688   vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
689   vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
690 
691   //  If either A or B is true, return true (unless NaNs detected)
692   vec_uint4 r = vec_or(gt_hi, eqgt);
693 
694   // splat the high words of the comparison step
695   r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
696 
697   // correct for NaNs in input
698   return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
699 }
700 
si_cgtbi(qword a,signed char b)701 static __inline qword si_cgtbi(qword a, signed char b)
702 {
703   return ((qword)(vec_cmpgt((vec_char16)(a),
704 			    vec_splat((vec_char16)(si_from_char(b)), 3))));
705 }
706 
si_cgthi(qword a,signed short b)707 static __inline qword si_cgthi(qword a, signed short b)
708 {
709   return ((qword)(vec_cmpgt((vec_short8)(a),
710 			    vec_splat((vec_short8)(si_from_short(b)), 1))));
711 }
712 
si_cgti(qword a,signed int b)713 static __inline qword si_cgti(qword a, signed int b)
714 {
715   return ((qword)(vec_cmpgt((vec_int4)(a),
716 			    vec_splat((vec_int4)(si_from_int(b)), 0))));
717 }
718 
si_clgtbi(qword a,unsigned char b)719 static __inline qword si_clgtbi(qword a, unsigned char b)
720 {
721   return ((qword)(vec_cmpgt((vec_uchar16)(a),
722 			    vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
723 }
724 
si_clgthi(qword a,unsigned short b)725 static __inline qword si_clgthi(qword a, unsigned short b)
726 {
727   return ((qword)(vec_cmpgt((vec_ushort8)(a),
728 			    vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
729 }
730 
si_clgti(qword a,unsigned int b)731 static __inline qword si_clgti(qword a, unsigned int b)
732 {
733   return ((qword)(vec_cmpgt((vec_uint4)(a),
734 			    vec_splat((vec_uint4)(si_from_uint(b)), 0))));
735 }
736 
si_dftsv(qword a,char b)737 static __inline qword si_dftsv(qword a, char b)
738 {
739   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
740   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
741   vec_uint4 result = (vec_uint4){0};
742   vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
743   sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
744   vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);
745 
746   union {
747     vec_uchar16 v;
748     int i[4];
749   } x;
750 
751   /* Shift 4 bytes  */
752   x.i[3] = 4 << 3;
753 
754   /* Nan or +inf or -inf  */
755   if (b & 0x70)
756   {
757     vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
758     vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
759      /* NaN  */
760      if (b & 0x40)
761      {
762        vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
763        a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
764        a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
765        result = vec_or(result, a_nan);
766      }
767      /* inf  */
768      if (b & 0x30)
769      {
770        a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
771        a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
772         /* +inf  */
773         if (b & 0x20)
774           result = vec_or(vec_andc(a_inf, sign), result);
775         /* -inf  */
776         if (b & 0x10)
777           result = vec_or(vec_and(a_inf, sign), result);
778      }
779   }
780   /* 0 or denorm  */
781   if (b & 0xF)
782   {
783     vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
784     iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
785     /* denorm  */
786     if (b & 0x3)
787     {
788       vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
789       vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
790       isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
791       /* +denorm  */
792      if (b & 0x2)
793         result = vec_or(vec_andc(isdenorm, sign), result);
794       /* -denorm  */
795      if (b & 0x1)
796         result = vec_or(vec_and(isdenorm, sign), result);
797     }
798     /* 0  */
799     if (b & 0xC)
800     {
801       iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
802       /* +0  */
803      if (b & 0x8)
804         result = vec_or(vec_andc(iszero, sign), result);
805       /* -0  */
806      if (b & 0x4)
807         result = vec_or(vec_and(iszero, sign), result);
808     }
809   }
810   return ((qword)result);
811 }
812 
813 
/* Carry generate: 1 in each word where a + b carries out.  si_cgx also
 * folds in the carry-in taken from the least significant bit of _c.
 */
#define si_cg(_a, _b)		((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))

#define si_cgx(_a, _b, _c)	((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), 		\
						vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),	\
							 vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
821 
822 
823 /* Count ones for bytes
824  */
si_cntb(qword a)825 static __inline qword si_cntb(qword a)
826 {
827   vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
828   vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
829   vec_uchar16 av;
830 
831   av = (vec_uchar16)(a);
832 
833   return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
834 			  vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
835 }
836 
837 /* Count ones for bytes
838  */
si_clz(qword a)839 static __inline qword si_clz(qword a)
840 {
841   vec_uchar16 av;
842   vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
843   vec_uchar16 four    = vec_splat_u8(4);
844   vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
845   vec_uchar16 eight   = vec_splat_u8(8);
846   vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
847   vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};
848 
849   av = (vec_uchar16)(a);
850 
851   cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
852   cnt_lo = vec_perm(nib_cnt, nib_cnt, av);
853 
854   cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));
855 
856   tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
857   tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
858   tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));
859 
860   cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
861   cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
862   cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));
863 
864   return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
865 }
866 
/* Convert to float
 */
#define si_cuflt(_a, _b)	((qword)(vec_ctf((vec_uint4)(_a), _b)))
#define si_csflt(_a, _b)	((qword)(vec_ctf((vec_int4)(_a), _b)))

/* Convert to signed int
 */
#define si_cflts(_a, _b)	((qword)(vec_cts((vec_float4)(_a), _b)))

/* Convert to unsigned int
 */
#define si_cfltu(_a, _b)	((qword)(vec_ctu((vec_float4)(_a), _b)))

/* Synchronize.  SPU channel synchronization has no PPU/VMX analogue, so
 * these expand to nothing.
 */
#define si_dsync()		/* do nothing */
#define si_sync()		/* do nothing */
#define si_syncc()		/* do nothing */
886 
887 /* Equivalence
888  */
si_eqv(qword a,qword b)889 static __inline qword si_eqv(qword a, qword b)
890 {
891   vec_uchar16 d;
892 
893   d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
894   return ((qword)(vec_nor(d, d)));
895 }
896 
897 /* Extend
898  */
si_xsbh(qword a)899 static __inline qword si_xsbh(qword a)
900 {
901   vec_char16 av;
902 
903   av = (vec_char16)(a);
904   return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
905 						              0, 0, 0, 0, 0, 0, 0, 0})))));
906 }
907 
si_xshw(qword a)908 static __inline qword si_xshw(qword a)
909 {
910   vec_short8 av;
911 
912   av = (vec_short8)(a);
913   return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7,
914 					                      10,11,14,15,
915 							      0, 0, 0, 0,
916 						              0, 0, 0, 0})))));
917 }
918 
si_xswd(qword a)919 static __inline qword si_xswd(qword a)
920 {
921   vec_int4 av;
922 
923   av = (vec_int4)(a);
924   return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
925 			   ((vec_uchar16){20, 21, 22, 23,
926 					   4,  5,  6,  7,
927 				          28, 29, 30, 31,
928 				          12, 13, 14, 15}))));
929 }
930 
si_fesd(qword a)931 static __inline qword si_fesd(qword a)
932 {
933   union {
934     double d[2];
935     vec_double2	vd;
936   } out;
937   union {
938     float f[4];
939     vec_float4 vf;
940   } in;
941 
942   in.vf = (vec_float4)(a);
943   out.d[0] = (double)(in.f[0]);
944   out.d[1] = (double)(in.f[2]);
945   return ((qword)(out.vd));
946 }
947 
948 /* Gather
949  */
si_gbb(qword a)950 static __inline qword si_gbb(qword a)
951 {
952   vec_uchar16 bits;
953   vec_uint4   bytes;
954 
955   bits  = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
956 								            7, 6, 5, 4, 3, 2, 1, 0}));
957   bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));
958 
959   return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
960 					                0, 0, 0, 0, 0, 0, 0, 0}))));
961 }
962 
963 
si_gbh(qword a)964 static __inline qword si_gbh(qword a)
965 {
966   vec_ushort8 bits;
967   vec_uint4   bytes;
968 
969   bits  = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));
970 
971   bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});
972 
973   return ((qword)(vec_sld(bytes, bytes, 12)));
974 }
975 
si_gb(qword a)976 static __inline qword si_gb(qword a)
977 {
978   vec_uint4 bits;
979   vec_uint4 bytes;
980 
981   bits  = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
982   bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
983   return ((qword)(vec_sld(bytes, bytes, 12)));
984 }
985 
986 
/* Compare and halt
 *
 * Each of these compares only the preferred (word slot 0) elements of
 * its operands and invokes SPU_HALT_ACTION when the condition holds.
 * Unions are used to read the scalar slot without aliasing casts.
 */
/* Halt if equal (word slot 0 of a == word slot 0 of b).  */
static __inline void si_heq(qword a, qword b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa, bb;

  aa.v = (vector unsigned int)(a);
  bb.v = (vector unsigned int)(b);

  if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
}

/* Halt if equal to immediate.  */
static __inline void si_heqi(qword a, unsigned int b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa;

  aa.v = (vector unsigned int)(a);

  if (aa.i[0] == b) { SPU_HALT_ACTION; };
}

/* Halt if signed greater than.  */
static __inline void si_hgt(qword a, qword b)
{
  union {
    vector signed int v;
    signed int i[4];
  } aa, bb;

  aa.v = (vector signed int)(a);
  bb.v = (vector signed int)(b);

  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
}

/* Halt if signed greater than immediate.  */
static __inline void si_hgti(qword a, signed int b)
{
  union {
    vector signed int v;
    signed int i[4];
  } aa;

  aa.v = (vector signed int)(a);

  if (aa.i[0] > b) { SPU_HALT_ACTION; };
}

/* Halt if logically (unsigned) greater than.  */
static __inline void si_hlgt(qword a, qword b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa, bb;

  aa.v = (vector unsigned int)(a);
  bb.v = (vector unsigned int)(b);

  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
}

/* Halt if logically greater than immediate.  */
static __inline void si_hlgti(qword a, unsigned int b)
{
  union {
    vector unsigned int v;
    unsigned int i[4];
  } aa;

  aa.v = (vector unsigned int)(a);

  if (aa.i[0] > b) { SPU_HALT_ACTION; };
}
1063 
1064 
/* Multiply and Add
 */
/* si_mpya - 16x16 multiply of the odd (rightmost) halfwords with 32-bit
 * add: result = (a & 0xffff) * (b & 0xffff-lane) + c per word.  The AND
 * mask keeps only the odd halfwords of A so vec_msum contributes a
 * single product per word.  */
static __inline qword si_mpya(qword a, qword b, qword c)
{
  return ((qword)(vec_msum(vec_and((vec_short8)(a),
				   ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
			   (vec_short8)(b), (vec_int4)(c))));
}

/* si_fma - single-precision fused multiply-add: a * b + c.  */
static __inline qword si_fma(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}

/* si_dfma - double-precision multiply-add, done with scalar arithmetic
 * since VMX has no double-precision vector operations.  */
static __inline qword si_dfma(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
  dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
  return ((qword)(dd.v));
}
1093 
/* Form Mask
 */
#define si_fsmbi(_a)	si_fsmb(si_from_int(_a))

/* si_fsmb - form select mask for bytes: bit i of the 16-bit field in
 * the preferred halfword of A expands to byte i = 0x00 or 0xFF.  */
static __inline qword si_fsmb(qword a)
{
  vec_char16 mask;
  vec_ushort8 in;

  in = (vec_ushort8)(a);
  /* Replicate source byte 2 into result bytes 0-7 and source byte 3
   * into bytes 8-15 (the two halves of the 16-bit mask field).  */
  mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
					              3, 3, 3, 3, 3, 3, 3, 3})));
  /* Move each byte's selector bit into the sign position, then
   * arithmetic-shift to smear it across the whole byte.  */
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
				                      0, 1, 2, 3, 4, 5, 6, 7})),
			  vec_splat_u8(7))));
}


/* si_fsmh - form select mask for halfwords from the low 8 bits of the
 * preferred word slot of A.  */
static __inline qword si_fsmh(qword a)
{
  vec_uchar16 in;
  vec_short8 mask;

  in = (vec_uchar16)(a);
  /* Byte 3 holds the 8 selector bits; splat it to every halfword.  */
  mask = (vec_short8)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
			  vec_splat_u16(15))));
}

/* si_fsm - form select mask for words from the low 4 bits of the
 * preferred word slot of A.  */
static __inline qword si_fsm(qword a)
{
  vec_uchar16 in;
  vec_int4 mask;

  in = (vec_uchar16)(a);
  mask = (vec_int4)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
			  ((vec_uint4){31,31,31,31}))));
}
1133 
/* Move from/to registers
 *
 * FP status/control and special-purpose registers have no PPU/VMX
 * equivalents; reads yield zero and writes are discarded.
 */
#define si_fscrrd()		((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg)		((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)

/* Multiply High High Add
 */
/* si_mpyhha - signed multiply of the even (leftmost) halfwords of each
 * word pair, plus C: result = (a >> 16) * (b >> 16) + c per word.  */
static __inline qword si_mpyhha(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
}

/* si_mpyhhau - unsigned variant of si_mpyhha.  */
static __inline qword si_mpyhhau(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
}
1153 
/* Multiply Subtract
 */
/* si_fms - single-precision multiply-subtract: a * b - c, expressed as
 * a * b + (0 - c) since VMX only provides madd/nmsub.  */
static __inline qword si_fms(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
			   vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
}

/* si_dfms - double-precision multiply-subtract via scalar math.  */
static __inline qword si_dfms(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
  dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
  return ((qword)(dd.v));
}
1176 
/* Multiply
 */
/* si_fm - single-precision multiply: a * b, via madd with +0.0.  */
static __inline qword si_fm(qword a, qword b)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
}

/* si_dfm - double-precision multiply via scalar math.  */
static __inline qword si_dfm(qword a, qword b)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  dd.d[0] = aa.d[0] * bb.d[0];
  dd.d[1] = aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
1197 
/* Multiply High
 */
/* si_mpyh - per word: ((a >> 16) * (b & 0xffff)) << 16.  Shifting B
 * left 16 moves its low halfword into the even lane that vec_mule
 * multiplies against A's high halfword.  */
static __inline qword si_mpyh(qword a, qword b)
{
  vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};

  return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
}


/* Multiply High High
 */
/* si_mpyhh - signed product of the even (high) halfwords per word.  */
static __inline qword si_mpyhh(qword a, qword b)
{
  return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
}

/* si_mpyhhu - unsigned product of the even (high) halfwords per word.  */
static __inline qword si_mpyhhu(qword a, qword b)
{
  return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Multiply Odd
 */
/* si_mpy - signed 16x16->32 product of the odd (low) halfwords.  */
static __inline qword si_mpy(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
}

/* si_mpyu - unsigned 16x16->32 product of the odd (low) halfwords.  */
static __inline qword si_mpyu(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* si_mpyi - signed multiply of the odd halfwords by immediate B.
 * Halfword slot 1 of si_from_short(b) holds the scalar; splat it.  */
static __inline qword si_mpyi(qword a, short b)
{
  return ((qword)(vec_mulo((vec_short8)(a),
			   vec_splat((vec_short8)(si_from_short(b)), 1))));
}

/* si_mpyui - unsigned multiply of the odd halfwords by immediate B.  */
static __inline qword si_mpyui(qword a, unsigned short b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a),
			   vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* Multiply and Shift Right
 */
/* si_mpys - signed product of the odd halfwords, arithmetically
 * shifted right 16 bits.  */
static __inline qword si_mpys(qword a, qword b)
{
  return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
}
1250 
1251 /* Nand
1252  */
/* si_nand - bitwise NAND: ~(a & b).  VMX has no nand, so build it from
 * vec_and followed by vec_nor of the conjunction with itself.  */
static __inline qword si_nand(qword a, qword b)
{
  vec_uchar16 conj = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
  return ((qword)(vec_nor(conj, conj)));
}
1260 
/* Negative Multiply Add
 */
/* si_dfnma - double-precision negative multiply-add:
 * result = -(a * b + c), done with scalar math.  */
static __inline qword si_dfnma(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}

/* Negative Multiply and Subtract
 */
/* si_fnms - single-precision: c - a * b, mapping directly onto the
 * VMX nmsub instruction.  */
static __inline qword si_fnms(qword a, qword b, qword c)
{
  return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}

/* si_dfnms - double-precision: c - a * b, via scalar math.  */
static __inline qword si_dfnms(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
1299 
/* Nor
 */
/* si_nor - bitwise NOR: ~(a | b).  */
static __inline qword si_nor(qword a, qword b)
{
  return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* Or
 */
/* si_or - bitwise OR of two quadwords.  */
static __inline qword si_or(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* si_orbi - OR each byte with immediate B (byte slot 3 of the scalar
 * quadword, splatted to all lanes).  */
static __inline qword si_orbi(qword a, unsigned char b)
{
  return ((qword)(vec_or((vec_uchar16)(a),
			 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* si_orhi - OR each halfword with immediate B.  */
static __inline qword si_orhi(qword a, unsigned short b)
{
  return ((qword)(vec_or((vec_ushort8)(a),
			  vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* si_ori - OR each word with immediate B.  */
static __inline qword si_ori(qword a, unsigned int b)
{
  return ((qword)(vec_or((vec_uint4)(a),
			  vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}

/* Or Complement
 */
/* si_orc - a | ~b; the complement is built with vec_nor(b, b).  */
static __inline qword si_orc(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
}


/* Or Across
 */
/* si_orx - OR the four word elements together; the combined word is
 * left in slot 0 and the remaining slots are forced to zero.  */
static __inline qword si_orx(qword a)
{
  vec_uchar16 tmp;
  tmp = (vec_uchar16)(a);
  /* Fold the halves together: after these two steps bytes 0-3 hold the
   * OR of all four words.  */
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
  return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
				              0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
}
1351 
1352 
1353 /* Estimates
1354  */
si_frest(qword a)1355 static __inline qword si_frest(qword a)
1356 {
1357   return ((qword)(vec_re((vec_float4)(a))));
1358 }
1359 
si_frsqest(qword a)1360 static __inline qword si_frsqest(qword a)
1361 {
1362   return ((qword)(vec_rsqrte((vec_float4)(a))));
1363 }
1364 
/* si_fi - float interpolate.  The VMX estimate instructions already
 * deliver the refined result, so the interpolation step reduces to
 * returning its second operand.  */
#define si_fi(_a, _d)		(_d)

/* Channel Read and Write
 */
#define si_rdch(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_rchcnt(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_wrch(_channel, _a)		/* not mappable */

/* Rotate Left
 */
/* si_roth - rotate each halfword left by the corresponding count in B.  */
static __inline qword si_roth(qword a, qword b)
{
  return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* si_rot - rotate each word left by the corresponding count in B.  */
static __inline qword si_rot(qword a, qword b)
{
  return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
}

/* si_rothi - rotate each halfword left by immediate B.  */
static __inline qword si_rothi(qword a, int b)
{
  return ((qword)(vec_rl((vec_ushort8)(a),
			 vec_splat((vec_ushort8)(si_from_int(b)), 1))));
}

/* si_roti - rotate each word left by immediate B.  */
static __inline qword si_roti(qword a, int b)
{
  return ((qword)(vec_rl((vec_uint4)(a),
			 vec_splat((vec_uint4)(si_from_int(b)), 0))));
}
1396 
/* Rotate Left with Mask
 *
 * SPU "rotate and mask" is a logical right shift by the negated count
 * that yields zero for counts >= the element size.  VMX shifts use the
 * count modulo the element size, so a mask is computed (by extracting
 * the "count >= width" bit into the sign position and smearing it) and
 * used to clear lanes that should have shifted out entirely.
 */
/* si_rothm - per-halfword logical right shift by (-b), zero if >= 16.  */
static __inline qword si_rothm(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  /* sl 11 moves bit 4 (value 16) of the count into the sign bit;
   * sra 15 smears it, giving all-ones where the shift count >= 16.  */
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

/* si_rotm - per-word logical right shift by (-b), zero if >= 32.  */
static __inline qword si_rotm(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  /* Bit 5 (value 32) of the count selects lanes to be cleared.  */
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}

/* si_rothmi - immediate form of si_rothm.  */
static __inline qword si_rothmi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

/* si_rotmi - immediate form of si_rotm.  */
static __inline qword si_rotmi(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}
1438 
1439 
/* Rotate Left Algebraic with Mask
 *
 * Arithmetic right shift by the negated count.  For counts >= the
 * element size the result must be all sign bits; ORing the count with
 * the "count >= width" mask pins it at the maximum, which vec_sra
 * (count modulo width) turns into a width-1 shift.
 */
/* si_rotmah - per-halfword arithmetic right shift by (-b).  */
static __inline qword si_rotmah(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  /* All-ones where the shift count >= 16 (bit 4 of the count).  */
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

/* si_rotma - per-word arithmetic right shift by (-b).  */
static __inline qword si_rotma(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}


/* si_rotmahi - immediate form of si_rotmah.  */
static __inline qword si_rotmahi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

/* si_rotmai - immediate form of si_rotma.  */
static __inline qword si_rotmai(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}
1482 
1483 
/* Rotate Left Quadword by Bytes with Mask
 *
 * These shift the whole quadword right by (-count) bytes, producing
 * zero when the byte count reaches 16.
 */
static __inline qword si_rotqmbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  vec_uchar16 mask;

  count = 0 - count;
  /* vec_sro takes its bit count from byte 15, i.e. the low byte of
   * i[3].  NOTE(review): elements 0-2 of x are left uninitialized;
   * vec_sro only reads byte 15, so they never affect the result.  */
  x.i[3] = count << 3;
  /* A byte count of 16 or more shifts everything out: force zero.  */
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}


static __inline qword si_rotqmby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  /* Negate the byte count from the preferred word slot and convert it
   * to bits; splat byte 3 (its low byte) so vec_sro sees it.  */
  x.i[0] = cnt = (0 - x.i[0]) << 3;

  x.v = vec_splat(x.v, 3);
  /* Bit 7 of the bit count set means >= 16 bytes: result is zero.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1519 
1520 
/* Rotate Left Quadword by Bytes
 *
 * A rotate is built as (a << n bytes) | (a >> (16 - n) bytes).
 */
static __inline qword si_rotqbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } left, right;

  /* vec_slo/vec_sro read their count (in bits) from byte 15, which is
   * the low byte of i[3].  Other union elements are unused.  */
  count <<= 3;
  left.i[3] = count;
  right.i[3] = 0 - count;
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
}

static __inline qword si_rotqby(qword a, qword count)
{
  vec_uchar16 left, right;

  /* Byte 3 of COUNT holds the byte count; scale to bits by << 3.  */
  left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  right = vec_sub(vec_splat_u8(0), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}

/* Rotate Left Quadword by Bytes Bit Count
 */
/* si_rotqbybi - rotate left by (count / 8) bytes; the count is already
 * expressed in bits, so no scaling is needed for vec_slo.  */
static __inline qword si_rotqbybi(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_splat((vec_uchar16)(count), 3);
  /* 7 - left gives the complementary byte count for the wrap-around.  */
  right = vec_sub(vec_splat_u8(7), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
1555 
1556 
/* Rotate Left Quadword by Bits (0-7)
 *
 * The quadword is rotated left by count & 7 bits: the body is shifted
 * left with vec_sll while Y recovers the top bits that wrap around to
 * the low end (A shifted down to byte 15, then right by 8 - count).
 */
static __inline qword si_rotqbii(qword a, int count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
  /* Move byte 0 of A into byte 15 (shift right 120 bits), then shift
   * right by (8 - count) bits to position the wrap-around bits.  */
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));
  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}

/* si_rotqbi - register form; the bit count comes from byte 3 of COUNT
 * and is masked to 0-7.  */
static __inline qword si_rotqbi(qword a, qword count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));

  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}
1583 
1584 
/* Rotate Left Quadword and Mask by Bits
 *
 * Shift the entire quadword right by (-count) bits (0-7).
 */
static __inline qword si_rotqmbii(qword a, int count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
}

/* Register form: the count is negated with vec_sub(0, count).  */
static __inline qword si_rotqmbi(qword a, qword count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
}
1596 
1597 
/* Rotate Left Quadword and Mask by Bytes with Bit Count
 *
 * Shift the quadword right by (-count / 8) bytes, zero when the byte
 * count reaches 16.
 */
static __inline qword si_rotqmbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  /* Round the bit count down to a whole number of bytes, negate, and
   * splat byte 3 (low byte of i[0]) for vec_sro.  */
  x.i[0] = cnt = 0 - (x.i[0] & ~7);
  x.v = vec_splat(x.v, 3);
  /* Bit 7 of the bit count set means >= 16 bytes: force zero.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1616 
1617 
1618 
1619 
/* Round Double to Float
 */
/* si_frds - converts the two doubles of A to floats, placing them in
 * word slots 0 and 2 (zeroing the others), the inverse layout of
 * si_fesd.  Done with scalar casts since VMX lacks the conversion.  */
static __inline qword si_frds(qword a)
{
  union {
    vec_float4 v;
    float f[4];
  } d;
  union {
    vec_double2 v;
    double d[2];
  } in;

  in.v = (vec_double2)(a);
  d.v = (vec_float4){0.0f};
  d.f[0] = (float)in.d[0];
  d.f[2] = (float)in.d[1];

  return ((qword)(d.v));
}
1640 
1641 /* Select Bits
1642  */
/* si_selb - bit-wise select: where a bit of C is 1 the result bit
 * comes from B, otherwise from A.  Maps directly onto vec_sel.  */
static __inline qword si_selb(qword a, qword b, qword c)
{
  vec_uchar16 selected;

  selected = vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c));
  return ((qword)(selected));
}
1647 
1648 
/* Shuffle Bytes
 */
/* si_shufb - SPU shuffle.  Control bytes 0x00-0x7F select a byte of
 * A:B like vec_perm, but bytes with the high bit set generate
 * constants: 0x80-0xBF -> 0x00, 0xC0-0xDF -> 0xFF, 0xE0-0xFF -> 0x80.
 * PAT rewrites those control bytes (pattern >> 3 indexes the constant
 * vector below) and a second vec_perm merges the constants in.  */
static __inline qword si_shufb(qword a, qword b, qword pattern)
{
  vec_uchar16 pat;

  pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
		vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
		vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
  return ((qword)(vec_perm(vec_perm(a, b, pattern),
			   ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
				          0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
			   pat)));
}
1663 
1664 
/* Shift Left
 *
 * SPU shifts produce zero once the count reaches the element size,
 * while VMX uses the count modulo the element size; the mask computed
 * from the "count >= width" bit clears such lanes (see the rotate-with-
 * mask functions above).
 */
/* si_shlh - per-halfword shift left, zero if count >= 16.  */
static __inline qword si_shlh(qword a, qword b)
{
  vec_ushort8 mask;

  mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
}

/* si_shl - per-word shift left, zero if count >= 32.  */
static __inline qword si_shl(qword a, qword b)
{
  vec_uint4 mask;

  mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
}


/* si_shlhi - immediate form of si_shlh.  */
static __inline qword si_shlhi(qword a, unsigned int b)
{
  vec_ushort8 mask;
  vec_ushort8 bv;

  bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
  mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
}

/* si_shli - immediate form of si_shl.  */
static __inline qword si_shli(qword a, unsigned int b)
{
  vec_uint4 bv;
  vec_uint4 mask;

  bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
  mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
}
1703 
1704 
/* Shift Left Quadword
 */
/* si_shlqbii - shift the whole quadword left by COUNT bits (0-7).  */
static __inline qword si_shlqbii(qword a, unsigned int count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}

/* si_shlqbi - register form; bit count taken from byte 3 of COUNT.  */
static __inline qword si_shlqbi(qword a, qword count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(count), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}


/* Shift Left Quadword by Bytes
 */
/* si_shlqbyi - shift the quadword left by COUNT bytes, zero when the
 * count reaches 16.  */
static __inline qword si_shlqbyi(qword a, unsigned int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  vec_uchar16 mask;

  /* vec_slo reads its bit count from byte 15 (low byte of i[3]).
   * NOTE(review): elements 0-2 of x stay uninitialized but are never
   * read by vec_slo.  */
  x.i[3] = count << 3;
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}

/* si_shlqby - register form; byte count from byte 3 of COUNT, scaled
 * to bits and splatted for vec_slo.  */
static __inline qword si_shlqby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  cnt = x.i[0];
  /* Bit 7 of the bit count set means >= 16 bytes: force zero.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}

/* Shift Left Quadword by Bytes with Bit Count
 */
/* si_shlqbybi - as si_shlqby, but COUNT is already in bits; vec_slo
 * ignores the low three bits so no rounding is needed.  */
static __inline qword si_shlqbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_splat((vec_uchar16)(count), 3);
  cnt = x.i[0];
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
1770 
1771 
/* Stop and Signal
 */
#define si_stop(_type)		SPU_STOP_ACTION
#define si_stopd(a, b, c)	SPU_STOP_ACTION


/* Subtract
 *
 * SPU "subtract from" computes B - A (note the operand order).
 */
/* si_sfh - per-halfword b - a.  */
static __inline qword si_sfh(qword a, qword b)
{
  return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
}

/* si_sf - per-word b - a.  */
static __inline qword si_sf(qword a, qword b)
{
  return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
}

/* si_fs - per-float a - b (normal operand order).  */
static __inline qword si_fs(qword a, qword b)
{
  return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
}

/* si_dfs - per-double a - b, via scalar math.  */
static __inline qword si_dfs(qword a, qword b)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  dd.d[0] = aa.d[0] - bb.d[0];
  dd.d[1] = aa.d[1] - bb.d[1];
  return ((qword)(dd.v));
}

/* si_sfhi - per-halfword (immediate b) - a.  */
static __inline qword si_sfhi(qword a, short b)
{
  return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
			  (vec_short8)(a))));
}

/* si_sfi - per-word (immediate b) - a.  */
static __inline qword si_sfi(qword a, int b)
{
  return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
			  (vec_int4)(a))));
}

/* Subtract word extended
 *
 * b + ~a + (c & 1): subtract with the borrow bit carried in c.
 */
#define si_sfx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_b), 				\
							 vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), 	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
1826 
1827 
/* Sum Bytes into Shorts
 */
/* si_sumb - sums each group of 4 bytes of A and of B into 16-bit
 * results, interleaved per word: halfwords alternate B-sum, A-sum.  */
static __inline qword si_sumb(qword a, qword b)
{
  vec_uint4 zero = (vec_uint4){0};
  vec_ushort8 sum_a, sum_b;

  /* vec_sum4s leaves one 4-byte sum per word element.  */
  sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
  sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);

  /* Merge the low halfword of each word sum: B's in the even slots,
   * A's in the odd slots.  */
  return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
					                26, 27, 10, 11, 30, 31, 14, 15}))));
}
1841 
/* Exclusive OR
 */
/* si_xor - bitwise XOR of two quadwords.  */
static __inline qword si_xor(qword a, qword b)
{
  return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* si_xorbi - XOR each byte with immediate B.  */
static __inline qword si_xorbi(qword a, unsigned char b)
{
  return ((qword)(vec_xor((vec_uchar16)(a),
			  vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* si_xorhi - XOR each halfword with immediate B.  */
static __inline qword si_xorhi(qword a, unsigned short b)
{
  return ((qword)(vec_xor((vec_ushort8)(a),
			  vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* si_xori - XOR each word with immediate B.  */
static __inline qword si_xori(qword a, unsigned int b)
{
  return ((qword)(vec_xor((vec_uint4)(a),
			  vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
1866 
1867 
/* Generate Controls for Sub-Quadword Insertion
 *
 * Each function builds a shuffle-control quadword for si_shufb that
 * inserts a byte/halfword/word/doubleword at the address computed from
 * its arguments.  The base pattern 0x10...0x1F selects operand B
 * unchanged; the element at the target offset is overwritten with
 * indices that select the scalar from operand A.
 */
/* si_cbd - control for byte insertion at (a + imm) & 0xF.  */
static __inline qword si_cbd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned char c[16];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}

/* si_cdd - control for doubleword insertion.  */
static __inline qword si_cdd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned long long ll[2];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}

/* si_chd - control for halfword insertion.  */
static __inline qword si_chd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned short s[8];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}

/* si_cwd - control for word insertion.  */
static __inline qword si_cwd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned int i[4];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}

/* si_cbx - as si_cbd but the offset is a + b (register form).  */
static __inline qword si_cbx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned char c[16];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}


/* si_cdx - register form of si_cdd.  */
static __inline qword si_cdx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned long long ll[2];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}

/* si_chx - register form of si_chd.  */
static __inline qword si_chx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned short s[8];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}

/* si_cwx - register form of si_cwd.  */
static __inline qword si_cwx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned int i[4];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}
1966 
1967 
/* Constant Formation
 */
/* si_il - immediate load: sign-extended 16-bit value in every word.  */
static __inline qword si_il(signed short imm)
{
  return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
}


/* si_ila - immediate load address: value splatted into every word.  */
static __inline qword si_ila(unsigned int imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
}

/* si_ilh - immediate load halfword: value in every halfword.  */
static __inline qword si_ilh(signed short imm)
{
  return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
}

/* si_ilhu - immediate load halfword upper: imm << 16 in every word.  */
static __inline qword si_ilhu(signed short imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
}

/* si_iohl - immediate OR halfword lower: OR imm into each word.  */
static __inline qword si_iohl(qword a, unsigned short imm)
{
  return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
}
1995 
/* No Operation
 */
#define si_lnop()		/* do nothing */
#define si_nop()		/* do nothing */


/* Memory Load and Store
 *
 * SPU local-store addresses become ordinary PPU pointers.  vec_ld and
 * vec_st ignore the low 4 address bits, so all accesses are naturally
 * 16-byte aligned as on the SPU.
 */
/* si_lqa - load quadword from absolute address IMM.  */
static __inline qword si_lqa(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

/* si_lqd - load quadword at base A (forced to a 16-byte boundary)
 * plus displacement IMM used as the pointer.  */
static __inline qword si_lqd(qword a, unsigned int imm)
{
  return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
}

/* si_lqr - load quadword "relative"; same mapping as si_lqa here.  */
static __inline qword si_lqr(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

/* si_lqx - load quadword at address a + b.  */
static __inline qword si_lqx(qword a, qword b)
{
  return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
}

/* si_stqa - store quadword to absolute address IMM.  */
static __inline void si_stqa(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

/* si_stqd - store quadword A at base B (16-byte aligned) plus IMM.  */
static __inline void si_stqd(qword a, qword b, unsigned int imm)
{
  vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
}

/* si_stqr - store quadword "relative"; same mapping as si_stqa.  */
static __inline void si_stqr(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

/* si_stqx - store quadword A at address b + c.  */
static __inline void si_stqx(qword a, qword b, qword c)
{
  vec_st((vec_uchar16)(a),
	 si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
	 (vector unsigned char *)(0));
}
2045 
2046 #endif /* !__SPU__ */
2047 #endif /* !_SI2VMX_H_ */
2048 
2049