1 /* Copyright (C) 2006-2014 Free Software Foundation, Inc.
2 
3    This file is free software; you can redistribute it and/or modify it under
4    the terms of the GNU General Public License as published by the Free
5    Software Foundation; either version 3 of the License, or (at your option)
6    any later version.
7 
8    This file is distributed in the hope that it will be useful, but WITHOUT
9    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11    for more details.
12 
13    Under Section 7 of GPL version 3, you are granted additional
14    permissions described in the GCC Runtime Library Exception, version
15    3.1, as published by the Free Software Foundation.
16 
17    You should have received a copy of the GNU General Public License and
18    a copy of the GCC Runtime Library Exception along with this program;
19    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
20    <http://www.gnu.org/licenses/>.  */
21 
22 #ifndef _VMX2SPU_H_
23 #define _VMX2SPU_H_	1
24 
25 #ifdef __cplusplus
26 
27 #ifdef __SPU__
28 
29 #include <spu_intrinsics.h>
30 #include <vec_types.h>
31 
32 /* This file maps generic VMX intrinsics and predicates to the SPU using
33  * overloaded C++ functions.
34  */
35 
36 /************************************************************************
37  *                        INTRINSICS
38  ************************************************************************/
39 
40 /* vec_abs (vector absolute value)
41  * =======
42  */
vec_abs(vec_char16 a)43 static inline vec_char16 vec_abs(vec_char16 a)
44 {
45   vec_char16 minus_a;
46 
47   minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
48   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
49 }
50 
vec_abs(vec_short8 a)51 static inline vec_short8 vec_abs(vec_short8 a)
52 {
53   return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
54 }
55 
vec_abs(vec_int4 a)56 static inline vec_int4 vec_abs(vec_int4 a)
57 {
58   return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
59 }
60 
vec_abs(vec_float4 a)61 static inline vec_float4 vec_abs(vec_float4 a)
62 {
63   return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
64 }
65 
66 /* vec_abss (vector absolute value saturate)
67  * ========
68  */
vec_abss(vec_char16 a)69 static inline vec_char16 vec_abss(vec_char16 a)
70 {
71   vec_char16 minus_a;
72 
73   minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
74 				(vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
75   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
76 }
77 
vec_abss(vec_short8 a)78 static inline vec_short8 vec_abss(vec_short8 a)
79 {
80   vec_short8 minus_a;
81 
82   minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
83   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
84 }
85 
vec_abss(vec_int4 a)86 static inline vec_int4 vec_abss(vec_int4 a)
87 {
88   vec_int4 minus_a;
89 
90   minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
91   return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
92 }
93 
94 
95 /* vec_add (vector add)
96  * =======
97  */
vec_add(vec_uchar16 a,vec_uchar16 b)98 static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
99 {
100   return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
101 				spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
102 				spu_splats((unsigned short)(0xFF00)))));
103 }
104 
vec_add(vec_char16 a,vec_char16 b)105 static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
106 {
107   return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
108 }
109 
vec_add(vec_bchar16 a,vec_char16 b)110 static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
111 {
112   return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
113 }
114 
vec_add(vec_char16 a,vec_bchar16 b)115 static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
116 {
117   return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
118 }
119 
vec_add(vec_ushort8 a,vec_ushort8 b)120 static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
121 {
122   return (spu_add(a, b));
123 }
124 
vec_add(vec_short8 a,vec_short8 b)125 static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
126 {
127   return (spu_add(a, b));
128 }
129 
vec_add(vec_bshort8 a,vec_short8 b)130 static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
131 {
132   return (spu_add((vec_short8)(a), b));
133 }
134 
vec_add(vec_short8 a,vec_bshort8 b)135 static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
136 {
137   return (spu_add(a, (vec_short8)(b)));
138 }
139 
vec_add(vec_uint4 a,vec_uint4 b)140 static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
141 {
142   return (spu_add(a, b));
143 }
144 
vec_add(vec_int4 a,vec_int4 b)145 static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
146 {
147   return (spu_add(a, b));
148 }
149 
vec_add(vec_bint4 a,vec_int4 b)150 static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
151 {
152   return (spu_add((vec_int4)(a), b));
153 }
154 
vec_add(vec_int4 a,vec_bint4 b)155 static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
156 {
157   return (spu_add(a, (vec_int4)(b)));
158 }
159 
vec_add(vec_float4 a,vec_float4 b)160 static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
161 {
162   return (spu_add(a, b));
163 }
164 
165 /* vec_addc (vector add carryout unsigned word)
166  * ========
167  */
168 #define vec_addc(_a, _b)	spu_genc(_a, _b)
169 
170 /* vec_adds (vector add saturated)
171  * ========
172  */
vec_adds(vec_uchar16 a,vec_uchar16 b)173 static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
174 {
175   vec_uchar16 s1, s2, s, d;
176 
177   s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
178   s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
179   s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
180 				          8, 24, 10, 26, 12, 28, 14, 30}));
181   d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
182 				          9, 25, 11, 27, 13, 29, 15, 31}));
183   return (spu_or(d, spu_cmpeq(s, 1)));
184 }
185 
vec_adds(vec_char16 a,vec_char16 b)186 static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
187 {
188   vec_uchar16 s1, s2, s, d;
189 
190   s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
191   s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
192   s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
193 				          9, 25, 11, 27, 13, 29, 15, 31}));
194   d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
195   d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
196   return ((vec_char16)(d));
197 }
198 
vec_adds(vec_bchar16 a,vec_char16 b)199 static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
200 {
201   return (vec_adds((vec_char16)(a), b));
202 }
203 
vec_adds(vec_char16 a,vec_bchar16 b)204 static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
205 {
206   return (vec_adds(a, (vec_char16)(b)));
207 }
208 
vec_adds(vec_ushort8 a,vec_ushort8 b)209 static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
210 {
211   vec_ushort8 s, d;
212 
213   s = spu_add(a, b);
214   d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
215   return (d);
216 }
217 
vec_adds(vec_short8 a,vec_short8 b)218 static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
219 {
220   vec_short8 s, d;
221 
222   s = spu_add(a, b);
223   d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
224   d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
225   return (d);
226 }
227 
vec_adds(vec_bshort8 a,vec_short8 b)228 static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
229 {
230   return (vec_adds((vec_short8)(a), b));
231 }
232 
vec_adds(vec_short8 a,vec_bshort8 b)233 static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
234 {
235   return (vec_adds(a, (vec_short8)(b)));
236 }
237 
vec_adds(vec_uint4 a,vec_uint4 b)238 static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
239 {
240   return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
241 }
242 
vec_adds(vec_int4 a,vec_int4 b)243 static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
244 {
245   vec_int4 s, d;
246 
247   s = spu_add(a, b);
248   d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
249   d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
250   return (d);
251 }
252 
vec_adds(vec_bint4 a,vec_int4 b)253 static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
254 {
255   return (vec_adds((vec_int4)(a), b));
256 }
257 
vec_adds(vec_int4 a,vec_bint4 b)258 static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
259 {
260   return (vec_adds(a, (vec_int4)(b)));
261 }
262 
263 /* vec_and (vector logical and)
264  * =======
265  */
vec_and(vec_uchar16 a,vec_uchar16 b)266 static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
267 {
268   return (spu_and(a, b));
269 }
270 
vec_and(vec_char16 a,vec_char16 b)271 static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
272 {
273   return (spu_and(a, b));
274 }
275 
vec_and(vec_bchar16 a,vec_char16 b)276 static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
277 {
278   return (spu_and((vec_char16)(a), b));
279 }
280 
vec_and(vec_char16 a,vec_bchar16 b)281 static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
282 {
283   return (spu_and(a, (vec_char16)(b)));
284 }
285 
vec_and(vec_ushort8 a,vec_ushort8 b)286 static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
287 {
288   return (spu_and(a, b));
289 }
290 
vec_and(vec_short8 a,vec_short8 b)291 static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
292 {
293   return (spu_and(a, b));
294 }
295 
vec_and(vec_bshort8 a,vec_short8 b)296 static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
297 {
298   return (spu_and((vec_short8)(a), b));
299 }
300 
vec_and(vec_short8 a,vec_bshort8 b)301 static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
302 {
303   return (spu_and(a, (vec_short8)(b)));
304 }
305 
vec_and(vec_uint4 a,vec_uint4 b)306 static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
307 {
308   return (spu_and(a, b));
309 }
310 
vec_and(vec_int4 a,vec_int4 b)311 static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
312 {
313   return (spu_and(a, b));
314 }
315 
vec_and(vec_bint4 a,vec_int4 b)316 static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
317 {
318   return (spu_and((vec_int4)(a), b));
319 }
320 
vec_and(vec_int4 a,vec_bint4 b)321 static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
322 {
323   return (spu_and(a, (vec_int4)(b)));
324 }
325 
vec_and(vec_float4 a,vec_float4 b)326 static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
327 {
328   return (spu_and(a, b));
329 }
330 
vec_and(vec_bint4 a,vec_float4 b)331 static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
332 {
333   return (spu_and((vec_float4)(a),b));
334 }
335 
vec_and(vec_float4 a,vec_bint4 b)336 static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
337 {
338   return (spu_and(a, (vec_float4)(b)));
339 }
340 
341 
342 /* vec_andc (vector logical and with complement)
343  * ========
344  */
vec_andc(vec_uchar16 a,vec_uchar16 b)345 static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
346 {
347   return (spu_andc(a, b));
348 }
349 
vec_andc(vec_char16 a,vec_char16 b)350 static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
351 {
352   return (spu_andc(a, b));
353 }
354 
vec_andc(vec_bchar16 a,vec_char16 b)355 static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
356 {
357   return (spu_andc((vec_char16)(a), b));
358 }
359 
vec_andc(vec_char16 a,vec_bchar16 b)360 static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
361 {
362   return (spu_andc(a, (vec_char16)(b)));
363 }
364 
vec_andc(vec_ushort8 a,vec_ushort8 b)365 static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
366 {
367   return (spu_andc(a, b));
368 }
369 
vec_andc(vec_short8 a,vec_short8 b)370 static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
371 {
372   return (spu_andc(a, b));
373 }
374 
vec_andc(vec_bshort8 a,vec_short8 b)375 static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
376 {
377   return (spu_andc((vec_short8)(a), b));
378 }
379 
vec_andc(vec_short8 a,vec_bshort8 b)380 static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
381 {
382   return (spu_andc(a, (vec_short8)(b)));
383 }
384 
vec_andc(vec_uint4 a,vec_uint4 b)385 static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
386 {
387   return (spu_andc(a, b));
388 }
389 
vec_andc(vec_int4 a,vec_int4 b)390 static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
391 {
392   return (spu_andc(a, b));
393 }
394 
vec_andc(vec_bint4 a,vec_int4 b)395 static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
396 {
397   return (spu_andc((vec_int4)(a), b));
398 }
399 
vec_andc(vec_int4 a,vec_bint4 b)400 static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
401 {
402   return (spu_andc(a, (vec_int4)(b)));
403 }
404 
vec_andc(vec_float4 a,vec_float4 b)405 static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
406 {
407   return (spu_andc(a,b));
408 }
409 
vec_andc(vec_bint4 a,vec_float4 b)410 static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
411 {
412   return (spu_andc((vec_float4)(a),b));
413 }
414 
vec_andc(vec_float4 a,vec_bint4 b)415 static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
416 {
417   return (spu_andc(a, (vec_float4)(b)));
418 }
419 
420 /* vec_avg (vector average)
421  * =======
422  */
vec_avg(vec_uchar16 a,vec_uchar16 b)423 static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
424 {
425   return (spu_avg(a, b));
426 }
427 
vec_avg(vec_char16 a,vec_char16 b)428 static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
429 {
430   return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
431 			       (vec_uchar16)(spu_and(spu_xor(a,b), 0x80)))));
432 }
433 
vec_avg(vec_ushort8 a,vec_ushort8 b)434 static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
435 {
436   return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
437 		  spu_and(spu_or(a, b), 1)));
438 }
439 
vec_avg(vec_short8 a,vec_short8 b)440 static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
441 {
442   return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
443 		  spu_and(spu_or(a, b), 1)));
444 }
445 
vec_avg(vec_uint4 a,vec_uint4 b)446 static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
447 {
448   return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
449 		  spu_and(spu_or(a, b), 1)));
450 }
451 
vec_avg(vec_int4 a,vec_int4 b)452 static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
453 {
454   return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
455 		  spu_and(spu_or(a, b), 1)));
456 }
457 
458 
459 /* vec_ceil (vector ceiling)
460  * ========
461  */
vec_ceil(vec_float4 a)462 static inline vec_float4 vec_ceil(vec_float4 a)
463 {
464   vec_int4  exp;
465   vec_uint4 mask;
466 
467   a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
468   exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
469   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
470   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
471   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
472 
473   return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
474 }
475 
476 
477 /* vec_cmpb (vector compare bounds floating-point)
478  * ========
479  */
vec_cmpb(vec_float4 a,vec_float4 b)480 static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
481 {
482   vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
483   vec_int4 b1 = (vec_int4)spu_splats(0x40000000);
484 
485   return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
486 		 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
487 }
488 
489 /* vec_cmpeq (vector compare equal)
490  * =========
491  */
492 #define vec_cmpeq(_a, _b)	spu_cmpeq(_a, _b)
493 
494 
495 /* vec_cmpge (vector compare greater than or equal)
496  * =========
497  */
vec_cmpge(vec_float4 a,vec_float4 b)498 static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
499 {
500   return (spu_xor(spu_cmpgt(b, a), -1));
501 }
502 
503 
504 /* vec_cmpgt (vector compare greater than)
505  * =========
506  */
507 #define vec_cmpgt(_a, _b)	spu_cmpgt(_a, _b)
508 
509 
510 /* vec_cmple (vector compare less than or equal)
511  * =========
512  */
vec_cmple(vec_float4 a,vec_float4 b)513 static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
514 {
515   return (spu_xor(spu_cmpgt(a, b), -1));
516 }
517 
518 
519 /* vec_cmplt (vector compare less than)
520  * =========
521  */
522 #define vec_cmplt(_a, _b)	spu_cmpgt(_b, _a)
523 
524 
525 /* vec_ctf (vector convert from fixed-point word)
526  * =======
527  */
528 #define vec_ctf(_a, _b)		spu_convtf(_a, _b)
529 
530 
531 /* vec_cts (vector convert to signed fixed-point word saturate)
532  * =======
533  */
534 #define vec_cts(_a, _b)		spu_convts(_a, _b)
535 
536 
537 /* vec_ctu (vector convert to unsigned fixed-point word saturate)
538  * =======
539  */
540 #define vec_ctu(_a, _b)		spu_convtu(_a, _b)
541 
542 
543 /* vec_dss (vector data stream stop)
544  * =======
545  */
546 #define vec_dss(_a)
547 
548 
549 /* vec_dssall (vector data stream stop all)
550  * ==========
551  */
552 #define vec_dssall()
553 
554 
555 /* vec_dst (vector data stream touch)
556  * =======
557  */
558 #define vec_dst(_a, _b, _c)
559 
560 
561 /* vec_dstst (vector data stream touch for store)
562  * =========
563  */
564 #define vec_dstst(_a, _b, _c)
565 
566 
567 /* vec_dststt (vector data stream touch for store transient)
568  * ==========
569  */
570 #define vec_dststt(_a, _b, _c)
571 
572 
573 /* vec_dstt (vector data stream touch transient)
574  * ========
575  */
576 #define vec_dstt(_a, _b, _c)
577 
578 
579 /* vec_expte (vector is 2 raised tp the exponent estimate floating-point)
580  * =========
581  */
vec_expte(vec_float4 a)582 static inline vec_float4 vec_expte(vec_float4 a)
583 {
584   vec_float4 bias, frac, exp;
585   vec_int4 ia;
586 
587   bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
588   ia   = spu_convts(spu_add(a, bias), 0);
589   frac = spu_sub(spu_convtf(ia, 0), a);
590   exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));
591 
592   return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
593 			   frac, spu_splats(1.0f)), exp));
594 }
595 
596 
597 /* vec_floor (vector floor)
598  * =========
599  */
vec_floor(vec_float4 a)600 static inline vec_float4 vec_floor(vec_float4 a)
601 {
602   vec_int4  exp;
603   vec_uint4 mask;
604 
605   a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
606   exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
607   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
608   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
609   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
610 
611   return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
612 }
613 
614 
615 /* vec_ld (vector load indexed)
616  * ======
617  */
vec_ld(int a,unsigned char * b)618 static inline vec_uchar16 vec_ld(int a, unsigned char *b)
619 {
620   return (*((vec_uchar16 *)(b+a)));
621 }
622 
vec_ld(int a,vec_uchar16 * b)623 static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
624 {
625   return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
626 }
627 
vec_ld(int a,signed char * b)628 static inline vec_char16 vec_ld(int a, signed char *b)
629 {
630   return (*((vec_char16 *)(b+a)));
631 }
632 
vec_ld(int a,vec_char16 * b)633 static inline vec_char16 vec_ld(int a, vec_char16 *b)
634 {
635   return (*((vec_char16 *)((signed char *)(b)+a)));
636 }
637 
vec_ld(int a,unsigned short * b)638 static inline vec_ushort8 vec_ld(int a, unsigned short *b)
639 {
640   return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
641 }
642 
vec_ld(int a,vec_ushort8 * b)643 static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
644 {
645   return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
646 }
647 
vec_ld(int a,signed short * b)648 static inline vec_short8 vec_ld(int a, signed short *b)
649 {
650   return (*((vec_short8 *)((unsigned char *)(b)+a)));
651 }
652 
vec_ld(int a,vec_short8 * b)653 static inline vec_short8 vec_ld(int a, vec_short8 *b)
654 {
655   return (*((vec_short8 *)((signed char *)(b)+a)));
656 }
657 
vec_ld(int a,unsigned int * b)658 static inline vec_uint4 vec_ld(int a, unsigned int *b)
659 {
660   return (*((vec_uint4 *)((unsigned char *)(b)+a)));
661 }
662 
vec_ld(int a,vec_uint4 * b)663 static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
664 {
665   return (*((vec_uint4 *)((unsigned char *)(b)+a)));
666 }
667 
vec_ld(int a,signed int * b)668 static inline vec_int4 vec_ld(int a, signed int *b)
669 {
670   return (*((vec_int4 *)((unsigned char *)(b)+a)));
671 }
672 
vec_ld(int a,vec_int4 * b)673 static inline vec_int4 vec_ld(int a, vec_int4 *b)
674 {
675   return (*((vec_int4 *)((signed char *)(b)+a)));
676 }
677 
vec_ld(int a,float * b)678 static inline vec_float4 vec_ld(int a, float *b)
679 {
680   return (*((vec_float4 *)((unsigned char *)(b)+a)));
681 }
682 
vec_ld(int a,vec_float4 * b)683 static inline vec_float4 vec_ld(int a, vec_float4 *b)
684 {
685   return (*((vec_float4 *)((unsigned char *)(b)+a)));
686 }
687 
688 /* vec_lde (vector load element indexed)
689  * =======
690  */
vec_lde(int a,unsigned char * b)691 static inline vec_uchar16 vec_lde(int a, unsigned char *b)
692 {
693   return (*((vec_uchar16 *)(b+a)));
694 }
695 
vec_lde(int a,signed char * b)696 static inline vec_char16 vec_lde(int a, signed char *b)
697 {
698   return (*((vec_char16 *)(b+a)));
699 }
700 
vec_lde(int a,unsigned short * b)701 static inline vec_ushort8 vec_lde(int a, unsigned short *b)
702 {
703   return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
704 }
705 
vec_lde(int a,signed short * b)706 static inline vec_short8 vec_lde(int a, signed short *b)
707 {
708   return (*((vec_short8 *)((unsigned char *)(b)+a)));
709 }
710 
711 
vec_lde(int a,unsigned int * b)712 static inline vec_uint4 vec_lde(int a, unsigned int *b)
713 {
714   return (*((vec_uint4 *)((unsigned char *)(b)+a)));
715 }
716 
vec_lde(int a,signed int * b)717 static inline vec_int4 vec_lde(int a, signed int *b)
718 {
719   return (*((vec_int4 *)((unsigned char *)(b)+a)));
720 }
721 
722 
vec_lde(int a,float * b)723 static inline vec_float4 vec_lde(int a, float *b)
724 {
725   return (*((vec_float4 *)((unsigned char *)(b)+a)));
726 }
727 
728 /* vec_ldl (vector load indexed LRU)
729  * =======
730  */
731 #define vec_ldl(_a, _b)		vec_ld(_a, _b)
732 
733 
734 /* vec_loge (vector log2 estimate floating-point)
735  * ========
736  */
vec_loge(vec_float4 a)737 static inline vec_float4 vec_loge(vec_float4 a)
738 {
739   vec_int4 exp;
740   vec_float4 frac;
741 
742   exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
743   frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));
744 
745   return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
746 		   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
747 }
748 
749 
750 /* vec_lvsl (vector load for shift left)
751  * ========
752  */
vec_lvsl(int a,unsigned char * b)753 static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
754 {
755   return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
756 			       ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
757 				              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
758 }
759 
vec_lvsl(int a,signed char * b)760 static inline vec_uchar16 vec_lvsl(int a, signed char *b)
761 {
762   return (vec_lvsl(a, (unsigned char *)b));
763 }
764 
vec_lvsl(int a,unsigned short * b)765 static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
766 {
767   return (vec_lvsl(a, (unsigned char *)b));
768 }
769 
vec_lvsl(int a,short * b)770 static inline vec_uchar16 vec_lvsl(int a, short *b)
771 {
772   return (vec_lvsl(a, (unsigned char *)b));
773 }
774 
vec_lvsl(int a,unsigned int * b)775 static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
776 {
777   return (vec_lvsl(a, (unsigned char *)b));
778 }
779 
vec_lvsl(int a,int * b)780 static inline vec_uchar16 vec_lvsl(int a, int *b)
781 {
782   return (vec_lvsl(a, (unsigned char *)b));
783 }
784 
vec_lvsl(int a,float * b)785 static inline vec_uchar16 vec_lvsl(int a, float *b)
786 {
787   return (vec_lvsl(a, (unsigned char *)b));
788 }
789 
790 
791 /* vec_lvsr (vector load for shift right)
792  * ========
793  */
vec_lvsr(int a,unsigned char * b)794 static  inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
795 {
796   return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
797 				               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
798 				(vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
799 }
800 
vec_lvsr(int a,signed char * b)801 static inline vec_uchar16 vec_lvsr(int a, signed char *b)
802 {
803   return (vec_lvsr(a, (unsigned char *)b));
804 }
805 
vec_lvsr(int a,unsigned short * b)806 static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
807 {
808   return (vec_lvsr(a, (unsigned char *)b));
809 }
810 
vec_lvsr(int a,short * b)811 static inline vec_uchar16 vec_lvsr(int a, short *b)
812 {
813   return (vec_lvsr(a, (unsigned char *)b));
814 }
815 
vec_lvsr(int a,unsigned int * b)816 static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
817 {
818   return (vec_lvsr(a, (unsigned char *)b));
819 }
820 
vec_lvsr(int a,int * b)821 static inline vec_uchar16 vec_lvsr(int a, int *b)
822 {
823   return (vec_lvsr(a, (unsigned char *)b));
824 }
825 
vec_lvsr(int a,float * b)826 static inline vec_uchar16 vec_lvsr(int a, float *b)
827 {
828   return (vec_lvsr(a, (unsigned char *)b));
829 }
830 
831 /* vec_madd (vector multiply add)
832  * ========
833  */
834 #define vec_madd(_a, _b, _c)	spu_madd(_a, _b, _c)
835 
836 
837 
838 /* vec_madds (vector multiply add saturate)
839  * =========
840  */
vec_madds(vec_short8 a,vec_short8 b,vec_short8 c)841 static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
842 {
843   return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
844 			      (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
845 			      ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
846 }
847 
848 /* vec_max (vector maximum)
849  * =======
850  */
vec_max(vec_uchar16 a,vec_uchar16 b)851 static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
852 {
853   return (spu_sel(b, a, spu_cmpgt(a, b)));
854 }
855 
vec_max(vec_char16 a,vec_char16 b)856 static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
857 {
858   return (spu_sel(b, a, spu_cmpgt(a, b)));
859 }
860 
vec_max(vec_bchar16 a,vec_char16 b)861 static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
862 {
863   return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
864 }
865 
vec_max(vec_char16 a,vec_bchar16 b)866 static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
867 {
868   return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
869 }
870 
vec_max(vec_ushort8 a,vec_ushort8 b)871 static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
872 {
873   return (spu_sel(b, a, spu_cmpgt(a, b)));
874 }
875 
vec_max(vec_short8 a,vec_short8 b)876 static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
877 {
878   return (spu_sel(b, a, spu_cmpgt(a, b)));
879 }
880 
vec_max(vec_bshort8 a,vec_short8 b)881 static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
882 {
883   return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
884 }
885 
vec_max(vec_short8 a,vec_bshort8 b)886 static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
887 {
888   return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
889 }
890 
vec_max(vec_uint4 a,vec_uint4 b)891 static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
892 {
893   return (spu_sel(b, a, spu_cmpgt(a, b)));
894 }
895 
vec_max(vec_int4 a,vec_int4 b)896 static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
897 {
898   return (spu_sel(b, a, spu_cmpgt(a, b)));
899 }
900 
vec_max(vec_bint4 a,vec_int4 b)901 static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
902 {
903   return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
904 }
905 
vec_max(vec_int4 a,vec_bint4 b)906 static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
907 {
908   return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
909 }
910 
vec_max(vec_float4 a,vec_float4 b)911 static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
912 {
913   return (spu_sel(b, a, spu_cmpgt(a, b)));
914 }
915 
916 
917 /* vec_mergeh (vector merge high)
918  * ==========
919  */
vec_mergeh(vec_uchar16 a,vec_uchar16 b)920 static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
921 {
922   return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
923 				           4, 20, 5, 21, 6, 22, 7, 23})));
924 }
925 
vec_mergeh(vec_char16 a,vec_char16 b)926 static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
927 {
928   return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
929 				           4, 20, 5, 21, 6, 22, 7, 23})));
930 }
931 
vec_mergeh(vec_ushort8 a,vec_ushort8 b)932 static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
933 {
934   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
935 				           4, 5, 20, 21, 6, 7, 22, 23})));
936 }
937 
vec_mergeh(vec_short8 a,vec_short8 b)938 static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
939 {
940   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
941 				           4, 5, 20, 21, 6, 7, 22, 23})));
942 }
943 
vec_mergeh(vec_uint4 a,vec_uint4 b)944 static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
945 {
946   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
947 				           4, 5, 6, 7, 20, 21, 22, 23})));
948 }
949 
vec_mergeh(vec_int4 a,vec_int4 b)950 static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
951 {
952   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
953 				           4, 5, 6, 7, 20, 21, 22, 23})));
954 }
955 
vec_mergeh(vec_float4 a,vec_float4 b)956 static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
957 {
958   return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
959 				           4, 5, 6, 7, 20, 21, 22, 23})));
960 }
961 
962 /* vec_mergel (vector merge low)
963  * ==========
964  */
vec_mergel(vec_uchar16 a,vec_uchar16 b)965 static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
966 {
967   return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
968 				           12, 28, 13, 29, 14, 30, 15, 31})));
969 }
970 
vec_mergel(vec_char16 a,vec_char16 b)971 static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
972 {
973   return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
974 				           12, 28, 13, 29, 14, 30, 15, 31})));
975 }
976 
vec_mergel(vec_ushort8 a,vec_ushort8 b)977 static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
978 {
979   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
980 				           12, 13, 28, 29, 14, 15, 30, 31})));
981 }
982 
vec_mergel(vec_short8 a,vec_short8 b)983 static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
984 {
985   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
986 				           12, 13, 28, 29, 14, 15, 30, 31})));
987 }
988 
vec_mergel(vec_uint4 a,vec_uint4 b)989 static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
990 {
991   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
992 				           12, 13, 14, 15, 28, 29, 30, 31})));
993 }
994 
vec_mergel(vec_int4 a,vec_int4 b)995 static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
996 {
997   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
998 				           12, 13, 14, 15, 28, 29, 30, 31})));
999 }
1000 
vec_mergel(vec_float4 a,vec_float4 b)1001 static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
1002 {
1003   return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
1004 				           12, 13, 14, 15, 28, 29, 30, 31})));
1005 }
1006 
1007 /* vec_mfvscr (vector move from vector status and control register)
1008  * ==========
1009  */
vec_mfvscr()1010 static inline vec_ushort8 vec_mfvscr()
1011 {
1012   return ((vec_ushort8)spu_splats(0)); 		/* not supported */
1013 }
1014 
1015 
1016 /* vec_min (vector minimum)
1017  * =======
1018  */
vec_min(vec_uchar16 a,vec_uchar16 b)1019 static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
1020 {
1021   return (spu_sel(a, b, spu_cmpgt(a, b)));
1022 }
1023 
vec_min(vec_char16 a,vec_char16 b)1024 static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
1025 {
1026   return (spu_sel(a, b, spu_cmpgt(a, b)));
1027 }
1028 
vec_min(vec_bchar16 a,vec_char16 b)1029 static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
1030 {
1031   return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
1032 }
1033 
vec_min(vec_char16 a,vec_bchar16 b)1034 static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
1035 {
1036   return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
1037 }
1038 
vec_min(vec_ushort8 a,vec_ushort8 b)1039 static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
1040 {
1041   return (spu_sel(a, b, spu_cmpgt(a, b)));
1042 }
1043 
vec_min(vec_short8 a,vec_short8 b)1044 static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
1045 {
1046   return (spu_sel(a, b, spu_cmpgt(a, b)));
1047 }
1048 
vec_min(vec_bshort8 a,vec_short8 b)1049 static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
1050 {
1051   return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
1052 }
1053 
vec_min(vec_short8 a,vec_bshort8 b)1054 static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
1055 {
1056   return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
1057 }
1058 
vec_min(vec_uint4 a,vec_uint4 b)1059 static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
1060 {
1061   return (spu_sel(a, b, spu_cmpgt(a, b)));
1062 }
1063 
vec_min(vec_int4 a,vec_int4 b)1064 static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
1065 {
1066   return (spu_sel(a, b, spu_cmpgt(a, b)));
1067 }
1068 
vec_min(vec_bint4 a,vec_int4 b)1069 static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
1070 {
1071   return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
1072 }
1073 
vec_min(vec_int4 a,vec_bint4 b)1074 static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
1075 {
1076   return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
1077 }
1078 
vec_min(vec_float4 a,vec_float4 b)1079 static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
1080 {
1081   return (spu_sel(a, b, spu_cmpgt(a, b)));
1082 }
1083 
1084 /* vec_mladd (vector multiply low and add unsigned half word)
1085  * =========
1086  */
vec_mladd(vec_short8 a,vec_short8 b,vec_short8 c)1087 static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
1088 {
1089   return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
1090 					    (vec_short8)(spu_rl((vec_uint4)(b), -16)),
1091 					    (vec_int4)(spu_rl((vec_uint4)(c), -16))),
1092 				   spu_madd(a, b, spu_extend(c)),
1093 				   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1094 					          10, 11, 26, 27, 14, 15, 30, 31}))));
1095 }
1096 
1097 
vec_mladd(vec_ushort8 a,vec_ushort8 b,vec_ushort8 c)1098 static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
1099 {
1100   return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
1101 }
1102 
vec_mladd(vec_ushort8 a,vec_short8 b,vec_short8 c)1103 static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
1104 {
1105   return (vec_mladd((vec_short8)(a), b, c));
1106 }
1107 
vec_mladd(vec_short8 a,vec_ushort8 b,vec_ushort8 c)1108 static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
1109 {
1110   return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
1111 }
1112 
1113 
1114 /* vec_mradds (vector multiply round and add saturate)
1115  * ==========
1116  */
vec_mradds(vec_short8 a,vec_short8 b,vec_short8 c)1117 static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
1118 {
1119   vec_int4 round = (vec_int4)spu_splats(0x4000);
1120   vec_short8 hi, lo;
1121 
1122   hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
1123   lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));
1124 
1125   return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
1126 }
1127 
1128 
1129 /* vec_msum (vector multiply sum)
1130  * ========
1131  */
vec_msum(vec_uchar16 a,vec_uchar16 b,vec_uint4 c)1132 static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
1133 {
1134   vec_ushort8 a1, a2, b1, b2;
1135   vec_uint4 p1, p2;
1136 
1137   a1 = spu_and((vec_ushort8)(a), 0xFF);
1138   a2 = spu_rlmask((vec_ushort8)(a), -8);
1139   b1 = spu_and((vec_ushort8)(b), 0xFF);
1140   b2 = spu_rlmask((vec_ushort8)(b), -8);
1141 
1142   p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1143   p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1144   return (spu_add(p2, spu_add(p1, c)));
1145 }
1146 
vec_msum(vec_char16 a,vec_uchar16 b,vec_int4 c)1147 static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
1148 {
1149   vec_short8 a1, a2, b1, b2;
1150   vec_int4 p1, p2;
1151 
1152   a1 = (vec_short8)(spu_extend(a));
1153   a2 = spu_rlmaska((vec_short8)(a), -8);
1154   b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
1155   b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);
1156 
1157   p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1158   p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1159   return (spu_add(p2, spu_add(p1, c)));
1160 }
1161 
vec_msum(vec_ushort8 a,vec_ushort8 b,vec_uint4 c)1162 static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1163 {
1164   return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1165 }
1166 
vec_msum(vec_short8 a,vec_short8 b,vec_int4 c)1167 static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
1168 {
1169   return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1170 }
1171 
1172 
1173 /* vec_msums (vector multiply sum saturate)
1174  * ========
1175  */
vec_msums(vec_ushort8 a,vec_ushort8 b,vec_uint4 c)1176 static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1177 {
1178   vec_uint4 p1, p2;
1179 
1180   p1 = spu_mulo(a, b);
1181   p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));
1182 
1183   return (vec_adds(p2, vec_adds(p1, c)));
1184 }
1185 
vec_msums(vec_short8 a,vec_short8 b,vec_int4 c)1186 static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
1187 {
1188   return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1189 }
1190 
1191 /* vec_mtvscr (vector move to vector status and control register)
1192  * ==========
1193  */
1194 #define vec_mtvscr(_a)		/* not supported */
1195 
1196 
1197 /* vec_mule (vector multiply even)
1198  * ========
1199  */
vec_mule(vec_uchar16 a,vec_uchar16 b)1200 static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
1201 {
1202   vec_ushort8 hi, lo;
1203 
1204   hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
1205 			     (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
1206   lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
1207 			     (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));
1208 
1209   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1210 				             10, 11, 26, 27, 14, 15, 30, 31})));
1211 }
1212 
vec_mule(vec_char16 a,vec_char16 b)1213 static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
1214 {
1215   vec_short8 hi, lo;
1216 
1217   hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
1218 			    (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
1219   lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
1220 			    (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));
1221 
1222   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1223 				             10, 11, 26, 27, 14, 15, 30, 31})));
1224 }
1225 
vec_mule(vec_ushort8 a,vec_ushort8 b)1226 static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
1227 {
1228  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
1229 		  (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
1230 }
1231 
1232 
vec_mule(vec_short8 a,vec_short8 b)1233 static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
1234 {
1235  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
1236 		  (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
1237 }
1238 
1239 
1240 /* vec_mulo (vector multiply odd)
1241  * ========
1242  */
vec_mulo(vec_uchar16 a,vec_uchar16 b)1243 static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
1244 {
1245   vec_ushort8 hi, lo;
1246 
1247   hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
1248 			     (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
1249   lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
1250 
1251   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1252 				             10, 11, 26, 27, 14, 15, 30, 31})));
1253 }
1254 
vec_mulo(vec_char16 a,vec_char16 b)1255 static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
1256 {
1257   vec_short8 aa, bb, hi, lo;
1258 
1259   aa = spu_extend(a);
1260   bb = spu_extend(b);
1261 
1262   hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
1263 		(vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
1264   lo = (vec_short8)spu_mulo(aa, bb);
1265   return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1266 				             10, 11, 26, 27, 14, 15, 30, 31})));
1267 }
1268 
vec_mulo(vec_ushort8 a,vec_ushort8 b)1269 static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
1270 {
1271   return (spu_mulo(a, b));
1272 }
1273 
1274 
vec_mulo(vec_short8 a,vec_short8 b)1275 static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
1276 {
1277   return (spu_mulo(a, b));
1278 }
1279 
1280 
1281 /* vec_nmsub (vector negative multiply subtract)
1282  * =========
1283  */
1284 #define vec_nmsub(_a, _b, _c)	spu_nmsub(_a, _b, _c)
1285 
1286 
1287 /* vec_nor (vector logical nor)
1288  * =======
1289  */
1290 #define vec_nor(_a, _b)		spu_nor(_a, _b)
1291 
1292 
1293 /* vec_or (vector logical or)
1294  * ======
1295  */
vec_or(vec_uchar16 a,vec_uchar16 b)1296 static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
1297 {
1298   return (spu_or(a, b));
1299 }
1300 
vec_or(vec_char16 a,vec_char16 b)1301 static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
1302 {
1303   return (spu_or(a, b));
1304 }
1305 
vec_or(vec_bchar16 a,vec_char16 b)1306 static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
1307 {
1308   return (spu_or((vec_char16)(a), b));
1309 }
1310 
vec_or(vec_char16 a,vec_bchar16 b)1311 static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
1312 {
1313   return (spu_or(a, (vec_char16)(b)));
1314 }
1315 
vec_or(vec_ushort8 a,vec_ushort8 b)1316 static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
1317 {
1318   return (spu_or(a, b));
1319 }
1320 
vec_or(vec_short8 a,vec_short8 b)1321 static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
1322 {
1323   return (spu_or(a, b));
1324 }
1325 
vec_or(vec_bshort8 a,vec_short8 b)1326 static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
1327 {
1328   return (spu_or((vec_short8)(a), b));
1329 }
1330 
vec_or(vec_short8 a,vec_bshort8 b)1331 static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
1332 {
1333   return (spu_or(a, (vec_short8)(b)));
1334 }
1335 
vec_or(vec_uint4 a,vec_uint4 b)1336 static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
1337 {
1338   return (spu_or(a, b));
1339 }
1340 
vec_or(vec_int4 a,vec_int4 b)1341 static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
1342 {
1343   return (spu_or(a, b));
1344 }
1345 
vec_or(vec_bint4 a,vec_int4 b)1346 static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
1347 {
1348   return (spu_or((vec_int4)(a), b));
1349 }
1350 
vec_or(vec_int4 a,vec_bint4 b)1351 static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
1352 {
1353   return (spu_or(a, (vec_int4)(b)));
1354 }
1355 
vec_or(vec_float4 a,vec_float4 b)1356 static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
1357 {
1358   return (spu_or(a, b));
1359 }
1360 
vec_or(vec_bint4 a,vec_float4 b)1361 static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
1362 {
1363   return (spu_or((vec_float4)(a),b));
1364 }
1365 
vec_or(vec_float4 a,vec_bint4 b)1366 static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
1367 {
1368   return (spu_or(a, (vec_float4)(b)));
1369 }
1370 
1371 
1372 /* vec_pack (vector pack)
1373  * ========
1374  */
vec_pack(vec_ushort8 a,vec_ushort8 b)1375 static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
1376 {
1377   return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1378 					                17, 19, 21, 23, 25, 27, 29, 31})));
1379 }
1380 
vec_pack(vec_short8 a,vec_short8 b)1381 static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
1382 {
1383   return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1384 					               17, 19, 21, 23, 25, 27, 29, 31})));
1385 }
1386 
vec_pack(vec_uint4 a,vec_uint4 b)1387 static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
1388 {
1389   return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1390 					                18, 19, 22, 23, 26, 27, 30, 31})));
1391 }
1392 
vec_pack(vec_int4 a,vec_int4 b)1393 static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
1394 {
1395   return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1396 					               18, 19, 22, 23, 26, 27, 30, 31})));
1397 }
1398 
1399 
1400 /* vec_packpx (vector pack pixel)
1401  * ==========
1402  */
vec_packpx(vec_uint4 a,vec_uint4 b)1403 static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
1404 {
1405   vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
1406   vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));
1407 
1408   return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
1409 					   spu_sl(a, 13), x001F),
1410 				   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
1411 					   spu_sl(b, 13), x001F),
1412 				   ((vec_uchar16){ 0,  1,  4,  5,   8,  9, 12, 13,
1413 					          16, 17, 20, 21, 24, 25, 28, 29}))));
1414 }
1415 
1416 
1417 /* vec_packs (vector pack saturate)
1418  * =========
1419  */
vec_packs(vec_ushort8 a,vec_ushort8 b)1420 static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
1421 {
1422   vec_ushort8 max = spu_splats((unsigned short)0x00FF);
1423 
1424   return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
1425 				    spu_sel(b, max, spu_cmpgt(b, 255)),
1426 				    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1427 					           17, 19, 21, 23, 25, 27, 29, 31}))));
1428 }
1429 
vec_packs(vec_short8 a,vec_short8 b)1430 static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
1431 {
1432   vec_short8 max = spu_splats((signed short)0x007F);
1433   vec_short8 min = spu_splats((signed short)0xFF80);
1434 
1435   return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
1436 				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
1437 				   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1438 					          17, 19, 21, 23, 25, 27, 29, 31}))));
1439 }
1440 
vec_packs(vec_uint4 a,vec_uint4 b)1441 static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
1442 {
1443   vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);
1444 
1445   return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
1446 				    spu_sel(b, max, spu_cmpgt(b, max)),
1447 				    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1448 					           18, 19, 22, 23, 26, 27, 30, 31}))));
1449 }
1450 
vec_packs(vec_int4 a,vec_int4 b)1451 static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
1452 {
1453   vec_int4 max = spu_splats((signed int)0x00007FFF);
1454   vec_int4 min = spu_splats((signed int)0xFFFF8000);
1455 
1456   return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1457 				   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1458 				   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1459 					          18, 19, 22, 23, 26, 27, 30, 31}))));
1460 }
1461 
1462 
1463 /* vec_packsu (vector pack saturate unsigned)
1464  * ==========
1465  */
vec_packsu(vec_ushort8 a,vec_ushort8 b)1466 static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
1467 {
1468   return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
1469 				   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
1470 				   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1471 					          17, 19, 21, 23, 25, 27, 29, 31})));
1472 }
1473 
vec_packsu(vec_short8 a,vec_short8 b)1474 static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
1475 {
1476   vec_short8 max = spu_splats((signed short)0x00FF);
1477   vec_short8 min = spu_splats((signed short)0x0000);
1478 
1479   return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
1480 				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
1481 				    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1482 					           17, 19, 21, 23, 25, 27, 29, 31}))));
1483 
1484   return (vec_packsu((vec_ushort8)(a), (vec_ushort8)(b)));
1485 }
1486 
vec_packsu(vec_uint4 a,vec_uint4 b)1487 static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
1488 {
1489   vec_uint4 max = spu_splats((unsigned int)0xFFFF);
1490 
1491   return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
1492 				   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
1493 				   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1494 					          18, 19, 22, 23, 26, 27, 30, 31})));
1495 }
1496 
vec_packsu(vec_int4 a,vec_int4 b)1497 static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
1498 {
1499   vec_int4 max = spu_splats((signed int)0x0000FFFF);
1500   vec_int4 min = spu_splats((signed int)0x00000000);
1501 
1502   return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1503 				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1504 				    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1505 					           18, 19, 22, 23, 26, 27, 30, 31}))));
1506 }
1507 
1508 
1509 /* vec_perm (vector permute)
1510  * ========
1511  */
vec_perm(vec_uchar16 a,vec_uchar16 b,vec_uchar16 c)1512 static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
1513 {
1514   return (spu_shuffle(a, b, spu_and(c, 0x1F)));
1515 }
1516 
vec_perm(vec_char16 a,vec_char16 b,vec_uchar16 c)1517 static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
1518 {
1519   return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1520 }
1521 
vec_perm(vec_ushort8 a,vec_ushort8 b,vec_uchar16 c)1522 static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
1523 {
1524   return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1525 }
1526 
vec_perm(vec_short8 a,vec_short8 b,vec_uchar16 c)1527 static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
1528 {
1529   return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1530 }
1531 
vec_perm(vec_uint4 a,vec_uint4 b,vec_uchar16 c)1532 static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
1533 {
1534   return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1535 }
1536 
vec_perm(vec_int4 a,vec_int4 b,vec_uchar16 c)1537 static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
1538 {
1539   return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1540 }
1541 
vec_perm(vec_float4 a,vec_float4 b,vec_uchar16 c)1542 static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
1543 {
1544   return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1545 }
1546 
1547 
1548 /* vec_re (vector reciprocal estimate)
1549  * ======
1550  */
1551 #define vec_re(_a)	spu_re(_a)
1552 
1553 
1554 /* vec_rl (vector rotate left)
1555  * ======
1556  */
vec_rl(vec_uchar16 a,vec_uchar16 b)1557 static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
1558 {
1559   vec_ushort8 r1, r2;
1560 
1561   r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
1562   r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1563   return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
1564 }
1565 
vec_rl(vec_char16 a,vec_uchar16 b)1566 static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
1567 {
1568   return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
1569 }
1570 
vec_rl(vec_ushort8 a,vec_ushort8 b)1571 static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
1572 {
1573   return (spu_rl(a, (vec_short8)(b)));
1574 }
1575 
vec_rl(vec_short8 a,vec_ushort8 b)1576 static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
1577 {
1578   return (spu_rl(a, (vec_short8)(b)));
1579 }
1580 
vec_rl(vec_uint4 a,vec_uint4 b)1581 static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
1582 {
1583   return (spu_rl(a, (vec_int4)(b)));
1584 }
1585 
vec_rl(vec_int4 a,vec_uint4 b)1586 static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
1587 {
1588   return (spu_rl(a, (vec_int4)(b)));
1589 }
1590 
1591 
1592 /* vec_round (vector round)
1593  * =========
1594  */
vec_round(vec_float4 a)1595 static inline vec_float4 vec_round(vec_float4 a)
1596 {
1597   vec_float4 s_half, s_one, d;
1598   vec_uint4 odd;
1599   vec_uint4 msb = spu_splats((unsigned int)0x80000000);
1600   vec_float4 half = spu_splats(0.5f);
1601   vec_int4 exp;
1602   vec_uint4 mask;
1603 
1604   s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
1605   a = spu_add(a, s_half);
1606   s_one = spu_add(s_half, s_half);
1607   exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
1608   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
1609   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
1610   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
1611 
1612   odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
1613   s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
1614   s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
1615 				 (vec_float4)spu_cmpeq(odd, 1)));
1616   d = spu_andc(a, (vec_float4)(mask));
1617   d = spu_sub(d, s_one);
1618   return (d);
1619 }
1620 
1621 /* vec_rsqrte (vector reciprocal square root estimate)
1622  * ==========
1623  */
1624 #define vec_rsqrte(_a)	spu_rsqrte(_a)
1625 
1626 
1627 /* vec_sel (vector select)
1628  * =======
1629  */
1630 #define vec_sel(_a, _b, _c)	spu_sel(_a, _b, _c)
1631 
1632 
1633 /* vec_sl (vector shift left)
1634  * ======
1635  */
vec_sl(vec_uchar16 a,vec_uchar16 b)1636 static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
1637 {
1638   vec_ushort8 hi, lo;
1639 
1640   lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
1641   hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1642 
1643   return ((vec_uchar16)(spu_or(hi, lo)));
1644 }
1645 
vec_sl(vec_char16 a,vec_uchar16 b)1646 static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
1647 {
1648   return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
1649 }
1650 
vec_sl(vec_ushort8 a,vec_ushort8 b)1651 static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
1652 {
1653   return (spu_sl(a, spu_and(b, 15)));
1654 }
1655 
vec_sl(vec_short8 a,vec_ushort8 b)1656 static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
1657 {
1658   return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
1659 }
1660 
vec_sl(vec_uint4 a,vec_uint4 b)1661 static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
1662 {
1663   return (spu_sl(a, spu_and(b, 31)));
1664 }
1665 
vec_sl(vec_int4 a,vec_uint4 b)1666 static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
1667 {
1668   return (spu_sl(a, spu_and(b, 31)));
1669 }
1670 
1671 
1672 /* vec_sld (vector shift left double)
1673  * =======
1674  */
1675 #define vec_sld(_a, _b, _c)	spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c),  1+(_c),  2+(_c),  3+(_c),  \
1676 								    4+(_c),  5+(_c),  6+(_c),  7+(_c), 	\
1677 								    8+(_c),  9+(_c), 10+(_c), 11+(_c), 	\
1678 							           12+(_c), 13+(_c), 14+(_c), 15+(_c)}))
1679 
1680 
1681 /* vec_sll (vector shift left long)
1682  * =======
1683  */
1684 #define vec_sll(_a, _b)		spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))
1685 
1686 
1687 /* vec_slo (vector shift left by octet)
1688  * =======
1689  */
1690 #define vec_slo(_a, _b)		spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)
1691 
1692 
1693 /* vec_splat (vector splat)
1694  * =========
1695  */
1696 #define vec_splat(_a, _b)	spu_splats(spu_extract(_a, _b))
1697 
1698 
1699 /* vec_splat_s8 (vector splat signed byte)
1700  * ============
1701  */
1702 #define vec_splat_s8(_a)	spu_splats((signed char)(_a))
1703 
1704 
1705 /* vec_splat_s16 (vector splat signed half-word)
1706  * =============
1707  */
1708 #define vec_splat_s16(_a)	spu_splats((signed short)(_a))
1709 
1710 
1711 /* vec_splat_s32 (vector splat signed word)
1712  * =============
1713  */
1714 #define vec_splat_s32(_a)	spu_splats((signed int)(_a))
1715 
1716 
1717 /* vec_splat_u8 (vector splat unsigned byte)
1718  * ============
1719  */
1720 #define vec_splat_u8(_a)	spu_splats((unsigned char)(_a))
1721 
1722 
1723 /* vec_splat_u16 (vector splat unsigned half-word)
1724  * =============
1725  */
1726 #define vec_splat_u16(_a)	spu_splats((unsigned short)(_a))
1727 
1728 
1729 /* vec_splat_u32 (vector splat unsigned word)
1730  * =============
1731  */
1732 #define vec_splat_u32(_a)	spu_splats((unsigned int)(_a))
1733 
1734 
1735 /* vec_sr (vector shift right)
1736  * ======
1737  */
vec_sr(vec_uchar16 a,vec_uchar16 b)1738 static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
1739 {
1740   vec_ushort8 hi, lo;
1741 
1742   lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
1743   hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1744 
1745   return ((vec_uchar16)(spu_or(hi, lo)));
1746 }
1747 
vec_sr(vec_char16 a,vec_uchar16 b)1748 static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
1749 {
1750   return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
1751 }
1752 
vec_sr(vec_ushort8 a,vec_ushort8 b)1753 static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
1754 {
1755   return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1756 }
1757 
vec_sr(vec_short8 a,vec_ushort8 b)1758 static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
1759 {
1760   return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
1761 }
1762 
vec_sr(vec_uint4 a,vec_uint4 b)1763 static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
1764 {
1765   return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1766 }
1767 
vec_sr(vec_int4 a,vec_uint4 b)1768 static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
1769 {
1770   return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
1771 }
1772 
1773 
1774 /* vec_sra (vector shift right algebraic)
1775  * =======
1776  */
vec_sra(vec_char16 a,vec_uchar16 b)1777 static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
1778 {
1779   vec_short8 hi, lo;
1780 
1781   lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
1782   hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1783 
1784   return ((vec_char16)(spu_or(hi, lo)));
1785 }
1786 
vec_sra(vec_uchar16 a,vec_uchar16 b)1787 static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
1788 {
1789   return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
1790 }
1791 
vec_sra(vec_short8 a,vec_ushort8 b)1792 static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
1793 {
1794   return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1795 }
1796 
vec_sra(vec_ushort8 a,vec_ushort8 b)1797 static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
1798 {
1799   return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
1800 }
1801 
vec_sra(vec_int4 a,vec_uint4 b)1802 static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
1803 {
1804   return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1805 }
1806 
vec_sra(vec_uint4 a,vec_uint4 b)1807 static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
1808 {
1809   return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
1810 }
1811 
1812 
1813 /* vec_srl (vector shift right long)
1814  * =======
1815  */
1816 #define vec_srl(_a, _b)		spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))
1817 
1818 
1819 /* vec_sro (vector shift right by octet)
1820  * =======
1821  */
1822 #define vec_sro(_a, _b)		spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))
1823 
1824 /* vec_st (vector store indexed)
1825  * ======
1826  */
vec_st(vec_uchar16 a,int b,unsigned char * c)1827 static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
1828 {
1829   *((vec_uchar16 *)(c+b)) = a;
1830 }
1831 
vec_st(vec_uchar16 a,int b,vec_uchar16 * c)1832 static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
1833 {
1834   *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
1835 }
1836 
vec_st(vec_char16 a,int b,signed char * c)1837 static inline void vec_st(vec_char16 a, int b, signed char *c)
1838 {
1839   *((vec_char16 *)(c+b)) = a;
1840 }
1841 
vec_st(vec_char16 a,int b,vec_char16 * c)1842 static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
1843 {
1844   *((vec_char16 *)((signed char *)(c)+b)) = a;
1845 }
1846 
vec_st(vec_bchar16 a,int b,signed char * c)1847 static inline void vec_st(vec_bchar16 a, int b, signed char *c)
1848 {
1849   *((vec_bchar16 *)((signed char *)(c)+b)) = a;
1850 }
1851 
vec_st(vec_ushort8 a,int b,unsigned short * c)1852 static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
1853 {
1854   *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1855 }
1856 
vec_st(vec_ushort8 a,int b,vec_ushort8 * c)1857 static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
1858 {
1859   *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1860 }
1861 
vec_st(vec_short8 a,int b,signed short * c)1862 static inline void vec_st(vec_short8 a, int b, signed short *c)
1863 {
1864   *((vec_short8 *)((unsigned char *)(c)+b)) = a;
1865 }
1866 
vec_st(vec_short8 a,int b,vec_short8 * c)1867 static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
1868 {
1869   *((vec_short8 *)((signed char *)(c)+b)) = a;
1870 }
1871 
vec_st(vec_bshort8 a,int b,signed short * c)1872 static inline void vec_st(vec_bshort8 a, int b, signed short *c)
1873 {
1874   *((vec_bshort8 *)((signed char *)(c)+b)) = a;
1875 }
1876 
vec_st(vec_uint4 a,int b,unsigned int * c)1877 static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
1878 {
1879   *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1880 }
1881 
vec_st(vec_uint4 a,int b,vec_uint4 * c)1882 static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
1883 {
1884   *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1885 }
1886 
vec_st(vec_int4 a,int b,signed int * c)1887 static inline void vec_st(vec_int4 a, int b, signed int *c)
1888 {
1889   *((vec_int4 *)((unsigned char *)(c)+b)) = a;
1890 }
1891 
vec_st(vec_int4 a,int b,vec_int4 * c)1892 static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
1893 {
1894   *((vec_int4 *)((signed char *)(c)+b)) = a;
1895 }
1896 
vec_st(vec_bint4 a,int b,signed int * c)1897 static inline void vec_st(vec_bint4 a, int b, signed int *c)
1898 {
1899   *((vec_bint4 *)((signed char *)(c)+b)) = a;
1900 }
1901 
vec_st(vec_float4 a,int b,float * c)1902 static inline void vec_st(vec_float4 a, int b, float *c)
1903 {
1904   *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1905 }
1906 
vec_st(vec_float4 a,int b,vec_float4 * c)1907 static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
1908 {
1909   *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1910 }
1911 
1912 
1913 /* vec_ste (vector store element indexed)
1914  * =======
1915  */
vec_ste(vec_uchar16 a,int b,unsigned char * c)1916 static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
1917 {
1918   unsigned char *ptr;
1919 
1920   ptr = c + b;
1921   *ptr = spu_extract(a, (int)(ptr) & 15);
1922 }
1923 
vec_ste(vec_char16 a,int b,signed char * c)1924 static inline void vec_ste(vec_char16 a, int b, signed char *c)
1925 {
1926   vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1927 }
1928 
vec_ste(vec_bchar16 a,int b,signed char * c)1929 static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
1930 {
1931   vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1932 }
1933 
vec_ste(vec_ushort8 a,int b,unsigned short * c)1934 static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
1935 {
1936   unsigned short *ptr;
1937 
1938   ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
1939   *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
1940 }
1941 
vec_ste(vec_short8 a,int b,signed short * c)1942 static inline void vec_ste(vec_short8 a, int b, signed short *c)
1943 {
1944   vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1945 }
1946 
vec_ste(vec_bshort8 a,int b,signed short * c)1947 static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
1948 {
1949   vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1950 }
1951 
vec_ste(vec_uint4 a,int b,unsigned int * c)1952 static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
1953 {
1954   unsigned int *ptr;
1955 
1956   ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
1957   *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
1958 }
1959 
vec_ste(vec_int4 a,int b,signed int * c)1960 static inline void vec_ste(vec_int4 a, int b, signed int *c)
1961 {
1962   vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1963 }
1964 
vec_ste(vec_bint4 a,int b,signed int * c)1965 static inline void vec_ste(vec_bint4 a, int b, signed int *c)
1966 {
1967   vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1968 }
1969 
vec_ste(vec_float4 a,int b,float * c)1970 static inline void vec_ste(vec_float4 a, int b, float *c)
1971 {
1972   vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1973 }
1974 
1975 
1976 /* vec_stl (vector store indexed LRU)
1977  * =======
1978  */
1979 #define vec_stl(_a, _b, _c)		vec_st(_a, _b, _c)
1980 
1981 
1982 /* vec_sub (vector subtract)
1983  * =======
1984  */
vec_sub(vec_uchar16 a,vec_uchar16 b)1985 static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
1986 {
1987   return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
1988 				spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
1989 				spu_splats((unsigned short)0xFF00))));
1990 }
1991 
vec_sub(vec_char16 a,vec_char16 b)1992 static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
1993 {
1994   return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
1995 }
1996 
vec_sub(vec_bchar16 a,vec_char16 b)1997 static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
1998 {
1999   return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2000 }
2001 
vec_sub(vec_char16 a,vec_bchar16 b)2002 static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
2003 {
2004   return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2005 }
2006 
vec_sub(vec_ushort8 a,vec_ushort8 b)2007 static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
2008 {
2009   return (spu_sub(a, b));
2010 }
2011 
vec_sub(vec_short8 a,vec_short8 b)2012 static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
2013 {
2014   return (spu_sub(a, b));
2015 }
2016 
vec_sub(vec_bshort8 a,vec_short8 b)2017 static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
2018 {
2019   return (spu_sub((vec_short8)(a), b));
2020 }
2021 
vec_sub(vec_short8 a,vec_bshort8 b)2022 static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
2023 {
2024   return (spu_sub(a, (vec_short8)(b)));
2025 }
2026 
vec_sub(vec_uint4 a,vec_uint4 b)2027 static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
2028 {
2029   return (spu_sub(a, b));
2030 }
2031 
vec_sub(vec_int4 a,vec_int4 b)2032 static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
2033 {
2034   return (spu_sub(a, b));
2035 }
2036 
vec_sub(vec_bint4 a,vec_int4 b)2037 static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
2038 {
2039   return (spu_sub((vec_int4)(a), b));
2040 }
2041 
vec_sub(vec_int4 a,vec_bint4 b)2042 static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
2043 {
2044   return (spu_sub(a, (vec_int4)(b)));
2045 }
2046 
vec_sub(vec_float4 a,vec_float4 b)2047 static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
2048 {
2049   return (spu_sub(a, b));
2050 }
2051 
2052 
2053 /* vec_subc (vector subtract carryout)
2054  * ========
2055  */
2056 #define vec_subc(_a, _b)	spu_genb(_a, _b)
2057 
2058 
2059 /* vec_subs (vector subtract saturate)
2060  * ========
2061  */
vec_subs(vec_uchar16 a,vec_uchar16 b)2062 static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
2063 {
2064   vec_ushort8 s1, s2;
2065   vec_uchar16 s, d;
2066 
2067   s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2068   s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2069   s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
2070 					                8, 24, 10, 26, 12, 28, 14, 30})));
2071   d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2072 					                9, 25, 11, 27, 13, 29, 15, 31})));
2073   return (spu_andc(d, s));
2074 }
2075 
vec_subs(vec_char16 a,vec_char16 b)2076 static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
2077 {
2078   vec_ushort8 s1, s2;
2079   vec_uchar16 s, d;
2080 
2081   s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2082   s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2083   s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2084 					                9, 25, 11, 27, 13, 29, 15, 31})));
2085   d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
2086   d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));
2087 
2088   return ((vec_char16)(d));
2089 }
2090 
vec_subs(vec_bchar16 a,vec_char16 b)2091 static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
2092 {
2093   return (vec_subs((vec_char16)(a), b));
2094 }
2095 
vec_subs(vec_char16 a,vec_bchar16 b)2096 static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
2097 {
2098   return (vec_subs(a, (vec_char16)(b)));
2099 }
2100 
vec_subs(vec_ushort8 a,vec_ushort8 b)2101 static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
2102 {
2103   return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2104 }
2105 
vec_subs(vec_short8 a,vec_short8 b)2106 static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
2107 {
2108   vec_short8 s;
2109   vec_short8 d;
2110 
2111   s = spu_sub(a, b);
2112   d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
2113   d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));
2114 
2115   return (d);
2116 }
2117 
vec_subs(vec_bshort8 a,vec_short8 b)2118 static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
2119 {
2120   return ((vec_short8)(vec_subs((vec_short8)(a), b)));
2121 }
2122 
vec_subs(vec_short8 a,vec_bshort8 b)2123 static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
2124 {
2125   return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
2126 }
2127 
vec_subs(vec_uint4 a,vec_uint4 b)2128 static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
2129 {
2130   return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2131 }
2132 
vec_subs(vec_int4 a,vec_int4 b)2133 static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
2134 {
2135   vec_int4 s;
2136   vec_int4 d;
2137 
2138   s = spu_sub(a, b);
2139   d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
2140   d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));
2141 
2142   return (d);
2143 }
2144 
vec_subs(vec_bint4 a,vec_int4 b)2145 static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
2146 {
2147   return ((vec_int4)(vec_subs((vec_int4)(a), b)));
2148 }
2149 
vec_subs(vec_int4 a,vec_bint4 b)2150 static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
2151 {
2152   return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
2153 }
2154 
2155 
2156 /* vec_sum4s (vector sum across partial (1/4) saturated)
2157  * =========
2158  */
vec_sum4s(vec_uchar16 a,vec_uint4 b)2159 static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2160 {
2161   vec_uint4 a01_23, a0123;
2162 
2163   a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2164 			       spu_and((vec_ushort8)(a), 0xFF)));
2165   a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2166   return (vec_adds(a0123, b));
2167 }
2168 
vec_sum4s(vec_char16 a,vec_int4 b)2169 static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2170 {
2171   vec_int4 a01_23, a0123;
2172 
2173   a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2174 			      spu_extend(a)));
2175   a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2176   return (vec_adds(a0123, b));
2177 }
2178 
vec_sum4s(vec_short8 a,vec_int4 b)2179 static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2180 {
2181   vec_int4 a0123;
2182 
2183   a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2184   return (vec_adds(a0123, b));
2185 }
2186 
2187 
2188 /* vec_sum2s (vector sum across partial (1/2) saturated)
2189  * =========
2190  */
vec_sum2s(vec_int4 a,vec_int4 b)2191 static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2192 {
2193   vec_int4 c, d;
2194   vec_int4 sign1, sign2, sign3;
2195   vec_int4 carry, sum_l, sum_h, sat, sat_val;
2196 
2197   sign1 = spu_rlmaska(a, -31);
2198   sign2 = spu_rlmaska(b, -31);
2199 
2200   c = spu_rlqwbyte(a, -4);
2201   sign3 = spu_rlqwbyte(sign1, -4);
2202 
2203   carry = spu_genc(a, b);
2204   sum_l = spu_add(a, b);
2205   sum_h = spu_addx(sign1, sign2, carry);
2206 
2207   carry = spu_genc(sum_l, c);
2208   sum_l = spu_add(sum_l, c);
2209   sum_h = spu_addx(sum_h, sign3, carry);
2210 
2211   sign1 = spu_rlmaska(sum_l, -31);
2212   sign2 = spu_rlmaska(sum_h, -31);
2213 
2214   sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2215 
2216   sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2217 
2218   d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2219 
2220   return (d);
2221 }
2222 
2223 
2224 /* vec_sums (vector sum saturated)
2225  * ========
2226  */
vec_sums(vec_int4 a,vec_int4 b)2227 static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2228 {
2229   vec_int4 a0, a1, a2, c0, c1, c2, d;
2230   vec_int4 sign_a, sign_b, sign_l, sign_h;
2231   vec_int4 sum_l, sum_h, sat, sat_val;
2232 
2233   sign_a = spu_rlmaska(a, -31);
2234   sign_b = spu_rlmaska(b, -31);
2235 
2236   a0 = spu_rlqwbyte(a, -12);
2237   a1 = spu_rlqwbyte(a, -8);
2238   a2 = spu_rlqwbyte(a, -4);
2239 
2240   sum_l = spu_add(a, b);
2241   sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2242 
2243   c2 = spu_genc(sum_l, a2);
2244   sum_l = spu_add(sum_l, a2);
2245   sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2246 
2247   c1 = spu_genc(sum_l, a1);
2248   sum_l = spu_add(sum_l, a1);
2249   sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2250 
2251   c0 = spu_genc(sum_l, a0);
2252   sum_l = spu_add(sum_l, a0);
2253   sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2254 
2255   sign_l = spu_rlmaska(sum_l, -31);
2256   sign_h = spu_rlmaska(sum_h, -31);
2257 
2258   sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2259 
2260   sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2261 
2262   d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2263 
2264   return (d);
2265 }
2266 
2267 
2268 /* vec_trunc (vector truncate)
2269  * =========
2270  */
vec_trunc(vec_float4 a)2271 static inline vec_float4 vec_trunc(vec_float4 a)
2272 {
2273   vec_int4 exp;
2274   vec_uint4 mask;
2275 
2276   exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2277   mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2278   mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2279   mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2280   return (spu_andc(a, (vec_float4)(mask)));
2281 }
2282 
2283 /* vec_unpackh (vector unpack high element)
2284  * ===========
2285  */
vec_unpackh(vec_char16 a)2286 static inline vec_short8 vec_unpackh(vec_char16 a)
2287 {
2288   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2289 					              4, 4, 5, 5, 6, 6, 7, 7}))));
2290 }
2291 
vec_unpackh(vec_bchar16 a)2292 static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2293 {
2294   return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2295 }
2296 
vec_unpackh(vec_short8 a)2297 static inline vec_int4 vec_unpackh(vec_short8 a)
2298 {
2299   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2300 					              0, 0, 4, 5, 0, 0, 6, 7}))));
2301 }
2302 
2303 #ifdef SUPPORT_UNPACK_PIXEL
2304 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2305  * can not simultaneously be supported. By default, the boolean short is
2306  * supported.
2307  */
vec_unpackh(vec_pixel8 a)2308 static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2309 {
2310   vec_ushort8 p1, p2;
2311 
2312   p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2313 		   spu_and((vec_ushort8)(a.p), 0x1F),
2314 		   ((vec_uchar16){ 0, 128, 128, 17,  2, 128, 128, 19,
2315 			           4, 128, 128, 21,  6, 128, 128, 23}));
2316   p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2317 		   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2318 		   ((vec_uchar16){ 128,  17, 1, 128, 128,  19, 3, 128,
2319 			           128,  21, 5, 128, 128,  23, 7, 128}));
2320   return ((vec_uint4)(spu_or(p1, p2)));
2321 }
2322 
2323 #else
2324 
vec_unpackh(vec_bshort8 a)2325 static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2326 {
2327   return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2328 }
2329 #endif
2330 
2331 
2332 
2333 
2334 
2335 /* vec_unpackl (vector unpack low element)
2336  * ===========
2337  */
vec_unpackl(vec_char16 a)2338 static inline vec_short8 vec_unpackl(vec_char16 a)
2339 {
2340   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2341 					              12, 12, 13, 13, 14, 14, 15, 15}))));
2342 }
2343 
vec_unpackl(vec_bchar16 a)2344 static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2345 {
2346   return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2347 }
2348 
2349 
vec_unpackl(vec_short8 a)2350 static inline vec_int4 vec_unpackl(vec_short8 a)
2351 {
2352   return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2353 					              0, 0,12,13, 0, 0, 14, 15}))));
2354 }
2355 
2356 
2357 #ifdef SUPPORT_UNPACK_PIXEL
2358 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2359  * can not simultaneously be supported. By default, the boolean short is
2360  * supported.
2361  */
vec_unpackl(vec_pixel8 a)2362 static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2363 {
2364   vec_ushort8 p1, p2;
2365 
2366   p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a), -7)),
2367 		   spu_and((vec_ushort8)(a), 0x1F),
2368 		   ((vec_uchar16){ 8, 128, 128, 25,  10, 128, 128, 27,
2369 			          12, 128, 128, 29,  14, 128, 128, 31}));
2370   p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a), -5), 0x1F),
2371 		   spu_and(spu_rlmask((vec_ushort8)(a), -10), 0x1F),
2372 		   ((vec_uchar16){ 128, 25,  9, 128, 128, 27, 11, 128,
2373 			           128, 29, 13, 128, 128, 31, 15, 128}));
2374   return ((vec_uint4)(spu_or(p1, p2)));
2375 }
2376 
2377 #else
2378 
vec_unpackl(vec_bshort8 a)2379 static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2380 {
2381   return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
2382 
2383 }
2384 #endif
2385 
2386 
2387 
2388 /* vec_xor (vector logical xor)
2389  * ======
2390  */
vec_xor(vec_uchar16 a,vec_uchar16 b)2391 static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2392 {
2393   return (spu_xor(a, b));
2394 }
2395 
vec_xor(vec_char16 a,vec_char16 b)2396 static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2397 {
2398   return (spu_xor(a, b));
2399 }
2400 
vec_xor(vec_bchar16 a,vec_char16 b)2401 static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2402 {
2403   return (spu_xor((vec_char16)(a), b));
2404 }
2405 
vec_xor(vec_char16 a,vec_bchar16 b)2406 static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2407 {
2408   return (spu_xor(a, (vec_char16)(b)));
2409 }
2410 
vec_xor(vec_ushort8 a,vec_ushort8 b)2411 static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2412 {
2413   return (spu_xor(a, b));
2414 }
2415 
vec_xor(vec_short8 a,vec_short8 b)2416 static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2417 {
2418   return (spu_xor(a, b));
2419 }
2420 
vec_xor(vec_bshort8 a,vec_short8 b)2421 static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2422 {
2423   return (spu_xor((vec_short8)(a), b));
2424 }
2425 
vec_xor(vec_short8 a,vec_bshort8 b)2426 static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2427 {
2428   return (spu_xor(a, (vec_short8)(b)));
2429 }
2430 
vec_xor(vec_uint4 a,vec_uint4 b)2431 static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2432 {
2433   return (spu_xor(a, b));
2434 }
2435 
vec_xor(vec_int4 a,vec_int4 b)2436 static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2437 {
2438   return (spu_xor(a, b));
2439 }
2440 
vec_xor(vec_bint4 a,vec_int4 b)2441 static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2442 {
2443   return (spu_xor((vec_int4)(a), b));
2444 }
2445 
vec_xor(vec_int4 a,vec_bint4 b)2446 static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2447 {
2448   return (spu_xor(a, (vec_int4)(b)));
2449 }
2450 
vec_xor(vec_float4 a,vec_float4 b)2451 static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2452 {
2453   return (spu_xor(a, b));
2454 }
2455 
vec_xor(vec_bint4 a,vec_float4 b)2456 static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2457 {
2458   return (spu_xor((vec_float4)(a),b));
2459 }
2460 
vec_xor(vec_float4 a,vec_bint4 b)2461 static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2462 {
2463   return (spu_xor(a, (vec_float4)(b)));
2464 }
2465 
2466 /************************************************************************
2467  *                        PREDICATES
2468  ************************************************************************/
2469 
2470 /* vec_all_eq (all elements equal)
2471  * ==========
2472  */
vec_all_eq(vec_uchar16 a,vec_uchar16 b)2473 static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2474 {
2475   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2476 }
2477 
vec_all_eq(vec_char16 a,vec_char16 b)2478 static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2479 {
2480   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2481 }
2482 
vec_all_eq(vec_bchar16 a,vec_char16 b)2483 static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2484 {
2485   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2486 }
2487 
vec_all_eq(vec_char16 a,vec_bchar16 b)2488 static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2489 {
2490   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2491 }
2492 
vec_all_eq(vec_ushort8 a,vec_ushort8 b)2493 static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2494 {
2495   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2496 }
2497 
vec_all_eq(vec_short8 a,vec_short8 b)2498 static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2499 {
2500   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2501 }
2502 
vec_all_eq(vec_bshort8 a,vec_short8 b)2503 static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2504 {
2505   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2506 }
2507 
vec_all_eq(vec_short8 a,vec_bshort8 b)2508 static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2509 {
2510   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2511 }
2512 
vec_all_eq(vec_uint4 a,vec_uint4 b)2513 static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2514 {
2515   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2516 }
2517 
vec_all_eq(vec_int4 a,vec_int4 b)2518 static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2519 {
2520   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2521 }
2522 
vec_all_eq(vec_bint4 a,vec_int4 b)2523 static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2524 {
2525   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2526 }
2527 
vec_all_eq(vec_int4 a,vec_bint4 b)2528 static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2529 {
2530   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2531 }
2532 
vec_all_eq(vec_float4 a,vec_float4 b)2533 static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2534 {
2535   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2536 }
2537 
2538 
2539 /* vec_all_ge (all elements greater than or equal)
2540  * ==========
2541  */
vec_all_ge(vec_uchar16 a,vec_uchar16 b)2542 static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2543 {
2544   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2545 }
2546 
vec_all_ge(vec_char16 a,vec_char16 b)2547 static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2548 {
2549   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2550 }
2551 
vec_all_ge(vec_bchar16 a,vec_char16 b)2552 static inline  int vec_all_ge(vec_bchar16 a, vec_char16 b)
2553 {
2554   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2555 }
2556 
vec_all_ge(vec_char16 a,vec_bchar16 b)2557 static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2558 {
2559   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2560 }
2561 
vec_all_ge(vec_ushort8 a,vec_ushort8 b)2562 static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2563 {
2564   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2565 }
2566 
vec_all_ge(vec_short8 a,vec_short8 b)2567 static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2568 {
2569   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2570 }
2571 
vec_all_ge(vec_bshort8 a,vec_short8 b)2572 static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2573 {
2574   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2575 }
2576 
vec_all_ge(vec_short8 a,vec_bshort8 b)2577 static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2578 {
2579   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2580 }
2581 
vec_all_ge(vec_uint4 a,vec_uint4 b)2582 static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2583 {
2584   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2585 }
2586 
vec_all_ge(vec_int4 a,vec_int4 b)2587 static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2588 {
2589   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2590 }
2591 
vec_all_ge(vec_bint4 a,vec_int4 b)2592 static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2593 {
2594   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2595 }
2596 
vec_all_ge(vec_int4 a,vec_bint4 b)2597 static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2598 {
2599   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2600 }
2601 
vec_all_ge(vec_float4 a,vec_float4 b)2602 static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2603 {
2604   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2605 }
2606 
2607 
2608 /* vec_all_gt (all elements greater than)
2609  * ==========
2610  */
vec_all_gt(vec_uchar16 a,vec_uchar16 b)2611 static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2612 {
2613   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2614 }
2615 
vec_all_gt(vec_char16 a,vec_char16 b)2616 static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2617 {
2618   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2619 }
2620 
vec_all_gt(vec_bchar16 a,vec_char16 b)2621 static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2622 {
2623   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2624 }
2625 
vec_all_gt(vec_char16 a,vec_bchar16 b)2626 static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2627 {
2628   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2629 }
2630 
vec_all_gt(vec_ushort8 a,vec_ushort8 b)2631 static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2632 {
2633   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2634 }
2635 
vec_all_gt(vec_short8 a,vec_short8 b)2636 static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2637 {
2638   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2639 }
2640 
vec_all_gt(vec_bshort8 a,vec_short8 b)2641 static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2642 {
2643   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2644 }
2645 
vec_all_gt(vec_short8 a,vec_bshort8 b)2646 static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2647 {
2648   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2649 }
2650 
vec_all_gt(vec_uint4 a,vec_uint4 b)2651 static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2652 {
2653   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2654 }
2655 
vec_all_gt(vec_int4 a,vec_int4 b)2656 static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2657 {
2658   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2659 }
2660 
vec_all_gt(vec_bint4 a,vec_int4 b)2661 static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2662 {
2663   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2664 }
2665 
vec_all_gt(vec_int4 a,vec_bint4 b)2666 static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2667 {
2668   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2669 }
2670 
vec_all_gt(vec_float4 a,vec_float4 b)2671 static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2672 {
2673   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2674 }
2675 
2676 
2677 /* vec_all_in (all elements in bounds)
2678  * ==========
2679  */
vec_all_in(vec_float4 a,vec_float4 b)2680 static inline int vec_all_in(vec_float4 a, vec_float4 b)
2681 {
2682   return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
2683 }
2684 
2685 
2686 /* vec_all_le (all elements less than or equal)
2687  * ==========
2688  */
vec_all_le(vec_uchar16 a,vec_uchar16 b)2689 static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2690 {
2691   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2692 }
2693 
vec_all_le(vec_char16 a,vec_char16 b)2694 static inline int vec_all_le(vec_char16 a, vec_char16 b)
2695 {
2696   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2697 }
2698 
vec_all_le(vec_bchar16 a,vec_char16 b)2699 static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2700 {
2701   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2702 }
2703 
vec_all_le(vec_char16 a,vec_bchar16 b)2704 static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2705 {
2706   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2707 }
2708 
vec_all_le(vec_ushort8 a,vec_ushort8 b)2709 static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2710 {
2711   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2712 }
2713 
vec_all_le(vec_short8 a,vec_short8 b)2714 static inline int vec_all_le(vec_short8 a, vec_short8 b)
2715 {
2716   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2717 }
2718 
vec_all_le(vec_bshort8 a,vec_short8 b)2719 static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2720 {
2721   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2722 }
2723 
vec_all_le(vec_short8 a,vec_bshort8 b)2724 static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2725 {
2726   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2727 }
2728 
vec_all_le(vec_uint4 a,vec_uint4 b)2729 static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2730 {
2731   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2732 }
2733 
vec_all_le(vec_int4 a,vec_int4 b)2734 static inline int vec_all_le(vec_int4 a, vec_int4 b)
2735 {
2736   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2737 }
2738 
vec_all_le(vec_bint4 a,vec_int4 b)2739 static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2740 {
2741   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2742 }
2743 
vec_all_le(vec_int4 a,vec_bint4 b)2744 static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2745 {
2746   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2747 }
2748 
vec_all_le(vec_float4 a,vec_float4 b)2749 static inline int vec_all_le(vec_float4 a, vec_float4 b)
2750 {
2751   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2752 }
2753 
2754 
2755 /* vec_all_lt (all elements less than)
2756  * ==========
2757  */
vec_all_lt(vec_uchar16 a,vec_uchar16 b)2758 static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2759 {
2760   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2761 }
2762 
vec_all_lt(vec_char16 a,vec_char16 b)2763 static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2764 {
2765   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2766 }
2767 
vec_all_lt(vec_bchar16 a,vec_char16 b)2768 static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2769 {
2770   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2771 }
2772 
vec_all_lt(vec_char16 a,vec_bchar16 b)2773 static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2774 {
2775   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2776 }
2777 
vec_all_lt(vec_ushort8 a,vec_ushort8 b)2778 static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2779 {
2780   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2781 }
2782 
vec_all_lt(vec_short8 a,vec_short8 b)2783 static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2784 {
2785   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2786 }
2787 
vec_all_lt(vec_bshort8 a,vec_short8 b)2788 static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2789 {
2790   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2791 }
2792 
vec_all_lt(vec_short8 a,vec_bshort8 b)2793 static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2794 {
2795   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2796 }
2797 
vec_all_lt(vec_uint4 a,vec_uint4 b)2798 static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2799 {
2800   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2801 }
2802 
vec_all_lt(vec_int4 a,vec_int4 b)2803 static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2804 {
2805   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2806 }
2807 
vec_all_lt(vec_bint4 a,vec_int4 b)2808 static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2809 {
2810   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2811 }
2812 
vec_all_lt(vec_int4 a,vec_bint4 b)2813 static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2814 {
2815   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2816 }
2817 
vec_all_lt(vec_float4 a,vec_float4 b)2818 static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2819 {
2820   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2821 }
2822 
2823 
2824 /* vec_all_nan (all elements not a number)
2825  * ===========
2826  */
vec_all_nan(vec_float4 a)2827 static inline int vec_all_nan(vec_float4 a)
2828 {
2829   vec_uint4 exp, man;
2830   vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2831 
2832   exp = spu_and((vec_uint4)(a), exp_mask);
2833   man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2834   return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2835 						spu_cmpeq(man, 0))), 0) == 0xF));
2836 }
2837 
2838 #define vec_all_nan(_a)		(0)
2839 
2840 
2841 /* vec_all_ne (all elements not equal)
2842  * ==========
2843  */
vec_all_ne(vec_uchar16 a,vec_uchar16 b)2844 static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2845 {
2846   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2847 }
2848 
vec_all_ne(vec_char16 a,vec_char16 b)2849 static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2850 {
2851   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2852 }
2853 
vec_all_ne(vec_bchar16 a,vec_char16 b)2854 static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2855 {
2856   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2857 }
2858 
vec_all_ne(vec_char16 a,vec_bchar16 b)2859 static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2860 {
2861   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2862 }
2863 
vec_all_ne(vec_ushort8 a,vec_ushort8 b)2864 static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2865 {
2866   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2867 }
2868 
vec_all_ne(vec_short8 a,vec_short8 b)2869 static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2870 {
2871   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2872 }
2873 
vec_all_ne(vec_bshort8 a,vec_short8 b)2874 static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2875 {
2876   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2877 }
2878 
vec_all_ne(vec_short8 a,vec_bshort8 b)2879 static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2880 {
2881   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2882 }
2883 
vec_all_ne(vec_uint4 a,vec_uint4 b)2884 static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2885 {
2886   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2887 }
2888 
vec_all_ne(vec_int4 a,vec_int4 b)2889 static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2890 {
2891   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2892 }
2893 
vec_all_ne(vec_bint4 a,vec_int4 b)2894 static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2895 {
2896   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2897 }
2898 
vec_all_ne(vec_int4 a,vec_bint4 b)2899 static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2900 {
2901   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2902 }
2903 
vec_all_ne(vec_float4 a,vec_float4 b)2904 static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2905 {
2906   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2907 }
2908 
2909 
2910 /* vec_all_nge (all elements not greater than or equal)
2911  * ===========
2912  */
vec_all_nge(vec_float4 a,vec_float4 b)2913 static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2914 {
2915   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2916 }
2917 
2918 
2919 /* vec_all_ngt (all elements not greater than)
2920  * ===========
2921  */
vec_all_ngt(vec_float4 a,vec_float4 b)2922 static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2923 {
2924   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2925 }
2926 
2927 
2928 /* vec_all_nle (all elements not less than or equal)
2929  * ===========
2930  */
vec_all_nle(vec_float4 a,vec_float4 b)2931 static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2932 {
2933   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2934 }
2935 
2936 
2937 /* vec_all_nlt (all elements not less than)
2938  * ===========
2939  */
vec_all_nlt(vec_float4 a,vec_float4 b)2940 static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2941 {
2942   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2943 }
2944 
2945 
2946 /* vec_all_numeric (all elements numeric)
2947  * ===========
2948  */
vec_all_numeric(vec_float4 a)2949 static inline int vec_all_numeric(vec_float4 a)
2950 {
2951   vec_uint4 exp;
2952 
2953   exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2954   return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
2955 }
2956 
2957 
2958 
2959 /* vec_any_eq (any elements equal)
2960  * ==========
2961  */
vec_any_eq(vec_uchar16 a,vec_uchar16 b)2962 static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2963 {
2964   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2965 }
2966 
vec_any_eq(vec_char16 a,vec_char16 b)2967 static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2968 {
2969   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2970 }
2971 
vec_any_eq(vec_bchar16 a,vec_char16 b)2972 static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2973 {
2974   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2975 }
2976 
vec_any_eq(vec_char16 a,vec_bchar16 b)2977 static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2978 {
2979   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2980 }
2981 
vec_any_eq(vec_ushort8 a,vec_ushort8 b)2982 static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2983 {
2984   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2985 }
2986 
vec_any_eq(vec_short8 a,vec_short8 b)2987 static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2988 {
2989   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2990 }
2991 
vec_any_eq(vec_bshort8 a,vec_short8 b)2992 static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2993 {
2994   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2995 }
2996 
vec_any_eq(vec_short8 a,vec_bshort8 b)2997 static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
2998 {
2999   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3000 }
3001 
vec_any_eq(vec_uint4 a,vec_uint4 b)3002 static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3003 {
3004   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3005 }
3006 
vec_any_eq(vec_int4 a,vec_int4 b)3007 static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3008 {
3009   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3010 }
3011 
vec_any_eq(vec_bint4 a,vec_int4 b)3012 static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3013 {
3014   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3015 }
3016 
vec_any_eq(vec_int4 a,vec_bint4 b)3017 static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3018 {
3019   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3020 }
3021 
vec_any_eq(vec_float4 a,vec_float4 b)3022 static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3023 {
3024   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3025 }
3026 
3027 /* vec_any_ge (any elements greater than or equal)
3028  * ==========
3029  */
vec_any_ge(vec_uchar16 a,vec_uchar16 b)3030 static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
3031 {
3032   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3033 }
3034 
vec_any_ge(vec_char16 a,vec_char16 b)3035 static inline int vec_any_ge(vec_char16 a, vec_char16 b)
3036 {
3037   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3038 }
3039 
vec_any_ge(vec_bchar16 a,vec_char16 b)3040 static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
3041 {
3042   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
3043 }
3044 
vec_any_ge(vec_char16 a,vec_bchar16 b)3045 static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
3046 {
3047   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
3048 }
3049 
vec_any_ge(vec_ushort8 a,vec_ushort8 b)3050 static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
3051 {
3052   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3053 }
3054 
vec_any_ge(vec_short8 a,vec_short8 b)3055 static inline int vec_any_ge(vec_short8 a, vec_short8 b)
3056 {
3057   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3058 }
3059 
vec_any_ge(vec_bshort8 a,vec_short8 b)3060 static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
3061 {
3062   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
3063 }
3064 
vec_any_ge(vec_short8 a,vec_bshort8 b)3065 static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
3066 {
3067   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
3068 }
3069 
vec_any_ge(vec_uint4 a,vec_uint4 b)3070 static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
3071 {
3072   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3073 }
3074 
vec_any_ge(vec_int4 a,vec_int4 b)3075 static inline int vec_any_ge(vec_int4 a, vec_int4 b)
3076 {
3077   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3078 }
3079 
vec_any_ge(vec_bint4 a,vec_int4 b)3080 static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
3081 {
3082   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
3083 }
3084 
vec_any_ge(vec_int4 a,vec_bint4 b)3085 static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
3086 {
3087   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
3088 }
3089 
vec_any_ge(vec_float4 a,vec_float4 b)3090 static inline int vec_any_ge(vec_float4 a, vec_float4 b)
3091 {
3092   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3093 }
3094 
3095 
3096 /* vec_any_gt (any elements greater than)
3097  * ==========
3098  */
vec_any_gt(vec_uchar16 a,vec_uchar16 b)3099 static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
3100 {
3101   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3102 }
3103 
vec_any_gt(vec_char16 a,vec_char16 b)3104 static inline int vec_any_gt(vec_char16 a, vec_char16 b)
3105 {
3106   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3107 }
3108 
vec_any_gt(vec_bchar16 a,vec_char16 b)3109 static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
3110 {
3111   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
3112 }
3113 
vec_any_gt(vec_char16 a,vec_bchar16 b)3114 static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
3115 {
3116   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
3117 }
3118 
vec_any_gt(vec_ushort8 a,vec_ushort8 b)3119 static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
3120 {
3121   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3122 }
3123 
vec_any_gt(vec_short8 a,vec_short8 b)3124 static inline int vec_any_gt(vec_short8 a, vec_short8 b)
3125 {
3126   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3127 }
3128 
vec_any_gt(vec_bshort8 a,vec_short8 b)3129 static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
3130 {
3131   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
3132 }
3133 
vec_any_gt(vec_short8 a,vec_bshort8 b)3134 static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
3135 {
3136   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
3137 }
3138 
3139 
vec_any_gt(vec_uint4 a,vec_uint4 b)3140 static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
3141 {
3142   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3143 }
3144 
vec_any_gt(vec_int4 a,vec_int4 b)3145 static inline int vec_any_gt(vec_int4 a, vec_int4 b)
3146 {
3147   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3148 }
3149 
vec_any_gt(vec_bint4 a,vec_int4 b)3150 static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
3151 {
3152   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
3153 }
3154 
vec_any_gt(vec_int4 a,vec_bint4 b)3155 static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
3156 {
3157   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
3158 }
3159 
vec_any_gt(vec_float4 a,vec_float4 b)3160 static inline int vec_any_gt(vec_float4 a, vec_float4 b)
3161 {
3162   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3163 }
3164 
/* vec_any_le (any elements less than or equal)
 * ==========
 */
static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
}

static inline int vec_any_le(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
}

static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
}

static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
}

static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}

static inline int vec_any_le(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}

static inline int vec_any_le(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
}

static inline int vec_any_le(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
}

static inline int vec_any_le(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}


/* vec_any_lt (any elements less than)
 * ==========
 */
static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
}

static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
}

static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
}

static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
}

static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

static inline int vec_any_lt(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
}

static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
}

static inline int vec_any_lt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

/* vec_any_nan (any elements not a number)
 * ===========
 */
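/* An element is a NaN when its exponent field is all ones and its mantissa
 * is nonzero, so the test below gathers the elements for which the masked
 * exponent equals 0x7F800000 and the masked mantissa is not zero.
 */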
static inline int vec_any_nan(vec_float4 a)
{
  vec_uint4 exp, man;
  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);

  exp = spu_and((vec_uint4)(a), exp_mask);
  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
                                                spu_cmpeq(man, 0))), 0) != 0));
}


/* vec_any_ne (any elements not equal)
 * ==========
 */
static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
}

static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
}

static inline int vec_any_ne(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}


/* vec_any_nge (any elements not greater than or equal)
 * ===========
 */
static inline int vec_any_nge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

/* vec_any_ngt (any elements not greater than)
 * ===========
 */
static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}


/* vec_any_nle (any elements not less than or equal)
 * ===========
 */
static inline int vec_any_nle(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}


/* vec_any_nlt (any elements not less than)
 * ===========
 */
static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}


/* vec_any_numeric (any elements numeric)
 * ===============
 */
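/* An element is considered numeric when its exponent field is not all ones
 * (i.e. the bit pattern is neither an infinity nor a NaN), so the predicate
 * is true unless every element has an exponent of 255.
 */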
static inline int vec_any_numeric(vec_float4 a)
{
  vec_uint4 exp;

  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
}


/* vec_any_out (any elements out of bounds)
 * ===========
 */
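/* An element of a is in bounds when its magnitude does not exceed the
 * corresponding (non-negative) element of b.  The spu_nor() below is all
 * ones only for in-bounds elements, so the gathered mask differs from 0xF
 * whenever at least one element is out of bounds or b is negative.
 */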
static inline int vec_any_out(vec_float4 a, vec_float4 b)
{
  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
}


/* CBE Language Extension Intrinsics
 */

/* vec_extract (extract element from vector)
 * ===========
 */
#define vec_extract(_a, _element)	spu_extract(_a, _element)


/* vec_insert (insert scalar into specified vector element)
 * ==========
 */
#define vec_insert(_a, _b, _element)	spu_insert(_a, _b, _element)

/* vec_lvlx (load vector left indexed)
 * ========
 */
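/* SPU quadword loads ignore the low four address bits, so the addressed
 * quadword is fetched and then shifted left by the byte misalignment of
 * (b + a).  The bytes from the effective address up to the end of its
 * quadword therefore end up left justified in the result; the remaining
 * bytes are zero.
 */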
static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}


/* vec_lvlxl (load vector left indexed last)
 * =========
 */
#define vec_lvlxl(_a, _b)	vec_lvlx(_a, _b)


/* vec_lvrx (load vector right indexed)
 * ========
 */
static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}



/* vec_lvrxl (load vector right indexed last)
 * =========
 */
#define vec_lvrxl(_a, _b)	vec_lvrx(_a, _b)

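/* Illustrative use (not part of the original header): on the SPU the
 * left/right indexed loads can be combined to emulate a single unaligned
 * 16-byte load, e.g.
 *
 *   vec_uchar16 load_unaligned(unsigned char *ptr)
 *   {
 *     return (spu_or(vec_lvlx(0, ptr), vec_lvrx(16, ptr)));
 *   }
 *
 * vec_lvlx supplies the bytes from ptr to the end of its quadword and
 * vec_lvrx supplies the remaining bytes from the following quadword.
 */
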

/* vec_promote (promote scalar to a vector)
 * ===========
 */
#define vec_promote(_a, _element)	spu_promote(_a, _element)


/* vec_splats (splat scalar to a vector)
 * ==========
 */
#define vec_splats(_a)	spu_splats(_a)


/* vec_stvlx (store vector left indexed)
 * =========
 */
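/* The source vector is rotated right by the byte misalignment of (c + b)
 * and merged into the existing quadword under a mask built the same way,
 * so only the bytes from the effective address to the end of that quadword
 * are modified.
 */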
static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvlxl (store vector left indexed last)
 * ==========
 */
#define vec_stvlxl(_a, _b, _c)	vec_stvlx(_a, _b, _c)


/* vec_stvrx (store vector right indexed)
 * =========
 */
static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvrxl (store vector right indexed last)
 * ==========
 */
#define vec_stvrxl(_a, _b, _c)	vec_stvrx(_a, _b, _c)

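/* Illustrative use (not part of the original header): the left and right
 * indexed stores can be paired to emulate a store to an unaligned address,
 * e.g.
 *
 *   void store_unaligned(vec_uchar16 v, unsigned char *ptr)
 *   {
 *     vec_stvlx(v, 0, ptr);   -- bytes that land in the first quadword
 *     vec_stvrx(v, 16, ptr);  -- remaining bytes in the following quadword
 *   }
 *
 * Each call performs a read-modify-write of one whole quadword, so the
 * combined store is not atomic.
 */
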

#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */