/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine whether the results are
   acceptable and to make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   important differences in data format and in the placement of double
   scalars within the vector register.

   In the PowerISA, a scalar double lives in the FPRs (the leftmost
   64 bits of the low 32 VSRs), while X86_64 SSE2 uses the rightmost
   64 bits of the XMM register.  These differences require extra steps
   on POWER to match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend that applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
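
/* As an illustrative (hypothetical) sketch of the rewrite suggested above,
   a scalar SSE2 sequence such as:

     __m128d __t = _mm_add_sd (_mm_load_sd (&__x), _mm_load_sd (&__y));
     _mm_store_sd (&__r, __t);

   is equivalent to the plain C statement:

     __r = __x + __y;

   which is portable and lets the compiler choose the best scalar or
   vector code for the target.  */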

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
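/* For example, _MM_SHUFFLE2 (1, 0) yields the mask value 2; with the Intel
   semantics implemented by _mm_shuffle_pd below, that selects element 0 of
   the first operand for the low double and element 1 of the second operand
   for the high double.  */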

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df __result = (__v2df) __A;
  __result [0] = ((__v2df) __B)[0];
  return (__m128d) __result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with all two elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}
/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (vec_sqrt (__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __c;
  __c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (vec_min (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_min (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (vec_max (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_max (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  __v2df __temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
  return ((__m128d)vec_nor (__temp, __temp));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

#if _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq (__A, __A);
  d = (__v2du)vec_cmpeq (__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(c, d));
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Convert to true ('1's) where the element is NAN.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd(__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to insure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

#if _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
               short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
              char __q11, char __q10, char __q09, char __q08,
              char __q07, char __q06, char __q05, char __q04,
              char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
                short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
               char __q04, char __q05, char __q06, char __q07,
               char __q08, char __q09, char __q10, char __q11,
               char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load a 128-bit value from P.  The address must be 16-byte aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  __v2di __val;
  /* For LE we need a Vector Unpack Low Signed Word, which
     vec_unpackh generates.  */
  __val = (__v2di)vec_unpackh ((__v4si)__A);

  return (__m128d)vec_ctf (__val, 0);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df __rounded = vec_rint (__A);
  __v4si __result, __temp;
  const __v4si __vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__rounded)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
                                   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return (__m128i) __result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64) __result[0];
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4sf) vec_vpkudum ((__vector long long) __temp,
                                   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
                                   (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif

  return ((__m128i) __result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvttpd_epi32 (__A);

  return (__m64) __result[0];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats (__A);
  __tmp2 = (__v2di)vec_unpackl (__temp);
  __result = vec_ctf ((__vector signed long long) __tmp2, 0);
  return (__m128d)__result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf) __A);
  __result = vec_cts (__rounded, 0);
  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si __result;

  __result = vec_cts ((__v4sf) __A, 0);
  return (__m128i) __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi (__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi (__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = vec_vmrghw (__a, __a);
#endif
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (__result)
      : "wa" (__temp)
      : );
  return (__m128d) __result;
#endif
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  __v2df __rounded = vec_rint((__v2df) __A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  __v2df __rounded = vec_rint ((__v2df) __A);
  long long __result = ((__v2df) __rounded)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return _mm_cvtsd_si64 ((__v2df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return _mm_cvttsd_si64 (__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi (__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (__temp_s)
      : "wa" (__temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1);
#else
  __result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return _mm_cvtsi64_sd (__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf __temp = vec_splat ((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (__res)
      : "wa" (__temp)
      : );
  return (__m128d) vec_mergel (__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res [0] = ((__v4sf)__B) [0];
  return (__m128d) __res;
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__A, __B, 1);
#endif
  else
    __result = vec_mergel (__A, __B);

  return __result;
}
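
/* For reference, the four possible 2-bit mask values select:
     mask 0 -> { __A[0], __B[0] }   (vec_mergeh)
     mask 1 -> { __A[1], __B[0] }
     mask 2 -> { __A[0], __B[1] }
     mask 3 -> { __A[1], __B[1] }   (vec_mergel)
   matching the Intel _mm_shuffle_pd element selection.  */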

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
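/* Implementation note (not part of the Intel API): the permute control
   below picks out bit 0 and bit 64 of the VSR, i.e. the sign bits of the
   two doublewords; vec_vbpermq gathers them into one doubleword of the
   result, which is returned as the 2-bit mask.  The two constants and the
   final element select account for endianness.  */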
1237 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd(__m128d __A)1238 _mm_movemask_pd (__m128d __A)
1239 {
1240 __vector unsigned long long __result;
1241 static const __vector unsigned int __perm_mask =
1242 {
1243 #ifdef __LITTLE_ENDIAN__
1244 0x80800040, 0x80808080, 0x80808080, 0x80808080
1245 #else
1246 0x80808080, 0x80808080, 0x80808080, 0x80804000
1247 #endif
1248 };
1249
1250 __result = ((__vector unsigned long long)
1251 vec_vbpermq ((__vector unsigned char) __A,
1252 (__vector unsigned char) __perm_mask));
1253
1254 #ifdef __LITTLE_ENDIAN__
1255 return __result[1];
1256 #else
1257 return __result[0];
1258 #endif
1259 }
1260 #endif /* _ARCH_PWR8 */
1261
1262 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16(__m128i __A,__m128i __B)1263 _mm_packs_epi16 (__m128i __A, __m128i __B)
1264 {
1265 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1266 }
1267
1268 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32(__m128i __A,__m128i __B)1269 _mm_packs_epi32 (__m128i __A, __m128i __B)
1270 {
1271 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1272 }
1273
1274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16(__m128i __A,__m128i __B)1275 _mm_packus_epi16 (__m128i __A, __m128i __B)
1276 {
1277 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1278 }
1279
1280 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8(__m128i __A,__m128i __B)1281 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1282 {
1283 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1284 }
1285
1286 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16(__m128i __A,__m128i __B)1287 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1288 {
1289 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1290 }
1291
1292 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32(__m128i __A,__m128i __B)1293 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1294 {
1295 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1296 }
1297
1298 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64(__m128i __A,__m128i __B)1299 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1300 {
1301 return (__m128i) vec_mergel ((__vector long long) __A,
1302 (__vector long long) __B);
1303 }
1304
1305 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8(__m128i __A,__m128i __B)1306 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1307 {
1308 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1309 }
1310
1311 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16(__m128i __A,__m128i __B)1312 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1313 {
1314 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1315 }
1316
1317 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32(__m128i __A,__m128i __B)1318 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1319 {
1320 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1321 }
1322
1323 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64(__m128i __A,__m128i __B)1324 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1325 {
1326 return (__m128i) vec_mergeh ((__vector long long) __A,
1327 (__vector long long) __B);
1328 }
1329
1330 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8(__m128i __A,__m128i __B)1331 _mm_add_epi8 (__m128i __A, __m128i __B)
1332 {
1333 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1334 }
1335
1336 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16(__m128i __A,__m128i __B)1337 _mm_add_epi16 (__m128i __A, __m128i __B)
1338 {
1339 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1340 }
1341
1342 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32(__m128i __A,__m128i __B)1343 _mm_add_epi32 (__m128i __A, __m128i __B)
1344 {
1345 return (__m128i) ((__v4su)__A + (__v4su)__B);
1346 }
1347
1348 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64(__m128i __A,__m128i __B)1349 _mm_add_epi64 (__m128i __A, __m128i __B)
1350 {
1351 return (__m128i) ((__v2du)__A + (__v2du)__B);
1352 }
1353
1354 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8(__m128i __A,__m128i __B)1355 _mm_adds_epi8 (__m128i __A, __m128i __B)
1356 {
1357 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1358 }
1359
1360 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16(__m128i __A,__m128i __B)1361 _mm_adds_epi16 (__m128i __A, __m128i __B)
1362 {
1363 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1364 }
1365
1366 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8(__m128i __A,__m128i __B)1367 _mm_adds_epu8 (__m128i __A, __m128i __B)
1368 {
1369 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1370 }
1371
1372 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16(__m128i __A,__m128i __B)1373 _mm_adds_epu16 (__m128i __A, __m128i __B)
1374 {
1375 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1376 }
1377
1378 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8(__m128i __A,__m128i __B)1379 _mm_sub_epi8 (__m128i __A, __m128i __B)
1380 {
1381 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1382 }
1383
1384 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16(__m128i __A,__m128i __B)1385 _mm_sub_epi16 (__m128i __A, __m128i __B)
1386 {
1387 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1388 }
1389
1390 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32(__m128i __A,__m128i __B)1391 _mm_sub_epi32 (__m128i __A, __m128i __B)
1392 {
1393 return (__m128i) ((__v4su)__A - (__v4su)__B);
1394 }
1395
1396 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64(__m128i __A,__m128i __B)1397 _mm_sub_epi64 (__m128i __A, __m128i __B)
1398 {
1399 return (__m128i) ((__v2du)__A - (__v2du)__B);
1400 }
1401
1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8(__m128i __A,__m128i __B)1403 _mm_subs_epi8 (__m128i __A, __m128i __B)
1404 {
1405 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1406 }
1407
1408 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16(__m128i __A,__m128i __B)1409 _mm_subs_epi16 (__m128i __A, __m128i __B)
1410 {
1411 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1412 }
1413
1414 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8(__m128i __A,__m128i __B)1415 _mm_subs_epu8 (__m128i __A, __m128i __B)
1416 {
1417 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1418 }
1419
1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16(__m128i __A,__m128i __B)1421 _mm_subs_epu16 (__m128i __A, __m128i __B)
1422 {
1423 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1424 }
1425
1426 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16(__m128i __A,__m128i __B)1427 _mm_madd_epi16 (__m128i __A, __m128i __B)
1428 {
1429 __vector signed int __zero = {0, 0, 0, 0};
1430
1431 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, __zero);
1432 }
1433
1434 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16(__m128i __A,__m128i __B)1435 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1436 {
1437 __vector signed int __w0, __w1;
1438
1439 __vector unsigned char __xform1 = {
1440 #ifdef __LITTLE_ENDIAN__
1441 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1442 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1443 #else
1444 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1445 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1446 #endif
1447 };
1448
1449 __w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1450 __w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1451 return (__m128i) vec_perm (__w0, __w1, __xform1);
1452 }

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8 || !defined (_ARCH_PWR8)
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__(
      "vmulouw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__(
      "vmuleuw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#endif
  return (__m128i) __result;
#else
  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}
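
/* _mm_mul_epu32 multiplies the low 32-bit element of each 64-bit lane,
   producing two 64-bit products.  Those elements sit in the "odd" word
   positions on little-endian and the "even" positions on big-endian,
   hence the vmulouw/vmuleuw split.  The inline asm path is a fallback
   for configurations where vec_mule on word elements is not available;
   newer compilers built for POWER8 take the vec_mule branch.  */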

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu __lshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
	__lshift = (__v8hu) vec_splat_s16(__B);
      else
	__lshift = vec_splats ((unsigned short) __B);

      __result = vec_sl ((__v8hi) __A, __lshift);
    }

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su __lshift;
  __v4si __result = { 0, 0, 0, 0 };

  if (__B >= 0 && __B < 32)
    {
      if (__builtin_constant_p(__B) && __B < 16)
	__lshift = (__v4su) vec_splat_s32(__B);
      else
	__lshift = vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v4si) __A, __lshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du __lshift;
  __v2di __result = { 0, 0 };

  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B) && __B < 16)
	__lshift = (__v2du) vec_splat_s32(__B);
      else
	__lshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v2di) __A, __lshift);
    }

  return (__m128i) __result;
}
#endif
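
/* For the immediate shift-left family above, the scalar count is
   splatted into every element so vec_sl can be used.  SSE2 semantics
   require a zero result when the count is out of range, hence the
   zero-initialized __result and the range check.  When the count is a
   small compile-time constant, vec_splat_s16/vec_splat_s32 lets the
   compiler use a splat-immediate instruction instead of loading the
   count into a vector register.  Illustrative (hypothetical) use:
   _mm_slli_epi32 (_mm_set1_epi32 (1), 3) gives 8 in every element.  */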

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi __result;

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);
    }
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su __rshift = { 31, 31, 31, 31 };
  __v4si __result;

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v4su) vec_splat_s32(__B);
	  else
	    __rshift = (__v4su) vec_splats((unsigned int)__B);
	}
      else
	__rshift = vec_splats ((unsigned int) __B);
    }
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    __result = vec_sld ((__v16qu) __A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
	 Immediate here to use the immediate form and avoid
	 load of __N * 8 value into a separate VR.  */
      __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N));
    else
#endif
      {
	__v16qu __shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
	__result = vec_sro ((__v16qu)__A, __shift);
#else
	__result = vec_slo ((__v16qu)__A, __shift);
#endif
      }
  else
    __result = __zeros;

  return (__m128i) __result;
}
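
/* The byte-wise shifts above move the whole 128-bit value by __N
   bytes.  When the count is a compile-time constant, vec_sld can
   concatenate __A with a zero vector and extract the shifted 16 bytes
   directly.  Otherwise the count is converted to bits (__N * 8) and
   splatted so the variable-count octet shifts (vec_slo/vec_sro) can
   consume it; which direction is needed depends on endianness, since
   the Intel byte order is reversed relative to big-endian element
   numbering.  */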

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return _mm_bsrli_si128 (__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld ((__v16qu) __A, __zeros, _imm5);
#else
    __result = vec_sld (__zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);

      __result = vec_sr ((__v8hi) __A, __rshift);
    }

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  __v4su __rshift;
  __v4si __result = { 0, 0, 0, 0 };

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v4su) vec_splat_s32(__B);
	  else
	    __rshift = (__v4su) vec_splats((unsigned int)__B);
	}
      else
	__rshift = vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v4si) __A, __rshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du __rshift;
  __v2di __result = { 0, 0 };

  if (__B < 64)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v2du) vec_splat_s32(__B);
	  else
	    __rshift = (__v2du) vec_splats((unsigned long long)__B);
	}
      else
	__rshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v2di) __A, __rshift);
    }

  return (__m128i) __result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v8hu) __B, 0);
#else
  __lshift = vec_splat ((__v8hu) __B, 3);
#endif
  __shmask = vec_cmple (__lshift, __shmax);
  __result = vec_sl ((__v8hu) __A, __lshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v4su) __B, 0);
#else
  __lshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v4su) __A, __lshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __lshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v2du) __A, __lshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif
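
/* The _mm_sll_* forms take the shift count from the low 64 bits of
   __B rather than from an immediate.  The count is splatted from the
   appropriate lane (lane 0 on little-endian), compared against the
   element width to form an all-ones/all-zeros mask, and vec_sel then
   forces the result to zero whenever the count is too large, matching
   the SSE2 rule that an over-large count yields zero.  Note the zero
   source of the vec_sel is the mask itself, which is all zeros exactly
   when the result must be cleared.  */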

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu __rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __rshift;
  __v8hi __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu)__B, 0);
#else
  __rshift = vec_splat ((__v8hu)__B, 3);
#endif
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su __rshmax = { 31, 31, 31, 31 };
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su)__B, 0);
#else
  __rshift = vec_splat ((__v4su)__B, 1);
#endif
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}
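
/* For the arithmetic right shifts the count is clamped with vec_min
   instead of being masked to zero: shifting in sign bits by the
   element width or more produces the same value as shifting by
   width - 1, so clamping reproduces the SSE2 behavior for over-large
   counts.  */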

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu) __B, 0);
#else
  __rshift = vec_splat ((__v8hu) __B, 3);
#endif
  __shmask = vec_cmple (__rshift, __shmax);
  __result = vec_sr ((__v8hu) __A, __rshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su) __B, 0);
#else
  __rshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v4su) __A, __rshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __rshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v2du) __A, __rshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi __result = (__v8hi)__A;

  __result [(__N & 7)] = __D;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}


#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bits of the 16
   signed or unsigned 8-bit integers in A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask =
    {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  __result = ((__vector unsigned long long)
	      vec_vbpermq ((__vector unsigned char) __A,
			   (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */
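
/* vec_vbpermq gathers one bit from each byte of __A, selected by the
   bit indices in __perm_mask (0x78, 0x70, ... are the positions of
   each byte's most significant bit, listed so the Intel bit order is
   preserved), and deposits the 16 collected bits into one doubleword
   of the result.  Illustrative (hypothetical) use:
   _mm_movemask_epi8 (_mm_set1_epi8 (-1)) returns 0xFFFF.  */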

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
    { 0x1716151413121110UL, 0UL};
#else
    { 0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
    { 0UL, 0x1f1e1d1c1b1a1918UL};
#else
    { 0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i) vec_perm ((__v4si) __A, (__v4si) __A,
			     (__vector unsigned char) __t);
}
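
/* The three shuffle intrinsics above decode the Intel 8-bit immediate
   two bits at a time, look up the byte offsets of the selected source
   element in __permute_selectors, and assemble a vec_perm control
   vector from them.  For _mm_shufflehi_epi16 and _mm_shufflelo_epi16
   the preset half of the control (the identity patterns 0x10..0x17 or
   0x18..0x1f) passes the untouched half of __A through unchanged.
   Illustrative (hypothetical) use: _mm_shuffle_epi32 (__x, 0x1B)
   reverses the four 32-bit elements of __x.  */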

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du __hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u*)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel (__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128 (__p, (__m128i)__tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}


extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __a, __b;
  __v16qu __vmin, __vmax, __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = { 0, 0, 0, 0 };
  __v4si __result;

  __a = (__v16qu) __A;
  __b = (__v16qu) __B;
  __vmin = vec_min (__a, __b);
  __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s (__vsum, (__vector signed int) __zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  __result = vec_sld (__result, __result, 4);
#else
  __result = vec_sld (__result, __result, 6);
#endif
  return (__m128i) __result;
}
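
/* _mm_sad_epu8 computes |__A - __B| per byte as max - min, sums each
   group of four byte differences into a word with vec_sum4s, adds
   pairs of words with vec_sum2s, and finally rotates the two partial
   sums into the low 16 bits of each 64-bit half, as SSE2 expects.
   Illustrative (hypothetical) check: if every byte of __A is 2 and
   every byte of __B is 0, each 64-bit half of the result holds 16.  */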

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}
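
/* Unlike the x86 non-temporal stores, these sequences perform ordinary
   cacheable stores; dcbtstt only hints to the processor that the
   target cache block will be stored to and is transient, so it may be
   evicted early.  The behavior is therefore correct, but the
   cache-bypass effect of the MOVNT* instructions is not reproduced.  */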

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}
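
/* The fences are mapped onto the GCC atomic thread fences: on POWER a
   release fence is typically emitted as lwsync (light-weight sync) and
   a sequentially consistent fence as sync (heavy-weight sync).  For
   ordinary loads and stores these are at least as strong as the
   ordering LFENCE and MFENCE provide, though the exact instruction
   chosen is up to the compiler.  */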

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps (__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128 (__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd (__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128 (__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps (__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd (__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */