/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   important differences in the data format and placement of double
   scalars in the vector register.

   For PowerISA, scalar double is held in FPRs (the leftmost 64 bits
   of the low 32 VSRs), while X86_64 SSE2 uses the rightmost 64 bits
   of the XMM register.  These differences require extra steps on POWER
   to match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend that applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df result = (__v2df) __A;
  result [0] = ((__v2df) __B)[0];
  return (__m128d) result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
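  /* Swap the two doublewords: with both inputs the same vector,
     xxpermdi selector 2 returns { __tmp[1], __tmp[0] }.  */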
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (vec_sqrt (__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df c;
  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (vec_min (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = vec_min (a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (vec_max (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = vec_max (a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
  return ((__m128d)vec_nor (temp, temp));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq (__A, __A);
  d = (__v2du)vec_cmpeq (__B, __B);
#else
  __v2du a, b;
  __v2du c, d;
  const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
  a = (__v2du)vec_abs ((__v2df)__A);
  b = (__v2du)vec_abs ((__v2df)__B);
  c = (__v2du)vec_cmpgt (double_exp_mask, a);
  d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(c, d));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert the results so that true ('1's) marks a NAN.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data from P.  The address must be 16-byte aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  __v2di val;
  /* For LE this needs the Vector Unpack Low Signed Word instruction,
     which is what vec_unpackh generates on little-endian.  */
  val = (__v2di)vec_unpackh ((__v4si)__A);

  return (__m128d)vec_ctf (val, 0);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df rounded = vec_rint (__A);
  __v4si result, temp;
  const __v4si vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (temp)
      : "wa" (rounded)
      : );

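  /* xvcvdpsxws leaves one 32-bit result in each doubleword; the merge
     and pack (or the permute below) gather the two results into the
     low half of the vector, with zeros in the remaining elements.  */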
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  temp = vec_mergeo (temp, temp);
#else
  temp = vec_mergee (temp, temp);
#endif
  result = (__v4si) vec_vpkudum ((__vector long long) temp,
				 (__vector long long) vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif
  return (__m128i) result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i result = _mm_cvtpd_epi32(__A);

  return (__m64) result[0];
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf result;
  __v4si temp;
  const __v4si vzero = { 0, 0, 0, 0 };

  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  temp = vec_mergeo (temp, temp);
#else
  temp = vec_mergee (temp, temp);
#endif
  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
				 (__vector long long) vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif
  return ((__m128)result);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si result;
  __v4si temp;
  const __v4si vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  temp = vec_mergeo (temp, temp);
#else
  temp = vec_mergee (temp, temp);
#endif
  result = (__v4si) vec_vpkudum ((__vector long long) temp,
				 (__vector long long) vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif

  return ((__m128i) result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i result = _mm_cvttpd_epi32 (__A);

  return (__m64) result[0];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  __v4si temp;
  __v2di tmp2;
  __v2df result;

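  /* Splat the 64-bit __m64 value across the vector, sign-extend its
     two 32-bit halves to doublewords, then convert them to double.  */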
  temp = (__v4si)vec_splats (__A);
  tmp2 = (__v2di)vec_unpackl (temp);
  result = vec_ctf ((__vector signed long long) tmp2, 0);
  return (__m128d)result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf rounded;
  __v4si result;

  rounded = vec_rint((__v4sf) __A);
  result = vec_cts (rounded, 0);
  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si result;

  result = vec_cts ((__v4sf) __A, 0);
  return (__m128i) result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is not current, so we need to generate the
     equivalent code.  */
  __v4sf a = (__v4sf)__A;
  __v4sf temp;
  __v2df result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  temp = __builtin_vsx_xxsldwi (a, a, 3);
  temp = __builtin_vsx_xxsldwi (a, temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a
     merge-high to get the elements lined up.  */
  temp = vec_vmrghw (a, a);
#endif
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (result)
      : "wa" (temp)
      : );
  return (__m128d) result;
#endif
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
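  /* Round to an integral value using the current rounding mode, then
     convert the (now exact) value to int.  */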
  __v2df rounded = vec_rint((__v2df) __A);
  int result = ((__v2df)rounded)[0];

  return result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  __v2df rounded = vec_rint ((__v2df) __A);
  long long result = ((__v2df) rounded)[0];

  return result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return _mm_cvtsd_si64 ((__v2df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
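  /* The C conversion from double to int truncates toward zero, which
     matches the truncating (cvtt) semantics.  */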
  int result = ((__v2df)__A)[0];

  return result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  long long result = ((__v2df)__A)[0];

  return result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return _mm_cvttsd_si64 (__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  result = __builtin_vsx_xxsldwi (result, result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (temp_s)
      : "wa" (temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].  */
  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
#else
  result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df result = (__v2df)__A;
  double db = __B;
  result [0] = db;
  return (__m128d)result;
}

/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  __v2df result = (__v2df)__A;
  double db = __B;
  result [0] = db;
  return (__m128d)result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return _mm_cvtsi64_sd (__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf temp = vec_splat ((__v4sf)__B, 0);
  __v2df res;
  /* Convert single float scalar to double in a vector.  */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (res)
      : "wa" (temp)
      : );
  return (__m128d) vec_mergel (res, (__v2df)__A);
#else
  __v2df res = (__v2df)__A;
  res [0] = ((__v4sf)__B) [0];
  return (__m128d) res;
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd (__m128d __A, __m128d __B, const int __mask)
{
  __vector double result;
  const int litmsk = __mask & 0x3;

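  /* Bit 0 of litmsk selects the lower result element from __A and
     bit 1 selects the upper result element from __B; xxpermdi does
     the doubleword selection in a single instruction.  */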
  if (litmsk == 0)
    result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
  else if (litmsk == 1)
    result = vec_xxpermdi (__B, __A, 2);
  else if (litmsk == 2)
    result = vec_xxpermdi (__B, __A, 1);
#else
  else if (litmsk == 1)
    result = vec_xxpermdi (__A, __B, 2);
  else if (litmsk == 2)
    result = vec_xxpermdi (__A, __B, 1);
#endif
  else
    result = vec_mergel (__A, __B);

  return result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df result = (__v2df)__A;
  result [1] = *__B;
  return (__m128d)result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df result = (__v2df)__A;
  result [0] = *__B;
  return (__m128d)result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  __vector unsigned long long result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
    };

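  /* vbpermq gathers the bits selected by perm_mask (the sign bits of
     the two doublewords) into one halfword of the result, producing
     the 2-bit movemask; bytes of 0x80 select a constant zero.  */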
  result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__vector long long) __A,
			       (__vector long long) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__vector long long) __A,
			       (__vector long long) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int zero = {0, 0, 0, 0};

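  /* vmsumshm multiplies corresponding signed halfwords and adds each
     pair of adjacent 32-bit products to the accumulator; with a zero
     accumulator this yields exactly the pairwise sums madd requires.  */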
  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int w0, w1;

  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

1455   w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1456   w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1457   return (__m128i) vec_perm (w0, w1, xform1);
1458 }
1459 
1460 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16(__m128i __A,__m128i __B)1461 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1462 {
1463     return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1464 }
1465 
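/* Both __m64 operands are truncated to their low 32 bits before the
   widening multiply, so a single 32x32->64-bit product is formed.  */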
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  unsigned int a = __A;
  unsigned int b = __B;

  return ((__m64)a * (__m64)b);
}

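/* Multiplies the low 32 bits of each 64-bit element.  For GCC versions
   before 8 the vmulouw/vmuleuw instructions are issued via inline asm
   (the word form of vec_mule appears not to be usable there); newer
   compilers use the vec_mule built-in.  The LE/BE difference selects
   the words that hold the low half of each 64-bit element.  */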
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8
  __v2du result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__(
      "vmulouw %0,%1,%2"
      : "=v" (result)
      : "v" (__A), "v" (__B)
      : );
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__(
      "vmuleuw %0,%1,%2"
      : "=v" (result)
      : "v" (__A), "v" (__B)
      : );
#endif
  return (__m128i) result;
#else
  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}

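/* For the immediate-count shifts below, a count outside the range
   0 .. (element width - 1) produces an all-zero result, matching the
   SSE2 definition, rather than the modulo behavior of the underlying
   VMX shift instructions.  */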
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu lshift;
  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
        lshift = (__v8hu) vec_splat_s16(__B);
      else
        lshift = vec_splats ((unsigned short) __B);

      result = vec_sl ((__v8hi) __A, lshift);
    }

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su lshift;
  __v4si result = { 0, 0, 0, 0 };

  if (__B >= 0 && __B < 32)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        lshift = (__v4su) vec_splat_s32(__B);
      else
        lshift = vec_splats ((unsigned int) __B);

      result = vec_sl ((__v4si) __A, lshift);
    }

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du lshift;
  __v2di result = { 0, 0 };

  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        lshift = (__v2du) vec_splat_s32(__B);
      else
        lshift = (__v2du) vec_splats ((unsigned int) __B);

      result = vec_sl ((__v2di) __A, lshift);
    }

  return (__m128i) result;
}
#endif

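/* Arithmetic right shifts: counts of 16 (or 32) and above are clamped
   to the element width minus one, so out-of-range counts replicate the
   sign bit across the element, as SSE2 requires.  */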
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi result;

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
        rshift = (__v8hu) vec_splat_s16(__B);
      else
        rshift = vec_splats ((unsigned short) __B);
    }
  result = vec_sra ((__v8hi) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su rshift = { 31, 31, 31, 31 };
  __v4si result;

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v4su) vec_splat_s32(__B);
          else
            rshift = (__v4su) vec_splats((unsigned int)__B);
        }
      else
        rshift = vec_splats ((unsigned int) __B);
    }
  result = vec_sra ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    result = vec_sld ((__v16qu) __A, zeros, __N);
  else
    result = zeros;

  return (__m128i) result;
}

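/* Shifts the full 128-bit value right by __N bytes; a count of 16 or
   more clears the register.  Note that "right" here is in terms of the
   SSE2 (little-endian) byte order, hence the endian-specific paths.  */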
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR.  */
      result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
    else
#endif
      {
        __v16qu shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
        result = vec_sro ((__v16qu)__A, shift);
#else
        result = vec_slo ((__v16qu)__A, shift);
#endif
      }
  else
    result = zeros;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return _mm_bsrli_si128 (__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    result = vec_sld ((__v16qu) __A, zeros, _imm5);
#else
    result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    result = zeros;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  __v8hu rshift;
  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
        rshift = (__v8hu) vec_splat_s16(__B);
      else
        rshift = vec_splats ((unsigned short) __B);

      result = vec_sr ((__v8hi) __A, rshift);
    }

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  __v4su rshift;
  __v4si result = { 0, 0, 0, 0 };

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v4su) vec_splat_s32(__B);
          else
            rshift = (__v4su) vec_splats((unsigned int)__B);
        }
      else
        rshift = vec_splats ((unsigned int) __B);

      result = vec_sr ((__v4si) __A, rshift);
    }

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du rshift;
  __v2di result = { 0, 0 };

  if (__B < 64)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v2du) vec_splat_s32(__B);
          else
            rshift = (__v2du) vec_splats((unsigned long long)__B);
        }
      else
        rshift = (__v2du) vec_splats ((unsigned int) __B);

      result = vec_sr ((__v2di) __A, rshift);
    }

  return (__m128i) result;
}
#endif

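/* Shifts with the count taken from the low element of __B.  The count
   is splatted across the vector; a compare/select pair then forces
   elements to zero whenever the count exceeds the element width, which
   the VMX shift instructions (count modulo width) would not do.  */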
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu lshift;
  __vector __bool short shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v8hu) __B, 0);
#else
  lshift = vec_splat ((__v8hu) __B, 3);
#endif
  shmask = vec_cmple (lshift, shmax);
  result = vec_sl ((__v8hu) __A, lshift);
  result = vec_sel ((__v8hu) shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su lshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;
#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v4su) __B, 0);
#else
  lshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (lshift, shmax);
  result = vec_sl ((__v4su) __A, lshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du lshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  lshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (lshift, shmax);
  result = vec_sl ((__v2du) __A, lshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu rshift;
  __v8hi result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 0);
#else
  rshift = vec_splat ((__v8hu)__B, 3);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_sra ((__v8hi) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su rshmax = { 31, 31, 31, 31 };
  __v4su rshift;
  __v4si result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 0);
#else
  rshift = vec_splat ((__v4su)__B, 1);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_sra ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu rshift;
  __vector __bool short shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu) __B, 0);
#else
  rshift = vec_splat ((__v8hu) __B, 3);
#endif
  shmask = vec_cmple (rshift, shmax);
  result = vec_sr ((__v8hu) __A, rshift);
  result = vec_sel ((__v8hu) shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su rshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su) __B, 0);
#else
  rshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v4su) __A, rshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v2du) __A, rshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

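/* Element extract/insert use GNU C vector subscripting directly; the
   selector is masked so out-of-range values wrap instead of indexing
   outside the vector.  */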
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}


#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bit of each 8-bit
   element in __A.  */
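/* The bit-permute doubleword (vbpermq) instruction gathers one bit per
   control byte from __A; the selectors in perm_mask pick the most
   significant bit of each byte, and the assembled 16-bit mask lands in
   one doubleword half of the result, hence the endian-specific element
   index below.  */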
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long result;
  static const __vector unsigned char perm_mask =
    {
        0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
        0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

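/* The three shuffle intrinsics below decode the 8-bit immediate into
   four 2-bit element selectors, build a vec_perm control vector from a
   small lookup table, and permute __A against itself.  For the 16-bit
   variants only the affected half of the control vector is rewritten;
   the other half is an identity permute.  */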
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
        0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
        0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL,  0UL};
#else
      { 0x1011121314151617UL,  0UL};
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
  pmask[1] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
        0x0100, 0x0302, 0x0504, 0x0706
#else
        0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0UL,  0x1f1e1d1c1b1a1918UL};
#else
      { 0UL,  0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
  pmask[0] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
        0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
        0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}

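/* Emulated as an unaligned load, a byte select under the mask (bytes
   whose sign bit is set in __B come from __A), and an unaligned store.
   Unlike the x86 instruction this rewrites all 16 bytes at __C and has
   no non-temporal behavior.  */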
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i_u *p = (__m128i_u*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

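/* Sum of absolute differences: |a - b| per byte is formed as
   max (a, b) - min (a, b), the byte differences are summed into four
   words by vec_sum4s, and vec_sum2s reduces those to two sums; the
   final vec_sld rotates the two sums into the positions the SSE2
   result expects.  */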
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#else
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}

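/* PowerISA has no direct equivalent of the x86 non-temporal stores.
   The stream intrinsics below issue a "data cache block touch for
   store transient" hint (dcbtstt) for the target line and then perform
   a normal store.  */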
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
}

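/* The x86 fences are approximated with atomic thread fences: a release
   fence for _mm_lfence and a sequentially consistent fence for
   _mm_mfence, which on PowerPC typically map to lwsync and hwsync
   respectively.  */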
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */