/* Copyright (C) 2003-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level. Also there are
   important differences for data format and placement of double
   scalars in the vector register.
   For PowerISA, scalar double is held in FPRs (the left-most 64 bits
   of the low 32 VSRs), while X86_64 SSE2 uses the right-most 64 bits
   of the XMM register.  These differences require extra steps on POWER
   to match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
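
/* For example, rather than manipulating the MXCSR rounding-mode bits
   directly, portable code can save, change, and restore the rounding
   mode with the C99/POSIX <fenv.h> interfaces.  A minimal sketch, not
   part of this header's API:

     #include <fenv.h>
     int save = fegetround ();
     fesetround (FE_TOWARDZERO);
     ...
     fesetround (save);
   */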

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two value permute mask */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
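/* E.g. _MM_SHUFFLE2 (1, 0) == 2 directs _mm_shuffle_pd (below) to take
   element 0 of its first operand and element 1 of its second.  */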

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df result = (__v2df) __A;
  result [0] = ((__v2df) __B)[0];
  return (__m128d) result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with all two elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
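  /* Swap the two doublewords to reverse the element order.  */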
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (vec_sqrt (__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df c;
  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (vec_min (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = vec_min (a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (vec_max (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = vec_max (a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
  return ((__m128d)vec_nor (temp, temp));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq (__A, __A);
  d = (__v2du)vec_cmpeq (__B, __B);
#else
  __v2du a, b;
  __v2du c, d;
  const __v2du double_exp_mask  = {0x7ff0000000000000, 0x7ff0000000000000};
  a = (__v2du)vec_abs ((__v2df)__A);
  b = (__v2du)vec_abs ((__v2df)__B);
  c = (__v2du)vec_cmpgt (double_exp_mask, a);
  d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(c, d));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) indicates NAN.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  assert(((unsigned long )__P & 0xfUL) == 0UL);
  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  __v2di val;
  /* For LE we need the Vector Unpack Low Signed Word instruction,
     which vec_unpackh generates here.  */
  val = (__v2di)vec_unpackh ((__v4si)__A);

  return (__m128d)vec_ctf (val, 0);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df rounded = vec_rint (__A);
  __v4si result, temp;
  const __v4si vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (temp)
      : "wa" (rounded)
      : );

#ifdef _ARCH_PWR8
  temp = vec_mergeo (temp, temp);
  result = (__v4si)vec_vpkudum ((__vector long)temp, (__vector long)vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif
  return (__m128i) result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i result = _mm_cvtpd_epi32(__A);

  return (__m64) result[0];
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf result;
  __v4si temp;
  const __v4si vzero = { 0, 0, 0, 0 };

  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
  temp = vec_mergeo (temp, temp);
  result = (__v4sf)vec_vpkudum ((__vector long)temp, (__vector long)vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif
  return ((__m128)result);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si result;
  __v4si temp;
  const __v4si vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
  temp = vec_mergeo (temp, temp);
  result = (__v4si)vec_vpkudum ((__vector long)temp, (__vector long)vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif

  return ((__m128i) result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i result = _mm_cvttpd_epi32 (__A);

  return (__m64) result[0];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  __v4si temp;
  __v2di tmp2;
  __v2df result;

  temp = (__v4si)vec_splats (__A);
  tmp2 = (__v2di)vec_unpackl (temp);
  result = vec_ctf ((__vector signed long)tmp2, 0);
  return (__m128d)result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf rounded;
  __v4si result;

  rounded = vec_rint((__v4sf) __A);
  result = vec_cts (rounded, 0);
  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si result;

  result = vec_cts ((__v4sf) __A, 0);
  return (__m128i) result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is older and we need to generate the
     equivalent code inline.  */
  __v4sf a = (__v4sf)__A;
  __v4sf temp;
  __v2df result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  temp = __builtin_vsx_xxsldwi (a, a, 3);
  temp = __builtin_vsx_xxsldwi (a, temp, 2);
#elif __BIG_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a vector
     merge high word to get the elements lined up.  */
  temp = vec_vmrghw (a, a);
#endif
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (result)
      : "wa" (temp)
      : );
  return (__m128d) result;
#endif
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  __v2df rounded = vec_rint((__v2df) __A);
  int result = ((__v2df)rounded)[0];

  return result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  __v2df rounded = vec_rint ((__v2df) __A );
  long long result = ((__v2df) rounded)[0];

  return result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return _mm_cvtsd_si64 ((__v2df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  int result = ((__v2df)__A)[0];

  return result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  long long result = ((__v2df)__A)[0];

  return result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return _mm_cvttsd_si64 (__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  result = __builtin_vsx_xxsldwi (result, result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (temp_s)
      : "wa" (temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].  */
  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
#else
  result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df result = (__v2df)__A;
  double db = __B;
  result [0] = db;
  return (__m128d)result;
}

/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  __v2df result = (__v2df)__A;
  double db = __B;
  result [0] = db;
  return (__m128d)result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return _mm_cvtsi64_sd (__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert. */
  __v4sf temp = vec_splat ((__v4sf)__B, 0);
  __v2df res;
  /* Convert single float scalar to double in a vector.  */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (res)
      : "wa" (temp)
      : );
  return (__m128d) vec_mergel (res, (__v2df)__A);
#else
  __v2df res = (__v2df)__A;
  res [0] = ((__v4sf)__B) [0];
  return (__m128d) res;
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  __vector double result;
  const int litmsk = __mask & 0x3;

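  /* Mask bit 0 selects element 0 or 1 of __A for the low result
     element; mask bit 1 selects element 0 or 1 of __B for the high
     result element.  */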
  if (litmsk == 0)
    result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
  else if (litmsk == 1)
    result = vec_xxpermdi (__B, __A, 2);
  else if (litmsk == 2)
    result = vec_xxpermdi (__B, __A, 1);
#else
  else if (litmsk == 1)
    result = vec_xxpermdi (__A, __B, 2);
  else if (litmsk == 2)
    result = vec_xxpermdi (__A, __B, 1);
#endif
  else
    result = vec_mergel (__A, __B);

  return result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df result = (__v2df)__A;
  result [1] = *__B;
  return (__m128d)result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df result = (__v2df)__A;
  result [0] = *__B;
  return (__m128d)result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d  __A)
{
  __vector __m64 result;
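  /* The vec_vbpermq control below selects bit 0 of each doubleword
     (the IEEE sign bits); control bytes with the high bit set supply
     zero bits.  */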
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
	0x80800040, 0x80808080, 0x80808080, 0x80808080
#elif __BIG_ENDIAN__
      0x80808080, 0x80808080, 0x80808080, 0x80800040
#endif
    };

  result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
					 (__vector unsigned char) perm_mask);

#ifdef __LITTLE_ENDIAN__
  return result[1];
#elif __BIG_ENDIAN__
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__vector long)__A, (__vector long)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__vector long)__A, (__vector long)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int zero = {0, 0, 0, 0};

  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int w0, w1;

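  /* vec_vmulesh/vec_vmulosh give full 32-bit products of the even and
     odd halfword pairs; the permute below extracts the high 16 bits of
     each product and interleaves them back into element order.  */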
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
  w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
1451   unsigned int a = __A;
1452   unsigned int b = __B;
1453 
1454   return ((__m64)a * (__m64)b);
1455 }
1456 
1457 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32(__m128i __A,__m128i __B)1458 _mm_mul_epu32 (__m128i __A, __m128i __B)
1459 {
1460 #if __GNUC__ < 8
1461   __v2du result;
1462 
1463 #ifdef __LITTLE_ENDIAN__
1464   /* VMX Vector Multiply Odd Unsigned Word.  */
1465   __asm__(
1466       "vmulouw %0,%1,%2"
1467       : "=v" (result)
1468       : "v" (__A), "v" (__B)
1469       : );
1470 #elif __BIG_ENDIAN__
1471   /* VMX Vector Multiply Even Unsigned Word.  */
1472   __asm__(
1473       "vmuleuw %0,%1,%2"
1474       : "=v" (result)
1475       : "v" (__A), "v" (__B)
1476       : );
1477 #endif
1478   return (__m128i) result;
1479 #else
1480 #ifdef __LITTLE_ENDIAN__
1481   return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1482 #elif __BIG_ENDIAN__
1483   return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
1484 #endif
1485 #endif
1486 }
1487 
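/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that _mm_mul_epu32 multiplies the unsigned words in
   elements 0 and 2 into full 64-bit products; elements 1 and 3 are
   ignored.  */
static __inline int __attribute__ ((__unused__))
__example_mul_epu32 (void)
{
  __v4su a = { 0xFFFFFFFFu, 9u, 2u, 9u };
  __v4su b = { 2u, 9u, 0xFFFFFFFFu, 9u };
  __v2du p = (__v2du) _mm_mul_epu32 ((__m128i) a, (__m128i) b);
  /* Both products are 0xFFFFFFFF * 2 == 0x1FFFFFFFE.  */
  return p[0] == 0x1FFFFFFFEULL && p[1] == 0x1FFFFFFFEULL;
}
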
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu lshift;
  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
        lshift = (__v8hu) vec_splat_s16(__B);
      else
        lshift = vec_splats ((unsigned short) __B);

      result = vec_vslh ((__v8hi) __A, lshift);
    }

  return (__m128i) result;
}

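/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that shift counts of 16 or more clear the result,
   matching the X86 immediate shift semantics.  */
static __inline int __attribute__ ((__unused__))
__example_slli_epi16 (void)
{
  __v8hi a = { 3, 3, 3, 3, 3, 3, 3, 3 };
  __v8hi r = (__v8hi) _mm_slli_epi16 ((__m128i) a, 4);   /* 3 << 4 == 48.  */
  __v8hi z = (__v8hi) _mm_slli_epi16 ((__m128i) a, 16);  /* Count >= 16 gives 0.  */
  return r[0] == 48 && z[0] == 0;
}
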
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su lshift;
  __v4si result = { 0, 0, 0, 0 };

  if (__B >= 0 && __B < 32)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        lshift = (__v4su) vec_splat_s32(__B);
      else
        lshift = vec_splats ((unsigned int) __B);

      result = vec_vslw ((__v4si) __A, lshift);
    }

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du lshift;
  __v2di result = { 0, 0 };

  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        lshift = (__v2du) vec_splat_s32(__B);
      else
        lshift = (__v2du) vec_splats ((unsigned int) __B);

      result = vec_vsld ((__v2di) __A, lshift);
    }

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi result;

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
        rshift = (__v8hu) vec_splat_s16(__B);
      else
        rshift = vec_splats ((unsigned short) __B);
    }
  result = vec_vsrah ((__v8hi) __A, rshift);

  return (__m128i) result;
}

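/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that the arithmetic shift replicates the sign bit, and
   that counts of 16 or more behave like a shift by 15.  */
static __inline int __attribute__ ((__unused__))
__example_srai_epi16 (void)
{
  __v8hi a = { -32, -32, -32, -32, -32, -32, -32, -32 };
  __v8hi r = (__v8hi) _mm_srai_epi16 ((__m128i) a, 3);   /* -32 >> 3 == -4.  */
  __v8hi s = (__v8hi) _mm_srai_epi16 ((__m128i) a, 20);  /* Acts as shift by 15.  */
  return r[0] == -4 && s[0] == -1;
}
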
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su rshift = { 31, 31, 31, 31 };
  __v4si result;

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v4su) vec_splat_s32(__B);
          else
            rshift = (__v4su) vec_splats((unsigned int)__B);
        }
      else
        rshift = vec_splats ((unsigned int) __B);
    }
  result = vec_vsraw ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    result = vec_sld ((__v16qu) __A, zeros, __N);
  else
    result = zeros;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR.  */
      result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
    else
      {
        __v16qu shift = vec_splats((unsigned char)(__N*8));
        result = vec_sro ((__v16qu)__A, shift);
      }
  else
    result = zeros;

  return (__m128i) result;
}

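/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that the whole 128-bit value is shifted right by __N
   bytes and zero filled from the top.  */
static __inline int __attribute__ ((__unused__))
__example_bsrli_si128 (void)
{
  __v16qu a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  __v16qu r = (__v16qu) _mm_bsrli_si128 ((__m128i) a, 4);
  /* Byte 4 moves down to byte 0; the top four bytes become zero.  */
  return r[0] == 4 && r[15] == 0;
}
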
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return _mm_bsrli_si128 (__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    result = vec_sld ((__v16qu) __A, zeros, _imm5);
#elif __BIG_ENDIAN__
    result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    result = zeros;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  __v8hu rshift;
  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
        rshift = (__v8hu) vec_splat_s16(__B);
      else
        rshift = vec_splats ((unsigned short) __B);

      result = vec_vsrh ((__v8hi) __A, rshift);
    }

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  __v4su rshift;
  __v4si result = { 0, 0, 0, 0 };

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v4su) vec_splat_s32(__B);
          else
            rshift = (__v4su) vec_splats((unsigned int)__B);
        }
      else
        rshift = vec_splats ((unsigned int) __B);

      result = vec_vsrw ((__v4si) __A, rshift);
    }

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du rshift;
  __v2di result = { 0, 0 };

  if (__B < 64)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v2du) vec_splat_s32(__B);
          else
            rshift = (__v2du) vec_splats((unsigned long long)__B);
        }
      else
        rshift = (__v2du) vec_splats ((unsigned int) __B);

      result = vec_vsrd ((__v2di) __A, rshift);
    }

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu lshift, shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v8hu)__B, 0);
#elif __BIG_ENDIAN__
  lshift = vec_splat ((__v8hu)__B, 3);
#endif
  shmask = lshift <= shmax;
  result = vec_vslh ((__v8hu) __A, lshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

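/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that the shift count is taken from the low 64 bits of the
   second operand; counts above 15 clear the result (the shmask select
   above).  */
static __inline int __attribute__ ((__unused__))
__example_sll_epi16 (void)
{
  __v8hi a = { 1, 1, 1, 1, 1, 1, 1, 1 };
  __v2di cnt = { 5, 0 };          /* Shift count in the low 64 bits.  */
  __v8hi r = (__v8hi) _mm_sll_epi16 ((__m128i) a, (__m128i) cnt);
  return r[0] == 32;              /* 1 << 5.  */
}
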
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su lshift, shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;
#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v4su)__B, 0);
#elif __BIG_ENDIAN__
  lshift = vec_splat ((__v4su)__B, 1);
#endif
  shmask = lshift < shmax;
  result = vec_vslw ((__v4su) __A, lshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du lshift, shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  lshift = (__v2du) vec_splat ((__v2du)__B, 0);
  shmask = lshift < shmax;
  result = vec_vsld ((__v2du) __A, lshift);
  result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result,
                             (__v2df) shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu rshift;
  __v8hi result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 3);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_vsrah ((__v8hi) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su rshmax = { 31, 31, 31, 31 };
  __v4su rshift;
  __v4si result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 1);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_vsraw ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu rshift, shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 3);
#endif
  shmask = rshift <= shmax;
  result = vec_vsrh ((__v8hu) __A, rshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su rshift, shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 1);
#endif
  shmask = rshift < shmax;
  result = vec_vsrw ((__v4su) __A, rshift);
  result = vec_sel (shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift, shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = (__v2du) vec_splat ((__v2du)__B, 0);
  shmask = rshift < shmax;
  result = vec_vsrd ((__v2du) __A, rshift);
  result = (__v2du)vec_sel ((__v2du)shmask, (__v2du)result, (__v2du)shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}

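/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that _mm_insert_epi16 replaces one halfword and that
   _mm_extract_epi16 returns the selected halfword zero extended.  */
static __inline int __attribute__ ((__unused__))
__example_insert_extract_epi16 (void)
{
  __v8hi a = { 10, 11, 12, 13, 14, 15, 16, 17 };
  __m128i b = _mm_insert_epi16 ((__m128i) a, -1, 2);
  return _mm_extract_epi16 (b, 2) == 0xFFFF
         && _mm_extract_epi16 ((__m128i) a, 7) == 17;
}
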
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bits of the 16
   signed or unsigned 8-bit integers in A and zero extends the upper
   bits of the result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector __m64 result;
  static const __vector unsigned char perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
        0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
        0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
#elif __BIG_ENDIAN__
        0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
        0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
#endif
    };

  result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
                                         (__vector unsigned char) perm_mask);

#ifdef __LITTLE_ENDIAN__
  return result[1];
#elif __BIG_ENDIAN__
  return result[0];
#endif
}
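
/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that bit N of the result is the most significant bit of
   byte N of the source.  */
static __inline int __attribute__ ((__unused__))
__example_movemask_epi8 (void)
{
  __v16qi a = { -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128 };
  /* Bytes 0, 2 and 15 have their most significant bits set.  */
  return _mm_movemask_epi8 ((__m128i) a) == 0x8005;
}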
#endif /* _ARCH_PWR8 */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL,  0x1f1e1d1c1b1a1918UL};
#elif __BIG_ENDIAN__
      { 0x1011121314151617UL,  0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_98];
  t.as_short[2] = permute_selectors[element_selector_BA];
  t.as_short[1] = permute_selectors[element_selector_DC];
  t.as_short[0] = permute_selectors[element_selector_FE];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[1] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[0] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

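/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that the selector reorders only the four high halfwords;
   the low four are copied through unchanged.  */
static __inline int __attribute__ ((__unused__))
__example_shufflehi_epi16 (void)
{
  __v8hi a = { 0, 1, 2, 3, 4, 5, 6, 7 };
  /* Selector 0x1B reverses the four high halfwords.  */
  __v8hi r = (__v8hi) _mm_shufflehi_epi16 ((__m128i) a, 0x1B);
  return r[0] == 0 && r[4] == 7 && r[7] == 4;
}
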
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#elif __BIG_ENDIAN__
      0x0e0f, 0x0c0d, 0x0a0b, 0x0809
#endif
    };
  __v2du pmask = { 0x1011121314151617UL,  0x1f1e1d1c1b1a1918UL};
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[0] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[1] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
#endif
    };
  __v4su t;

#ifdef __LITTLE_ENDIAN__
  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
#elif __BIG_ENDIAN__
  t[3] = permute_selectors[element_selector_10] + 0x10101010;
  t[2] = permute_selectors[element_selector_32] + 0x10101010;
  t[1] = permute_selectors[element_selector_54];
  t[0] = permute_selectors[element_selector_76];
#endif
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A,
                            (__vector unsigned char)t);
}

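/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that each two-bit field of the selector picks one source
   word for the corresponding result word.  */
static __inline int __attribute__ ((__unused__))
__example_shuffle_epi32 (void)
{
  __v4si a = { 10, 11, 12, 13 };
  /* Selector 0x1B picks elements 3,2,1,0, reversing the vector.  */
  __v4si r = (__v4si) _mm_shuffle_epi32 ((__m128i) a, 0x1B);
  return r[0] == 13 && r[3] == 10;
}
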
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i *p = (__m128i*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

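/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that only bytes whose mask byte has its most significant
   bit set are stored.  Note that this emulation reads and rewrites all
   16 bytes at the destination, so __C must address a valid 16-byte
   buffer.  */
static __inline int __attribute__ ((__unused__))
__example_maskmoveu_si128 (void)
{
  char buf[16] = { 0 };
  __v16qi data = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
  __v16qi mask = { -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  _mm_maskmoveu_si128 ((__m128i) data, (__m128i) mask, buf);
  return buf[0] == 1 && buf[1] == 0 && buf[2] == 3;
}
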
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

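/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that the unsigned average rounds up, i.e. computes
   (a + b + 1) >> 1 in each lane.  */
static __inline int __attribute__ ((__unused__))
__example_avg_epu8 (void)
{
  __v16qu a = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
  __v16qu b = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 };
  __v16qu r = (__v16qu) _mm_avg_epu8 ((__m128i) a, (__m128i) b);
  return r[0] == 2;               /* (1 + 2 + 1) >> 1 == 2.  */
}
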
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#elif __BIG_ENDIAN__
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}

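/* Illustrative sketch only: a hypothetical helper (not part of the SSE2
   API) showing that each 64-bit half of the result receives the sum of
   absolute differences of its eight byte pairs.  */
static __inline int __attribute__ ((__unused__))
__example_sad_epu8 (void)
{
  __v16qu a = { 10, 20, 30, 40, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0 };
  __v16qu b = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  __v2du r = (__v2du) _mm_sad_epu8 ((__m128i) a, (__m128i) b);
  /* 10+20+30+40 == 100 in the low half, 1+1+1+1 == 4 in the high half.  */
  return r[0] == 100 && r[1] == 4;
}
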
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */