/* Copyright (C) 2003-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   important differences in the data format and placement of double
   scalars in the vector register.

   For PowerISA, scalar double is held in the FPRs (the leftmost
   64 bits of the low 32 VSRs), while X86_64 SSE2 uses the rightmost
   64 bits of the XMM.  These differences require extra steps on POWER
   to match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
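
/* As a hedged illustration of that last recommendation (this sketch is
   not part of the header, and handle_overflow() is a hypothetical
   application callback), code that pokes the MXCSR directly can
   usually be expressed with the standard C99 <fenv.h> interfaces:

     #include <fenv.h>

     fesetround (FE_TOWARDZERO);        // instead of setting MXCSR.RC
     feclearexcept (FE_ALL_EXCEPT);     // instead of clearing MXCSR flags
     ...
     if (fetestexcept (FE_OVERFLOW))    // instead of reading MXCSR.OF
       handle_overflow ();

   These calls are portable across x86_64 and powerpc64le and avoid
   any dependence on the MXCSR layout.  */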

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
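
/* A worked illustration (not part of the API definition):
   _MM_SHUFFLE2 (1, 0) evaluates to 2, so _mm_shuffle_pd (a, b, 2)
   defined below returns { a[0], b[1] }, i.e. the low double from the
   first operand and the high double from the second.  */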

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df __result = (__v2df) __A;
  __result [0] = ((__v2df) __B)[0];
  return (__m128d) __result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (vec_sqrt (__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __c;
  __c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (vec_min (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_min (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (vec_max (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df __a, __b, __c;
  __a = vec_splats (__A[0]);
  __b = vec_splats (__B[0]);
  __c = vec_max (__a, __b);
  return (__m128d) _mm_setr_pd (__c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  __v2df __temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
  return ((__m128d)vec_nor (__temp, __temp));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

#if _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq (__A, __A);
  d = (__v2du)vec_cmpeq (__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(c, d));
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks a NAN.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}
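
/* A worked illustration of the two compares above (not part of the
   header): with __A = { NAN, 1.0 } and __B = { 2.0, 3.0 },
   _mm_cmpord_pd returns { 0x0, all-ones } and _mm_cmpunord_pd returns
   { all-ones, 0x0 }, matching the SSE2 definition that "ordered"
   means neither operand is a NaN.  */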

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

#if _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
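
/* For example (illustration only): _mm_setr_epi32 (0, 1, 2, 3) and
   _mm_set_epi32 (3, 2, 1, 0) build the same vector, with 0 in
   element 0 and 3 in element 3.  */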

/* Load 128 bits of integer data from P.  The address must be
   16-byte aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  assert(((unsigned long )__P & 0xfUL) == 0UL);
  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  __v2di __val;
  /* For LE we need the Vector Unpack Low Signed Word instruction,
     which vec_unpackh generates here.  */
  __val = (__v2di)vec_unpackh ((__v4si)__A);

  return (__m128d)vec_ctf (__val, 0);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df __rounded = vec_rint (__A);
  __v4si __result, __temp;
  const __v4si __vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__rounded)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
				 (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return (__m128i) __result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64) __result[0];
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4sf) vec_vpkudum ((__vector long long) __temp,
				 (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (__temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo (__temp, __temp);
#else
  __temp = vec_mergee (__temp, __temp);
#endif
  __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
				 (__vector long long) __vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
  }
#endif

  return ((__m128i) __result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i __result = _mm_cvttpd_epi32 (__A);

  return (__m64) __result[0];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats (__A);
  __tmp2 = (__v2di)vec_unpackl (__temp);
  __result = vec_ctf ((__vector signed long long) __tmp2, 0);
  return (__m128d)__result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf) __A);
  __result = vec_cts (__rounded, 0);
  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si __result;

  __result = vec_cts ((__v4sf) __A, 0);
  return (__m128i) __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>.  If so, use that.  */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is not current, so we need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi (__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi (__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a vector
     merge high word to get the elements lined up.  */
  __temp = vec_vmrghw (__a, __a);
#endif
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (__result)
      : "wa" (__temp)
      : );
  return (__m128d) __result;
#endif
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  __v2df __rounded = vec_rint((__v2df) __A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  __v2df __rounded = vec_rint ((__v2df) __A );
  long long __result = ((__v2df) __rounded)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return _mm_cvtsd_si64 ((__v2df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return _mm_cvttsd_si64 (__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi (__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (__temp_s)
      : "wa" (__temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1);
#else
  __result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result [0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return _mm_cvtsi64_sd (__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf __temp = vec_splat ((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (__res)
      : "wa" (__temp)
      : );
  return (__m128d) vec_mergel (__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res [0] = ((__v4sf)__B) [0];
  return (__m128d) __res;
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi (__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi (__A, __B, 1);
#endif
  else
    __result = vec_mergel (__A, __B);

  return __result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df __result = (__v2df)__A;
  __result [0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
    };

  __result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */
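
/* A worked example (illustration only): for __A = { -1.0, 2.0 } the
   sign bit of element 0 is set and that of element 1 is clear, so
   _mm_movemask_pd returns 0x1.  */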
1261 
1262 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16(__m128i __A,__m128i __B)1263 _mm_packs_epi16 (__m128i __A, __m128i __B)
1264 {
1265   return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1266 }
1267 
1268 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32(__m128i __A,__m128i __B)1269 _mm_packs_epi32 (__m128i __A, __m128i __B)
1270 {
1271   return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1272 }
1273 
1274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16(__m128i __A,__m128i __B)1275 _mm_packus_epi16 (__m128i __A, __m128i __B)
1276 {
1277   return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1278 }
1279 
1280 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8(__m128i __A,__m128i __B)1281 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1282 {
1283   return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1284 }
1285 
1286 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16(__m128i __A,__m128i __B)1287 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1288 {
1289   return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1290 }
1291 
1292 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32(__m128i __A,__m128i __B)1293 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1294 {
1295   return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1296 }
1297 
1298 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64(__m128i __A,__m128i __B)1299 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1300 {
1301   return (__m128i) vec_mergel ((__vector long long) __A,
1302 			       (__vector long long) __B);
1303 }
1304 
1305 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8(__m128i __A,__m128i __B)1306 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1307 {
1308   return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1309 }
1310 
1311 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16(__m128i __A,__m128i __B)1312 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1313 {
1314   return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1315 }
1316 
1317 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32(__m128i __A,__m128i __B)1318 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1319 {
1320   return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1321 }
1322 
1323 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64(__m128i __A,__m128i __B)1324 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1325 {
1326   return (__m128i) vec_mergeh ((__vector long long) __A,
1327 			       (__vector long long) __B);
1328 }
1329 
1330 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8(__m128i __A,__m128i __B)1331 _mm_add_epi8 (__m128i __A, __m128i __B)
1332 {
1333   return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1334 }
1335 
1336 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16(__m128i __A,__m128i __B)1337 _mm_add_epi16 (__m128i __A, __m128i __B)
1338 {
1339   return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1340 }
1341 
1342 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32(__m128i __A,__m128i __B)1343 _mm_add_epi32 (__m128i __A, __m128i __B)
1344 {
1345   return (__m128i) ((__v4su)__A + (__v4su)__B);
1346 }
1347 
1348 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64(__m128i __A,__m128i __B)1349 _mm_add_epi64 (__m128i __A, __m128i __B)
1350 {
1351   return (__m128i) ((__v2du)__A + (__v2du)__B);
1352 }
1353 
1354 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8(__m128i __A,__m128i __B)1355 _mm_adds_epi8 (__m128i __A, __m128i __B)
1356 {
1357   return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1358 }
1359 
1360 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16(__m128i __A,__m128i __B)1361 _mm_adds_epi16 (__m128i __A, __m128i __B)
1362 {
1363   return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1364 }
1365 
1366 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8(__m128i __A,__m128i __B)1367 _mm_adds_epu8 (__m128i __A, __m128i __B)
1368 {
1369   return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1370 }
1371 
1372 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16(__m128i __A,__m128i __B)1373 _mm_adds_epu16 (__m128i __A, __m128i __B)
1374 {
1375   return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1376 }
1377 
1378 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8(__m128i __A,__m128i __B)1379 _mm_sub_epi8 (__m128i __A, __m128i __B)
1380 {
1381   return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1382 }
1383 
1384 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16(__m128i __A,__m128i __B)1385 _mm_sub_epi16 (__m128i __A, __m128i __B)
1386 {
1387   return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1388 }
1389 
1390 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32(__m128i __A,__m128i __B)1391 _mm_sub_epi32 (__m128i __A, __m128i __B)
1392 {
1393   return (__m128i) ((__v4su)__A - (__v4su)__B);
1394 }
1395 
1396 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64(__m128i __A,__m128i __B)1397 _mm_sub_epi64 (__m128i __A, __m128i __B)
1398 {
1399   return (__m128i) ((__v2du)__A - (__v2du)__B);
1400 }
1401 
1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8(__m128i __A,__m128i __B)1403 _mm_subs_epi8 (__m128i __A, __m128i __B)
1404 {
1405   return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1406 }
1407 
1408 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16(__m128i __A,__m128i __B)1409 _mm_subs_epi16 (__m128i __A, __m128i __B)
1410 {
1411   return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1412 }
1413 
1414 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8(__m128i __A,__m128i __B)1415 _mm_subs_epu8 (__m128i __A, __m128i __B)
1416 {
1417   return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1418 }
1419 
1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16(__m128i __A,__m128i __B)1421 _mm_subs_epu16 (__m128i __A, __m128i __B)
1422 {
1423   return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1424 }
1425 
1426 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16(__m128i __A,__m128i __B)1427 _mm_madd_epi16 (__m128i __A, __m128i __B)
1428 {
1429   __vector signed int __zero = {0, 0, 0, 0};
1430 
1431   return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, __zero);
1432 }
1433 
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int __w0, __w1;

  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

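/* Multiply the low (even-numbered, in Intel terms) 32-bit element of each
   64-bit lane to produce two 64-bit products.  vec_mule of unsigned int
   vectors is only available with GCC 8 and POWER8, so other configurations
   issue the multiply instruction directly via inline asm; little endian
   uses the "odd" form because of the reversed element numbering.  */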
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8 || !defined (_ARCH_PWR8)
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__(
      "vmulouw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__(
      "vmuleuw %0,%1,%2"
      : "=v" (__result)
      : "v" (__A), "v" (__B)
      : );
#endif
  return (__m128i) __result;
#else
  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}

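/* SSE2 immediate shifts yield zero when the count is out of range, while
   the VMX shift operations use the count modulo the element size, so the
   range checks below provide the SSE2 behavior.  For small constant counts
   an immediate-form splat (vec_splat_s16/vec_splat_s32) can be used instead
   of materializing the count in a register.  */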
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu __lshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
	__lshift = (__v8hu) vec_splat_s16(__B);
      else
	__lshift = vec_splats ((unsigned short) __B);

      __result = vec_sl ((__v8hi) __A, __lshift);
    }

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su __lshift;
  __v4si __result = { 0, 0, 0, 0 };

  if (__B >= 0 && __B < 32)
    {
      if (__builtin_constant_p(__B) && __B < 16)
	__lshift = (__v4su) vec_splat_s32(__B);
      else
	__lshift = vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v4si) __A, __lshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du __lshift;
  __v2di __result = { 0, 0 };

  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B) && __B < 16)
	__lshift = (__v2du) vec_splat_s32(__B);
      else
	__lshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sl ((__v2di) __A, __lshift);
    }

  return (__m128i) __result;
}
#endif

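/* For arithmetic right shifts SSE2 treats an oversized count as shifting
   in sign bits, so the count defaults to the element size minus one instead
   of zeroing the result.  */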
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi __result;

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);
    }
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su __rshift = { 31, 31, 31, 31 };
  __v4si __result;

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v4su) vec_splat_s32(__B);
	  else
	    __rshift = (__v4su) vec_splats((unsigned int)__B);
	}
      else
	__rshift = vec_splats ((unsigned int) __B);
    }
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

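/* Whole-register byte shifts.  Counts below 16 map to a single shift left
   double (vec_sld) or shift by octet (vec_slo/vec_sro); larger counts
   return zero, as SSE2 specifies.  The operand order and direction are
   swapped between endiannesses because the byte layout of the register
   differs.  */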
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    __result = vec_sld ((__v16qu) __A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
	 Immediate here to use the immediate form and avoid
	 load of __N * 8 value into a separate VR.  */
      __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N));
    else
#endif
      {
	__v16qu __shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
	__result = vec_sro ((__v16qu)__A, __shift);
#else
	__result = vec_slo ((__v16qu)__A, __shift);
#endif
      }
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return _mm_bsrli_si128 (__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu __result;
  const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld ((__v16qu) __A, __zeros, _imm5);
#else
    __result = vec_sld (__zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  __v8hu __rshift;
  __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
	__rshift = (__v8hu) vec_splat_s16(__B);
      else
	__rshift = vec_splats ((unsigned short) __B);

      __result = vec_sr ((__v8hi) __A, __rshift);
    }

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  __v4su __rshift;
  __v4si __result = { 0, 0, 0, 0 };

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v4su) vec_splat_s32(__B);
	  else
	    __rshift = (__v4su) vec_splats((unsigned int)__B);
	}
      else
	__rshift = vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v4si) __A, __rshift);
    }

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du __rshift;
  __v2di __result = { 0, 0 };

  if (__B < 64)
    {
      if (__builtin_constant_p(__B))
	{
	  if (__B < 16)
	    __rshift = (__v2du) vec_splat_s32(__B);
	  else
	    __rshift = (__v2du) vec_splats((unsigned long long)__B);
	}
      else
	__rshift = (__v2du) vec_splats ((unsigned int) __B);

      __result = vec_sr ((__v2di) __A, __rshift);
    }

  return (__m128i) __result;
}
#endif

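/* Shifts whose count is taken from the low 64 bits of a vector operand.
   The count is splatted to every element, the elements are shifted with
   vec_sl/vec_sr, and a compare/select pair zeroes the result when the
   count exceeds the element size, as SSE2 requires.  */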
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v8hu) __B, 0);
#else
  __lshift = vec_splat ((__v8hu) __B, 3);
#endif
  __shmask = vec_cmple (__lshift, __shmax);
  __result = vec_sl ((__v8hu) __A, __lshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat ((__v4su) __B, 0);
#else
  __lshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v4su) __A, __lshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __lshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__lshift, __shmax);
  __result = vec_sl ((__v2du) __A, __lshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif

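/* Arithmetic right shifts with a vector count: the count is clamped to the
   element size minus one with vec_min, so oversized counts replicate the
   sign bit rather than producing zero.  */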
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu __rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __rshift;
  __v8hi __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu)__B, 0);
#else
  __rshift = vec_splat ((__v8hu)__B, 3);
#endif
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v8hi) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su __rshmax = { 31, 31, 31, 31 };
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su)__B, 0);
#else
  __rshift = vec_splat ((__v4su)__B, 1);
#endif
  __rshift = vec_min (__rshift, __rshmax);
  __result = vec_sra ((__v4si) __A, __rshift);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu) __B, 0);
#else
  __rshift = vec_splat ((__v8hu) __B, 3);
#endif
  __shmask = vec_cmple (__rshift, __shmax);
  __result = vec_sr ((__v8hu) __A, __rshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su) __B, 0);
#else
  __rshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v4su) __A, __rshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __rshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v2du) __A, __rshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi __result = (__v8hi)__A;

  __result [(__N & 7)] = __D;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bit of each of the
   sixteen 8-bit elements.  */
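/* vec_vbpermq gathers one source bit per byte of the permute control; the
   control below selects the sign bit of each byte, ordered so byte 0 of
   __A becomes bit 0 of the mask.  The packed bits land in one doubleword
   of the result, selected below according to endianness.  */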
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask =
    {
	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  __result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}

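/* Rearranges the four high 16-bit elements according to the 2-bit fields
   of __mask while passing the low 64 bits through unchanged; the selector
   table is folded into a vec_perm control vector.  For example,
   _mm_shufflehi_epi16 (__A, 0x1B) reverses elements 4-7.  The same scheme,
   applied to the low 64 bits, is used by _mm_shufflelo_epi16 below.  */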
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL,  0UL};
#else
      { 0x1011121314151617UL,  0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	      0x0100, 0x0302, 0x0504, 0x0706
#else
	      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0UL,  0x1f1e1d1c1b1a1918UL};
#else
      { 0UL,  0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

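/* Selects four 32-bit elements by the 2-bit fields of __mask, again via a
   computed vec_perm control; the 0x10101010 adjustment points the upper
   two words at the second (identical) permute operand.  For example,
   _mm_shuffle_epi32 (__A, _MM_SHUFFLE (0, 1, 2, 3)) reverses the four
   elements.  */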
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
	0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)__t);
}

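/* The SSE2 masked byte store is emulated as a load, a byte select under
   the sign bits of __B, and a store of the merged result; no non-temporal
   hint is applied.  */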
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du __hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u*)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel (__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128 (__p, (__m128i)__tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

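/* Sum of absolute differences: |a - b| per byte is formed as
   max (a, b) - min (a, b), the byte differences are summed within each
   word by vec_sum4s, vec_sum2s then produces the two partial sums, and a
   rotate moves them into the element positions SSE2 expects.  */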
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __a, __b;
  __v16qu __vmin, __vmax, __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = { 0, 0, 0, 0 };
  __v4si __result;

  __a = (__v16qu) __A;
  __b = (__v16qu) __B;
  __vmin = vec_min (__a, __b);
  __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s (__vsum, (__vector signed int) __zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  __result = vec_sld (__result, __result, 4);
#else
  __result = vec_sld (__result, __result, 6);
#endif
  return (__m128i) __result;
}

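/* PowerISA has no non-temporal store, so the streaming stores below hint
   with dcbtstt (data cache block touch for store transient) and then
   perform an ordinary store, which is the closest available
   approximation.  */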
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
  );
}

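/* The x86 fences are mapped to GCC atomic thread fences, which the
   compiler expands to the appropriate PowerPC sync forms (a lighter weight
   sync for release ordering, a full sync for sequential consistency).  */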
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps (__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128 (__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd (__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128 (__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps (__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd (__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */