1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
/* This header file is intended to help port code that uses Intel
   intrinsics explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for vector
   floating-point SIMD operations.  However, scalar floating-point
   operations in vector (XMM) registers require the POWER8 VSX ISA
   (2.07) level.  There are also differences in the data format and
   placement of floating-point scalars in the vector register, which
   require extra steps to match SSE2 scalar floating-point semantics
   on POWER.

   It should be noted that there are significant differences between
   the X86_64 MXCSR and the PowerISA FPSCR/VSCR registers.  It is
   recommended to use the portable <fenv.h> facilities instead of
   accessing the MXCSR directly.

   Most SSE2 scalar floating-point intrinsic operations can be
   performed more efficiently as C language scalar floating-point
   operations or optimized to use vector SIMD operations.  We
   recommend this for new applications.
31 */
32 #error                                                                         \
33     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
34 #endif
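
/* Illustrative sketch of the recommendation above (example names and
   values are hypothetical, not part of this API): a scalar SSE2
   sequence such as

     __m128d __va = _mm_set_sd(__x);
     __m128d __vb = _mm_set_sd(__y);
     double __r = _mm_cvtsd_f64(_mm_add_sd(__va, __vb));

   can usually be written as plain C scalar arithmetic, which the
   compiler can map directly to VSX scalar instructions:

     double __r = __x + __y;  */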
35 
36 #ifndef EMMINTRIN_H_
37 #define EMMINTRIN_H_
38 
39 #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
40 
41 #include <altivec.h>
42 
43 /* We need definitions from the SSE header files.  */
44 #include <xmmintrin.h>
45 
46 /* SSE2 */
47 typedef __vector double __v2df;
48 typedef __vector long long __v2di;
49 typedef __vector unsigned long long __v2du;
50 typedef __vector int __v4si;
51 typedef __vector unsigned int __v4su;
52 typedef __vector short __v8hi;
53 typedef __vector unsigned short __v8hu;
54 typedef __vector signed char __v16qi;
55 typedef __vector unsigned char __v16qu;
56 
57 /* The Intel API is flexible enough that we must allow aliasing with other
58    vector types, and their scalar components.  */
59 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
60 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
61 
/* Unaligned versions of the same types.  */
63 typedef long long __m128i_u
64     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
65 typedef double __m128d_u
66     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
67 
68 /* Define two value permute mask.  */
69 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
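
/* Worked values, which follow directly from the macro definition:
   _MM_SHUFFLE2(0, 0) == 0, _MM_SHUFFLE2(0, 1) == 1,
   _MM_SHUFFLE2(1, 0) == 2, _MM_SHUFFLE2(1, 1) == 3.
   When the mask is used with _mm_shuffle_pd below, bit 0 selects
   which element of the first operand supplies result element [0] and
   bit 1 selects which element of the second operand supplies result
   element [1].  */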
70 
71 /* Create a vector with element 0 as F and the rest zero.  */
72 extern __inline __m128d
73     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
74     _mm_set_sd(double __F) {
75   return __extension__(__m128d){__F, 0.0};
76 }
77 
78 /* Create a vector with both elements equal to F.  */
79 extern __inline __m128d
80     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
81     _mm_set1_pd(double __F) {
82   return __extension__(__m128d){__F, __F};
83 }
84 
85 extern __inline __m128d
86     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
87     _mm_set_pd1(double __F) {
88   return _mm_set1_pd(__F);
89 }
90 
91 /* Create a vector with the lower value X and upper value W.  */
92 extern __inline __m128d
93     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
94     _mm_set_pd(double __W, double __X) {
95   return __extension__(__m128d){__X, __W};
96 }
97 
98 /* Create a vector with the lower value W and upper value X.  */
99 extern __inline __m128d
100     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101     _mm_setr_pd(double __W, double __X) {
102   return __extension__(__m128d){__W, __X};
103 }
104 
105 /* Create an undefined vector.  */
106 extern __inline __m128d
107     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
108     _mm_undefined_pd(void) {
109   __m128d __Y = __Y;
110   return __Y;
111 }
112 
113 /* Create a vector of zeros.  */
114 extern __inline __m128d
115     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
116     _mm_setzero_pd(void) {
117   return (__m128d)vec_splats(0);
118 }
119 
120 /* Sets the low DPFP value of A from the low value of B.  */
121 extern __inline __m128d
122     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123     _mm_move_sd(__m128d __A, __m128d __B) {
124   __v2df __result = (__v2df)__A;
125   __result[0] = ((__v2df)__B)[0];
126   return (__m128d)__result;
127 }
128 
129 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
130 extern __inline __m128d
131     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132     _mm_load_pd(double const *__P) {
133   return ((__m128d)vec_ld(0, (__v16qu *)__P));
134 }
135 
136 /* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
137 extern __inline __m128d
138     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139     _mm_loadu_pd(double const *__P) {
140   return (vec_vsx_ld(0, __P));
141 }
142 
/* Create a vector with both elements equal to *P.  */
144 extern __inline __m128d
145     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
146     _mm_load1_pd(double const *__P) {
147   return (vec_splats(*__P));
148 }
149 
150 /* Create a vector with element 0 as *P and the rest zero.  */
151 extern __inline __m128d
152     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153     _mm_load_sd(double const *__P) {
154   return _mm_set_sd(*__P);
155 }
156 
157 extern __inline __m128d
158     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
159     _mm_load_pd1(double const *__P) {
160   return _mm_load1_pd(__P);
161 }
162 
163 /* Load two DPFP values in reverse order.  The address must be aligned.  */
164 extern __inline __m128d
165     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
166     _mm_loadr_pd(double const *__P) {
167   __v2df __tmp = _mm_load_pd(__P);
168   return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
169 }
170 
171 /* Store two DPFP values.  The address must be 16-byte aligned.  */
172 extern __inline void
173     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
174     _mm_store_pd(double *__P, __m128d __A) {
175   vec_st((__v16qu)__A, 0, (__v16qu *)__P);
176 }
177 
178 /* Store two DPFP values.  The address need not be 16-byte aligned.  */
179 extern __inline void
180     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
181     _mm_storeu_pd(double *__P, __m128d __A) {
182   *(__m128d_u *)__P = __A;
183 }
184 
185 /* Stores the lower DPFP value.  */
186 extern __inline void
187     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188     _mm_store_sd(double *__P, __m128d __A) {
189   *__P = ((__v2df)__A)[0];
190 }
191 
192 extern __inline double
193     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194     _mm_cvtsd_f64(__m128d __A) {
195   return ((__v2df)__A)[0];
196 }
197 
198 extern __inline void
199     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
200     _mm_storel_pd(double *__P, __m128d __A) {
201   _mm_store_sd(__P, __A);
202 }
203 
204 /* Stores the upper DPFP value.  */
205 extern __inline void
206     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207     _mm_storeh_pd(double *__P, __m128d __A) {
208   *__P = ((__v2df)__A)[1];
209 }

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
212 extern __inline void
213     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214     _mm_store1_pd(double *__P, __m128d __A) {
215   _mm_store_pd(__P, vec_splat(__A, 0));
216 }
217 
218 extern __inline void
219     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
220     _mm_store_pd1(double *__P, __m128d __A) {
221   _mm_store1_pd(__P, __A);
222 }
223 
224 /* Store two DPFP values in reverse order.  The address must be aligned.  */
225 extern __inline void
226     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
227     _mm_storer_pd(double *__P, __m128d __A) {
228   _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
229 }
230 
231 /* Intel intrinsic.  */
232 extern __inline long long
233     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
234     _mm_cvtsi128_si64(__m128i __A) {
235   return ((__v2di)__A)[0];
236 }
237 
238 /* Microsoft intrinsic.  */
239 extern __inline long long
240     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241     _mm_cvtsi128_si64x(__m128i __A) {
242   return ((__v2di)__A)[0];
243 }
244 
245 extern __inline __m128d
246     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
247     _mm_add_pd(__m128d __A, __m128d __B) {
248   return (__m128d)((__v2df)__A + (__v2df)__B);
249 }
250 
251 /* Add the lower double-precision (64-bit) floating-point element in
252    a and b, store the result in the lower element of dst, and copy
253    the upper element from a to the upper element of dst. */
254 extern __inline __m128d
255     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
256     _mm_add_sd(__m128d __A, __m128d __B) {
257   __A[0] = __A[0] + __B[0];
258   return (__A);
259 }
260 
261 extern __inline __m128d
262     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263     _mm_sub_pd(__m128d __A, __m128d __B) {
264   return (__m128d)((__v2df)__A - (__v2df)__B);
265 }
266 
267 extern __inline __m128d
268     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
269     _mm_sub_sd(__m128d __A, __m128d __B) {
270   __A[0] = __A[0] - __B[0];
271   return (__A);
272 }
273 
274 extern __inline __m128d
275     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276     _mm_mul_pd(__m128d __A, __m128d __B) {
277   return (__m128d)((__v2df)__A * (__v2df)__B);
278 }
279 
280 extern __inline __m128d
281     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282     _mm_mul_sd(__m128d __A, __m128d __B) {
283   __A[0] = __A[0] * __B[0];
284   return (__A);
285 }
286 
287 extern __inline __m128d
288     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289     _mm_div_pd(__m128d __A, __m128d __B) {
290   return (__m128d)((__v2df)__A / (__v2df)__B);
291 }
292 
293 extern __inline __m128d
294     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
295     _mm_div_sd(__m128d __A, __m128d __B) {
296   __A[0] = __A[0] / __B[0];
297   return (__A);
298 }
299 
300 extern __inline __m128d
301     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302     _mm_sqrt_pd(__m128d __A) {
303   return (vec_sqrt(__A));
304 }
305 
306 /* Return pair {sqrt (B[0]), A[1]}.  */
307 extern __inline __m128d
308     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309     _mm_sqrt_sd(__m128d __A, __m128d __B) {
310   __v2df __c;
311   __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
312   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
313 }
314 
315 extern __inline __m128d
316     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
317     _mm_min_pd(__m128d __A, __m128d __B) {
318   return (vec_min(__A, __B));
319 }
320 
321 extern __inline __m128d
322     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
323     _mm_min_sd(__m128d __A, __m128d __B) {
324   __v2df __a, __b, __c;
325   __a = vec_splats(__A[0]);
326   __b = vec_splats(__B[0]);
327   __c = vec_min(__a, __b);
328   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
329 }
330 
331 extern __inline __m128d
332     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333     _mm_max_pd(__m128d __A, __m128d __B) {
334   return (vec_max(__A, __B));
335 }
336 
337 extern __inline __m128d
338     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
339     _mm_max_sd(__m128d __A, __m128d __B) {
340   __v2df __a, __b, __c;
341   __a = vec_splats(__A[0]);
342   __b = vec_splats(__B[0]);
343   __c = vec_max(__a, __b);
344   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
345 }
346 
347 extern __inline __m128d
348     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349     _mm_cmpeq_pd(__m128d __A, __m128d __B) {
350   return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
351 }
352 
353 extern __inline __m128d
354     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355     _mm_cmplt_pd(__m128d __A, __m128d __B) {
356   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
357 }
358 
359 extern __inline __m128d
360     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361     _mm_cmple_pd(__m128d __A, __m128d __B) {
362   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
363 }
364 
365 extern __inline __m128d
366     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367     _mm_cmpgt_pd(__m128d __A, __m128d __B) {
368   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
369 }
370 
371 extern __inline __m128d
372     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373     _mm_cmpge_pd(__m128d __A, __m128d __B) {
374   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
375 }
376 
377 extern __inline __m128d
378     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
379     _mm_cmpneq_pd(__m128d __A, __m128d __B) {
380   __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
381   return ((__m128d)vec_nor(__temp, __temp));
382 }
383 
384 extern __inline __m128d
385     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386     _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
387   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
388 }
389 
390 extern __inline __m128d
391     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392     _mm_cmpnle_pd(__m128d __A, __m128d __B) {
393   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
394 }
395 
396 extern __inline __m128d
397     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
398     _mm_cmpngt_pd(__m128d __A, __m128d __B) {
399   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
400 }
401 
402 extern __inline __m128d
403     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404     _mm_cmpnge_pd(__m128d __A, __m128d __B) {
405   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
406 }
407 
408 extern __inline __m128d
409     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
410     _mm_cmpord_pd(__m128d __A, __m128d __B) {
411   __v2du __c, __d;
412   /* Compare against self will return false (0's) if NAN.  */
413   __c = (__v2du)vec_cmpeq(__A, __A);
414   __d = (__v2du)vec_cmpeq(__B, __B);
415   /* A != NAN and B != NAN.  */
416   return ((__m128d)vec_and(__c, __d));
417 }
418 
419 extern __inline __m128d
420     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
421     _mm_cmpunord_pd(__m128d __A, __m128d __B) {
422 #if _ARCH_PWR8
423   __v2du __c, __d;
424   /* Compare against self will return false (0's) if NAN.  */
425   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
426   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
429   __c = vec_nor(__c, __c);
430   return ((__m128d)vec_orc(__c, __d));
431 #else
432   __v2du __c, __d;
433   /* Compare against self will return false (0's) if NAN.  */
434   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
435   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Flip the results so true ('1's) indicates a NaN.  */
437   __c = vec_nor(__c, __c);
438   __d = vec_nor(__d, __d);
439   return ((__m128d)vec_or(__c, __d));
440 #endif
441 }
442 
443 extern __inline __m128d
444     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
445     _mm_cmpeq_sd(__m128d __A, __m128d __B) {
446   __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
451   __a = vec_splats(__A[0]);
452   __b = vec_splats(__B[0]);
453   __c = (__v2df)vec_cmpeq(__a, __b);
454   /* Then we merge the lower double result with the original upper
455      double from __A.  */
456   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
457 }
458 
459 extern __inline __m128d
460     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
461     _mm_cmplt_sd(__m128d __A, __m128d __B) {
462   __v2df __a, __b, __c;
463   __a = vec_splats(__A[0]);
464   __b = vec_splats(__B[0]);
465   __c = (__v2df)vec_cmplt(__a, __b);
466   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
467 }
468 
469 extern __inline __m128d
470     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471     _mm_cmple_sd(__m128d __A, __m128d __B) {
472   __v2df __a, __b, __c;
473   __a = vec_splats(__A[0]);
474   __b = vec_splats(__B[0]);
475   __c = (__v2df)vec_cmple(__a, __b);
476   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
477 }
478 
479 extern __inline __m128d
480     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
481     _mm_cmpgt_sd(__m128d __A, __m128d __B) {
482   __v2df __a, __b, __c;
483   __a = vec_splats(__A[0]);
484   __b = vec_splats(__B[0]);
485   __c = (__v2df)vec_cmpgt(__a, __b);
486   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
487 }
488 
489 extern __inline __m128d
490     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
491     _mm_cmpge_sd(__m128d __A, __m128d __B) {
492   __v2df __a, __b, __c;
493   __a = vec_splats(__A[0]);
494   __b = vec_splats(__B[0]);
495   __c = (__v2df)vec_cmpge(__a, __b);
496   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
497 }
498 
499 extern __inline __m128d
500     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501     _mm_cmpneq_sd(__m128d __A, __m128d __B) {
502   __v2df __a, __b, __c;
503   __a = vec_splats(__A[0]);
504   __b = vec_splats(__B[0]);
505   __c = (__v2df)vec_cmpeq(__a, __b);
506   __c = vec_nor(__c, __c);
507   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
508 }
509 
510 extern __inline __m128d
511     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
512     _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
513   __v2df __a, __b, __c;
514   __a = vec_splats(__A[0]);
515   __b = vec_splats(__B[0]);
516   /* Not less than is just greater than or equal.  */
517   __c = (__v2df)vec_cmpge(__a, __b);
518   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
519 }
520 
521 extern __inline __m128d
522     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523     _mm_cmpnle_sd(__m128d __A, __m128d __B) {
524   __v2df __a, __b, __c;
525   __a = vec_splats(__A[0]);
526   __b = vec_splats(__B[0]);
527   /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
529   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
530 }
531 
532 extern __inline __m128d
533     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
534     _mm_cmpngt_sd(__m128d __A, __m128d __B) {
535   __v2df __a, __b, __c;
536   __a = vec_splats(__A[0]);
537   __b = vec_splats(__B[0]);
538   /* Not greater than is just less than or equal.  */
539   __c = (__v2df)vec_cmple(__a, __b);
540   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
541 }
542 
543 extern __inline __m128d
544     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
545     _mm_cmpnge_sd(__m128d __A, __m128d __B) {
546   __v2df __a, __b, __c;
547   __a = vec_splats(__A[0]);
548   __b = vec_splats(__B[0]);
549   /* Not greater than or equal is just less than.  */
550   __c = (__v2df)vec_cmplt(__a, __b);
551   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
552 }
553 
554 extern __inline __m128d
555     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556     _mm_cmpord_sd(__m128d __A, __m128d __B) {
557   __v2df __r;
558   __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
559   return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
560 }
561 
562 extern __inline __m128d
563     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
564     _mm_cmpunord_sd(__m128d __A, __m128d __B) {
565   __v2df __r;
566   __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
567   return (__m128d)_mm_setr_pd(__r[0], __A[1]);
568 }
569 
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
577 extern __inline int
578     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
579     _mm_comieq_sd(__m128d __A, __m128d __B) {
580   return (__A[0] == __B[0]);
581 }
582 
583 extern __inline int
584     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585     _mm_comilt_sd(__m128d __A, __m128d __B) {
586   return (__A[0] < __B[0]);
587 }
588 
589 extern __inline int
590     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591     _mm_comile_sd(__m128d __A, __m128d __B) {
592   return (__A[0] <= __B[0]);
593 }
594 
595 extern __inline int
596     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
597     _mm_comigt_sd(__m128d __A, __m128d __B) {
598   return (__A[0] > __B[0]);
599 }
600 
601 extern __inline int
602     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
603     _mm_comige_sd(__m128d __A, __m128d __B) {
604   return (__A[0] >= __B[0]);
605 }
606 
607 extern __inline int
608     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
609     _mm_comineq_sd(__m128d __A, __m128d __B) {
610   return (__A[0] != __B[0]);
611 }
612 
613 extern __inline int
614     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
615     _mm_ucomieq_sd(__m128d __A, __m128d __B) {
616   return (__A[0] == __B[0]);
617 }
618 
619 extern __inline int
620     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621     _mm_ucomilt_sd(__m128d __A, __m128d __B) {
622   return (__A[0] < __B[0]);
623 }
624 
625 extern __inline int
626     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627     _mm_ucomile_sd(__m128d __A, __m128d __B) {
628   return (__A[0] <= __B[0]);
629 }
630 
631 extern __inline int
632     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
633     _mm_ucomigt_sd(__m128d __A, __m128d __B) {
634   return (__A[0] > __B[0]);
635 }
636 
637 extern __inline int
638     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639     _mm_ucomige_sd(__m128d __A, __m128d __B) {
640   return (__A[0] >= __B[0]);
641 }
642 
643 extern __inline int
644     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
645     _mm_ucomineq_sd(__m128d __A, __m128d __B) {
646   return (__A[0] != __B[0]);
647 }
648 
649 /* Create a vector of Qi, where i is the element number.  */
650 extern __inline __m128i
651     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
652     _mm_set_epi64x(long long __q1, long long __q0) {
653   return __extension__(__m128i)(__v2di){__q0, __q1};
654 }
655 
656 extern __inline __m128i
657     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
658     _mm_set_epi64(__m64 __q1, __m64 __q0) {
659   return _mm_set_epi64x((long long)__q1, (long long)__q0);
660 }
661 
662 extern __inline __m128i
663     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664     _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
665   return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
666 }
667 
668 extern __inline __m128i
669     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670     _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
671                   short __q2, short __q1, short __q0) {
672   return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
673                                         __q4, __q5, __q6, __q7};
674 }
675 
676 extern __inline __m128i
677     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
678     _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
679                  char __q10, char __q09, char __q08, char __q07, char __q06,
680                  char __q05, char __q04, char __q03, char __q02, char __q01,
681                  char __q00) {
682   return __extension__(__m128i)(__v16qi){
683       __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
684       __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
685 }
686 
687 /* Set all of the elements of the vector to A.  */
688 extern __inline __m128i
689     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
690     _mm_set1_epi64x(long long __A) {
691   return _mm_set_epi64x(__A, __A);
692 }
693 
694 extern __inline __m128i
695     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
696     _mm_set1_epi64(__m64 __A) {
697   return _mm_set_epi64(__A, __A);
698 }
699 
700 extern __inline __m128i
701     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
702     _mm_set1_epi32(int __A) {
703   return _mm_set_epi32(__A, __A, __A, __A);
704 }
705 
706 extern __inline __m128i
707     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
708     _mm_set1_epi16(short __A) {
709   return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
710 }
711 
712 extern __inline __m128i
713     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
714     _mm_set1_epi8(char __A) {
715   return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
716                       __A, __A, __A, __A, __A);
717 }
718 
719 /* Create a vector of Qi, where i is the element number.
720    The parameter order is reversed from the _mm_set_epi* functions.  */
721 extern __inline __m128i
722     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
723     _mm_setr_epi64(__m64 __q0, __m64 __q1) {
724   return _mm_set_epi64(__q1, __q0);
725 }
726 
727 extern __inline __m128i
728     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
729     _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
730   return _mm_set_epi32(__q3, __q2, __q1, __q0);
731 }
732 
733 extern __inline __m128i
734     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735     _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
736                    short __q5, short __q6, short __q7) {
737   return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
738 }
739 
740 extern __inline __m128i
741     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
742     _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
743                   char __q05, char __q06, char __q07, char __q08, char __q09,
744                   char __q10, char __q11, char __q12, char __q13, char __q14,
745                   char __q15) {
746   return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
747                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
748 }
749 
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
751 extern __inline __m128i
752     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
753     _mm_load_si128(__m128i const *__P) {
754   return *__P;
755 }
756 
757 extern __inline __m128i
758     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
759     _mm_loadu_si128(__m128i_u const *__P) {
760   return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
761 }
762 
763 extern __inline __m128i
764     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
765     _mm_loadl_epi64(__m128i_u const *__P) {
766   return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
767 }
768 
769 extern __inline void
770     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
771     _mm_store_si128(__m128i *__P, __m128i __B) {
772   vec_st((__v16qu)__B, 0, (__v16qu *)__P);
773 }
774 
775 extern __inline void
776     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
777     _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
778   *__P = __B;
779 }
780 
781 extern __inline void
782     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
783     _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
784   *(long long *)__P = ((__v2di)__B)[0];
785 }
786 
787 extern __inline __m64
788     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
789     _mm_movepi64_pi64(__m128i_u __B) {
790   return (__m64)((__v2di)__B)[0];
791 }
792 
793 extern __inline __m128i
794     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
795     _mm_movpi64_epi64(__m64 __A) {
796   return _mm_set_epi64((__m64)0LL, __A);
797 }
798 
799 extern __inline __m128i
800     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
801     _mm_move_epi64(__m128i __A) {
802   return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
803 }
804 
805 /* Create an undefined vector.  */
806 extern __inline __m128i
807     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
808     _mm_undefined_si128(void) {
809   __m128i __Y = __Y;
810   return __Y;
811 }
812 
813 /* Create a vector of zeros.  */
814 extern __inline __m128i
815     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
816     _mm_setzero_si128(void) {
817   return __extension__(__m128i)(__v4si){0, 0, 0, 0};
818 }
819 
820 #ifdef _ARCH_PWR8
821 extern __inline __m128d
822     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
823     _mm_cvtepi32_pd(__m128i __A) {
824   __v2di __val;
  /* For LE we need Vector Unpack Low Signed Word, which is
     generated from vec_unpackh.  */
827   __val = (__v2di)vec_unpackh((__v4si)__A);
828 
829   return (__m128d)vec_ctf(__val, 0);
830 }
831 #endif
832 
833 extern __inline __m128
834     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835     _mm_cvtepi32_ps(__m128i __A) {
836   return ((__m128)vec_ctf((__v4si)__A, 0));
837 }
838 
839 extern __inline __m128i
840     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841     _mm_cvtpd_epi32(__m128d __A) {
842   __v2df __rounded = vec_rint(__A);
843   __v4si __result, __temp;
844   const __v4si __vzero = {0, 0, 0, 0};
845 
846   /* VSX Vector truncate Double-Precision to integer and Convert to
847    Signed Integer Word format with Saturate.  */
848   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
849 
850 #ifdef _ARCH_PWR8
851 #ifdef __LITTLE_ENDIAN__
852   __temp = vec_mergeo(__temp, __temp);
853 #else
854   __temp = vec_mergee(__temp, __temp);
855 #endif
856   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
857                                  (__vector long long)__vzero);
858 #else
859   {
860     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
861                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
862     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
863   }
864 #endif
865   return (__m128i)__result;
866 }
867 
868 extern __inline __m64
869     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
870     _mm_cvtpd_pi32(__m128d __A) {
871   __m128i __result = _mm_cvtpd_epi32(__A);
872 
873   return (__m64)__result[0];
874 }
875 
876 extern __inline __m128
877     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878     _mm_cvtpd_ps(__m128d __A) {
879   __v4sf __result;
880   __v4si __temp;
881   const __v4si __vzero = {0, 0, 0, 0};
882 
883   __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
884 
885 #ifdef _ARCH_PWR8
886 #ifdef __LITTLE_ENDIAN__
887   __temp = vec_mergeo(__temp, __temp);
888 #else
889   __temp = vec_mergee(__temp, __temp);
890 #endif
891   __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
892                                  (__vector long long)__vzero);
893 #else
894   {
895     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
896                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
897     __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
898   }
899 #endif
900   return ((__m128)__result);
901 }
902 
903 extern __inline __m128i
904     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
905     _mm_cvttpd_epi32(__m128d __A) {
906   __v4si __result;
907   __v4si __temp;
908   const __v4si __vzero = {0, 0, 0, 0};
909 
910   /* VSX Vector truncate Double-Precision to integer and Convert to
911    Signed Integer Word format with Saturate.  */
912   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
913 
914 #ifdef _ARCH_PWR8
915 #ifdef __LITTLE_ENDIAN__
916   __temp = vec_mergeo(__temp, __temp);
917 #else
918   __temp = vec_mergee(__temp, __temp);
919 #endif
920   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
921                                  (__vector long long)__vzero);
922 #else
923   {
924     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
925                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
926     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
927   }
928 #endif
929 
930   return ((__m128i)__result);
931 }
932 
933 extern __inline __m64
934     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
935     _mm_cvttpd_pi32(__m128d __A) {
936   __m128i __result = _mm_cvttpd_epi32(__A);
937 
938   return (__m64)__result[0];
939 }
940 
941 extern __inline int
942     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
943     _mm_cvtsi128_si32(__m128i __A) {
944   return ((__v4si)__A)[0];
945 }
946 
947 #ifdef _ARCH_PWR8
948 extern __inline __m128d
949     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
950     _mm_cvtpi32_pd(__m64 __A) {
951   __v4si __temp;
952   __v2di __tmp2;
953   __v2df __result;
954 
955   __temp = (__v4si)vec_splats(__A);
956   __tmp2 = (__v2di)vec_unpackl(__temp);
957   __result = vec_ctf((__vector signed long long)__tmp2, 0);
958   return (__m128d)__result;
959 }
960 #endif
961 
962 extern __inline __m128i
963     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
964     _mm_cvtps_epi32(__m128 __A) {
965   __v4sf __rounded;
966   __v4si __result;
967 
968   __rounded = vec_rint((__v4sf)__A);
969   __result = vec_cts(__rounded, 0);
970   return (__m128i)__result;
971 }
972 
973 extern __inline __m128i
974     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975     _mm_cvttps_epi32(__m128 __A) {
976   __v4si __result;
977 
978   __result = vec_cts((__v4sf)__A, 0);
979   return (__m128i)__result;
980 }
981 
982 extern __inline __m128d
983     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984     _mm_cvtps_pd(__m128 __A) {
985   /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
986 #ifdef vec_doubleh
987   return (__m128d)vec_doubleh((__v4sf)__A);
988 #else
  /* Otherwise the compiler is not current and so we need to generate
     the equivalent code.  */
991   __v4sf __a = (__v4sf)__A;
992   __v4sf __temp;
993   __v2df __result;
994 #ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     Vector Shift Left Double by Word Immediate operations to get the
     elements lined up.  */
999   __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1000   __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1001 #else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use Vector
     Merge High Word to duplicate the elements into the required
     positions.  */
1006   __temp = vec_vmrghw(__a, __a);
1007 #endif
1008   __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1009   return (__m128d)__result;
1010 #endif
1011 }
1012 
1013 extern __inline int
1014     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015     _mm_cvtsd_si32(__m128d __A) {
1016   __v2df __rounded = vec_rint((__v2df)__A);
1017   int __result = ((__v2df)__rounded)[0];
1018 
1019   return __result;
1020 }

/* Intel intrinsic.  */
1022 extern __inline long long
1023     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1024     _mm_cvtsd_si64(__m128d __A) {
1025   __v2df __rounded = vec_rint((__v2df)__A);
1026   long long __result = ((__v2df)__rounded)[0];
1027 
1028   return __result;
1029 }
1030 
1031 /* Microsoft intrinsic.  */
1032 extern __inline long long
1033     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034     _mm_cvtsd_si64x(__m128d __A) {
1035   return _mm_cvtsd_si64((__v2df)__A);
1036 }
1037 
1038 extern __inline int
1039     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040     _mm_cvttsd_si32(__m128d __A) {
1041   int __result = ((__v2df)__A)[0];
1042 
1043   return __result;
1044 }
1045 
1046 /* Intel intrinsic.  */
1047 extern __inline long long
1048     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1049     _mm_cvttsd_si64(__m128d __A) {
1050   long long __result = ((__v2df)__A)[0];
1051 
1052   return __result;
1053 }
1054 
1055 /* Microsoft intrinsic.  */
1056 extern __inline long long
1057     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1058     _mm_cvttsd_si64x(__m128d __A) {
1059   return _mm_cvttsd_si64(__A);
1060 }
1061 
1062 extern __inline __m128
1063     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064     _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1065   __v4sf __result = (__v4sf)__A;
1066 
1067 #ifdef __LITTLE_ENDIAN__
1068   __v4sf __temp_s;
1069   /* Copy double element[0] to element [1] for conversion.  */
1070   __v2df __temp_b = vec_splat((__v2df)__B, 0);
1071 
1072   /* Pre-rotate __A left 3 (logically right 1) elements.  */
1073   __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1074   /* Convert double to single float scalar in a vector.  */
1075   __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1076   /* Shift the resulting scalar into vector element [0].  */
1077   __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1078 #else
1079   __result[0] = ((__v2df)__B)[0];
1080 #endif
1081   return (__m128)__result;
1082 }
1083 
1084 extern __inline __m128d
1085     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1086     _mm_cvtsi32_sd(__m128d __A, int __B) {
1087   __v2df __result = (__v2df)__A;
1088   double __db = __B;
1089   __result[0] = __db;
1090   return (__m128d)__result;
1091 }
1092 
1093 /* Intel intrinsic.  */
1094 extern __inline __m128d
1095     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096     _mm_cvtsi64_sd(__m128d __A, long long __B) {
1097   __v2df __result = (__v2df)__A;
1098   double __db = __B;
1099   __result[0] = __db;
1100   return (__m128d)__result;
1101 }
1102 
1103 /* Microsoft intrinsic.  */
1104 extern __inline __m128d
1105     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1106     _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1107   return _mm_cvtsi64_sd(__A, __B);
1108 }
1109 
1110 extern __inline __m128d
1111     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1112     _mm_cvtss_sd(__m128d __A, __m128 __B) {
1113 #ifdef __LITTLE_ENDIAN__
1114   /* Use splat to move element [0] into position for the convert. */
1115   __v4sf __temp = vec_splat((__v4sf)__B, 0);
1116   __v2df __res;
1117   /* Convert single float scalar to double in a vector.  */
1118   __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1119   return (__m128d)vec_mergel(__res, (__v2df)__A);
1120 #else
1121   __v2df __res = (__v2df)__A;
1122   __res[0] = ((__v4sf)__B)[0];
1123   return (__m128d)__res;
1124 #endif
1125 }
1126 
1127 extern __inline __m128d
1128     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1129     _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1130   __vector double __result;
1131   const int __litmsk = __mask & 0x3;
1132 
1133   if (__litmsk == 0)
1134     __result = vec_mergeh(__A, __B);
1135 #if __GNUC__ < 6
1136   else if (__litmsk == 1)
1137     __result = vec_xxpermdi(__B, __A, 2);
1138   else if (__litmsk == 2)
1139     __result = vec_xxpermdi(__B, __A, 1);
1140 #else
1141   else if (__litmsk == 1)
1142     __result = vec_xxpermdi(__A, __B, 2);
1143   else if (__litmsk == 2)
1144     __result = vec_xxpermdi(__A, __B, 1);
1145 #endif
1146   else
1147     __result = vec_mergel(__A, __B);
1148 
1149   return __result;
1150 }
1151 
1152 extern __inline __m128d
1153     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1154     _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1155   return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1156 }
1157 
1158 extern __inline __m128d
1159     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1160     _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1161   return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1162 }
1163 
1164 extern __inline __m128d
1165     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1166     _mm_loadh_pd(__m128d __A, double const *__B) {
1167   __v2df __result = (__v2df)__A;
1168   __result[1] = *__B;
1169   return (__m128d)__result;
1170 }
1171 
1172 extern __inline __m128d
1173     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1174     _mm_loadl_pd(__m128d __A, double const *__B) {
1175   __v2df __result = (__v2df)__A;
1176   __result[0] = *__B;
1177   return (__m128d)__result;
1178 }
1179 
1180 #ifdef _ARCH_PWR8
1181 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1182 
1183 /* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1184 extern __inline int
1185     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186     _mm_movemask_pd(__m128d __A) {
1187 #ifdef _ARCH_PWR10
1188   return vec_extractm((__v2du)__A);
1189 #else
1190   __vector unsigned long long __result;
1191   static const __vector unsigned int __perm_mask = {
1192 #ifdef __LITTLE_ENDIAN__
1193       0x80800040, 0x80808080, 0x80808080, 0x80808080
1194 #else
1195       0x80808080, 0x80808080, 0x80808080, 0x80804000
1196 #endif
1197   };
1198 
1199   __result = ((__vector unsigned long long)vec_vbpermq(
1200       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1201 
1202 #ifdef __LITTLE_ENDIAN__
1203   return __result[1];
1204 #else
1205   return __result[0];
1206 #endif
1207 #endif /* !_ARCH_PWR10 */
1208 }
1209 #endif /* _ARCH_PWR8 */
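
/* Example of the expected result (illustrative values): for
   __A = {-1.0, 2.0} the sign bit of element [0] is set and that of
   element [1] is clear, so _mm_movemask_pd returns 0x1.  */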
1210 
1211 extern __inline __m128i
1212     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1213     _mm_packs_epi16(__m128i __A, __m128i __B) {
1214   return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1215 }
1216 
1217 extern __inline __m128i
1218     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1219     _mm_packs_epi32(__m128i __A, __m128i __B) {
1220   return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1221 }
1222 
1223 extern __inline __m128i
1224     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1225     _mm_packus_epi16(__m128i __A, __m128i __B) {
1226   return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1227 }
1228 
1229 extern __inline __m128i
1230     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1231     _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1232   return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1233 }
1234 
1235 extern __inline __m128i
1236     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237     _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1238   return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1239 }
1240 
1241 extern __inline __m128i
1242     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1243     _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1244   return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1245 }
1246 
1247 extern __inline __m128i
1248     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1249     _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1250   return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1251 }
1252 
1253 extern __inline __m128i
1254     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255     _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1256   return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1257 }
1258 
1259 extern __inline __m128i
1260     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261     _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1262   return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1263 }
1264 
1265 extern __inline __m128i
1266     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1267     _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1268   return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1269 }
1270 
1271 extern __inline __m128i
1272     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1273     _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1274   return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1275 }
1276 
1277 extern __inline __m128i
1278     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1279     _mm_add_epi8(__m128i __A, __m128i __B) {
1280   return (__m128i)((__v16qu)__A + (__v16qu)__B);
1281 }
1282 
1283 extern __inline __m128i
1284     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1285     _mm_add_epi16(__m128i __A, __m128i __B) {
1286   return (__m128i)((__v8hu)__A + (__v8hu)__B);
1287 }
1288 
1289 extern __inline __m128i
1290     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1291     _mm_add_epi32(__m128i __A, __m128i __B) {
1292   return (__m128i)((__v4su)__A + (__v4su)__B);
1293 }
1294 
1295 extern __inline __m128i
1296     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297     _mm_add_epi64(__m128i __A, __m128i __B) {
1298   return (__m128i)((__v2du)__A + (__v2du)__B);
1299 }
1300 
1301 extern __inline __m128i
1302     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303     _mm_adds_epi8(__m128i __A, __m128i __B) {
1304   return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1305 }
1306 
1307 extern __inline __m128i
1308     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309     _mm_adds_epi16(__m128i __A, __m128i __B) {
1310   return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1311 }
1312 
1313 extern __inline __m128i
1314     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315     _mm_adds_epu8(__m128i __A, __m128i __B) {
1316   return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1317 }
1318 
1319 extern __inline __m128i
1320     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321     _mm_adds_epu16(__m128i __A, __m128i __B) {
1322   return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1323 }
1324 
1325 extern __inline __m128i
1326     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1327     _mm_sub_epi8(__m128i __A, __m128i __B) {
1328   return (__m128i)((__v16qu)__A - (__v16qu)__B);
1329 }
1330 
1331 extern __inline __m128i
1332     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1333     _mm_sub_epi16(__m128i __A, __m128i __B) {
1334   return (__m128i)((__v8hu)__A - (__v8hu)__B);
1335 }
1336 
1337 extern __inline __m128i
1338     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339     _mm_sub_epi32(__m128i __A, __m128i __B) {
1340   return (__m128i)((__v4su)__A - (__v4su)__B);
1341 }
1342 
1343 extern __inline __m128i
1344     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1345     _mm_sub_epi64(__m128i __A, __m128i __B) {
1346   return (__m128i)((__v2du)__A - (__v2du)__B);
1347 }
1348 
1349 extern __inline __m128i
1350     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351     _mm_subs_epi8(__m128i __A, __m128i __B) {
1352   return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1353 }
1354 
1355 extern __inline __m128i
1356     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1357     _mm_subs_epi16(__m128i __A, __m128i __B) {
1358   return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1359 }
1360 
1361 extern __inline __m128i
1362     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363     _mm_subs_epu8(__m128i __A, __m128i __B) {
1364   return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1365 }
1366 
1367 extern __inline __m128i
1368     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369     _mm_subs_epu16(__m128i __A, __m128i __B) {
1370   return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1371 }
1372 
1373 extern __inline __m128i
1374     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375     _mm_madd_epi16(__m128i __A, __m128i __B) {
1376   __vector signed int __zero = {0, 0, 0, 0};
1377 
1378   return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1379 }
1380 
1381 extern __inline __m128i
1382     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383     _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1384   __vector signed int __w0, __w1;
1385 
1386   __vector unsigned char __xform1 = {
1387 #ifdef __LITTLE_ENDIAN__
1388       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1389       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1390 #else
1391       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1392       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1393 #endif
1394   };
1395 
1396   __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1397   __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1398   return (__m128i)vec_perm(__w0, __w1, __xform1);
1399 }
1400 
1401 extern __inline __m128i
1402     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403     _mm_mullo_epi16(__m128i __A, __m128i __B) {
1404   return (__m128i)((__v8hi)__A * (__v8hi)__B);
1405 }
1406 
1407 extern __inline __m64
1408     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1409     _mm_mul_su32(__m64 __A, __m64 __B) {
1410   unsigned int __a = __A;
1411   unsigned int __b = __B;
1412 
1413   return ((__m64)__a * (__m64)__b);
1414 }
1415 
1416 #ifdef _ARCH_PWR8
1417 extern __inline __m128i
1418     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1419     _mm_mul_epu32(__m128i __A, __m128i __B) {
1420 #if __GNUC__ < 8
1421   __v2du __result;
1422 
1423 #ifdef __LITTLE_ENDIAN__
1424   /* VMX Vector Multiply Odd Unsigned Word.  */
1425   __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1426 #else
1427   /* VMX Vector Multiply Even Unsigned Word.  */
1428   __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1429 #endif
1430   return (__m128i)__result;
1431 #else
1432   return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1433 #endif
1434 }
1435 #endif
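
/* Usage sketch (illustrative values): _mm_mul_epu32 multiplies the
   even 32-bit elements ([0] and [2]) of each operand, so
   _mm_mul_epu32(_mm_set_epi32(0, 3, 0, 2), _mm_set_epi32(0, 5, 0, 7))
   yields the two unsigned 64-bit products {14, 15}.  */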
1436 
1437 extern __inline __m128i
1438     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1439     _mm_slli_epi16(__m128i __A, int __B) {
1440   __v8hu __lshift;
1441   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1442 
1443   if (__B >= 0 && __B < 16) {
1444     if (__builtin_constant_p(__B))
1445       __lshift = (__v8hu)vec_splat_s16(__B);
1446     else
1447       __lshift = vec_splats((unsigned short)__B);
1448 
1449     __result = vec_sl((__v8hi)__A, __lshift);
1450   }
1451 
1452   return (__m128i)__result;
1453 }
1454 
1455 extern __inline __m128i
1456     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1457     _mm_slli_epi32(__m128i __A, int __B) {
1458   __v4su __lshift;
1459   __v4si __result = {0, 0, 0, 0};
1460 
1461   if (__B >= 0 && __B < 32) {
1462     if (__builtin_constant_p(__B) && __B < 16)
1463       __lshift = (__v4su)vec_splat_s32(__B);
1464     else
1465       __lshift = vec_splats((unsigned int)__B);
1466 
1467     __result = vec_sl((__v4si)__A, __lshift);
1468   }
1469 
1470   return (__m128i)__result;
1471 }
1472 
1473 #ifdef _ARCH_PWR8
1474 extern __inline __m128i
1475     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1476     _mm_slli_epi64(__m128i __A, int __B) {
1477   __v2du __lshift;
1478   __v2di __result = {0, 0};
1479 
1480   if (__B >= 0 && __B < 64) {
1481     if (__builtin_constant_p(__B) && __B < 16)
1482       __lshift = (__v2du)vec_splat_s32(__B);
1483     else
1484       __lshift = (__v2du)vec_splats((unsigned int)__B);
1485 
1486     __result = vec_sl((__v2di)__A, __lshift);
1487   }
1488 
1489   return (__m128i)__result;
1490 }
1491 #endif
1492 
1493 extern __inline __m128i
1494     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1495     _mm_srai_epi16(__m128i __A, int __B) {
1496   __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1497   __v8hi __result;
1498 
1499   if (__B < 16) {
1500     if (__builtin_constant_p(__B))
1501       __rshift = (__v8hu)vec_splat_s16(__B);
1502     else
1503       __rshift = vec_splats((unsigned short)__B);
1504   }
1505   __result = vec_sra((__v8hi)__A, __rshift);
1506 
1507   return (__m128i)__result;
1508 }
1509 
1510 extern __inline __m128i
1511     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1512     _mm_srai_epi32(__m128i __A, int __B) {
1513   __v4su __rshift = {31, 31, 31, 31};
1514   __v4si __result;
1515 
1516   if (__B < 32) {
1517     if (__builtin_constant_p(__B)) {
1518       if (__B < 16)
1519         __rshift = (__v4su)vec_splat_s32(__B);
1520       else
1521         __rshift = (__v4su)vec_splats((unsigned int)__B);
1522     } else
1523       __rshift = vec_splats((unsigned int)__B);
1524   }
1525   __result = vec_sra((__v4si)__A, __rshift);
1526 
1527   return (__m128i)__result;
1528 }
1529 
1530 extern __inline __m128i
1531     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1532     _mm_bslli_si128(__m128i __A, const int __N) {
1533   __v16qu __result;
1534   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1535 
1536   if (__N < 16)
1537     __result = vec_sld((__v16qu)__A, __zeros, __N);
1538   else
1539     __result = __zeros;
1540 
1541   return (__m128i)__result;
1542 }
1543 
1544 extern __inline __m128i
1545     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1546     _mm_bsrli_si128(__m128i __A, const int __N) {
1547   __v16qu __result;
1548   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1549 
1550   if (__N < 16)
1551 #ifdef __LITTLE_ENDIAN__
1552     if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         loading the __N * 8 value into a separate VR.  */
1556       __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1557     else
1558 #endif
1559     {
1560       __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1561 #ifdef __LITTLE_ENDIAN__
1562       __result = vec_sro((__v16qu)__A, __shift);
1563 #else
      __result = vec_slo((__v16qu)__A, __shift);
1565 #endif
1566     }
1567   else
1568     __result = __zeros;
1569 
1570   return (__m128i)__result;
1571 }
1572 
1573 extern __inline __m128i
1574     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575     _mm_srli_si128(__m128i __A, const int __N) {
1576   return _mm_bsrli_si128(__A, __N);
1577 }
1578 
1579 extern __inline __m128i
1580     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1581     _mm_slli_si128(__m128i __A, const int _imm5) {
1582   __v16qu __result;
1583   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1584 
1585   if (_imm5 < 16)
1586 #ifdef __LITTLE_ENDIAN__
1587     __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1588 #else
1589     __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1590 #endif
1591   else
1592     __result = __zeros;
1593 
1594   return (__m128i)__result;
1595 }
1596 
1597 extern __inline __m128i
1598     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi16(__m128i __A, int __B) {
1601   __v8hu __rshift;
1602   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1603 
1604   if (__B < 16) {
1605     if (__builtin_constant_p(__B))
1606       __rshift = (__v8hu)vec_splat_s16(__B);
1607     else
1608       __rshift = vec_splats((unsigned short)__B);
1609 
1610     __result = vec_sr((__v8hi)__A, __rshift);
1611   }
1612 
1613   return (__m128i)__result;
1614 }
1615 
1616 extern __inline __m128i
1617     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1618     _mm_srli_epi32(__m128i __A, int __B) {
1619   __v4su __rshift;
1620   __v4si __result = {0, 0, 0, 0};
1621 
1622   if (__B < 32) {
1623     if (__builtin_constant_p(__B)) {
1624       if (__B < 16)
1625         __rshift = (__v4su)vec_splat_s32(__B);
1626       else
1627         __rshift = (__v4su)vec_splats((unsigned int)__B);
1628     } else
1629       __rshift = vec_splats((unsigned int)__B);
1630 
1631     __result = vec_sr((__v4si)__A, __rshift);
1632   }
1633 
1634   return (__m128i)__result;
1635 }
1636 
1637 #ifdef _ARCH_PWR8
1638 extern __inline __m128i
1639     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1640     _mm_srli_epi64(__m128i __A, int __B) {
1641   __v2du __rshift;
1642   __v2di __result = {0, 0};
1643 
1644   if (__B < 64) {
1645     if (__builtin_constant_p(__B)) {
1646       if (__B < 16)
1647         __rshift = (__v2du)vec_splat_s32(__B);
1648       else
1649         __rshift = (__v2du)vec_splats((unsigned long long)__B);
1650     } else
1651       __rshift = (__v2du)vec_splats((unsigned int)__B);
1652 
1653     __result = vec_sr((__v2di)__A, __rshift);
1654   }
1655 
1656   return (__m128i)__result;
1657 }
1658 #endif
1659 
1660 extern __inline __m128i
1661     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1662     _mm_sll_epi16(__m128i __A, __m128i __B) {
1663   __v8hu __lshift;
1664   __vector __bool short __shmask;
1665   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1666   __v8hu __result;
1667 
1668 #ifdef __LITTLE_ENDIAN__
1669   __lshift = vec_splat((__v8hu)__B, 0);
1670 #else
1671   __lshift = vec_splat((__v8hu)__B, 3);
1672 #endif
1673   __shmask = vec_cmple(__lshift, __shmax);
1674   __result = vec_sl((__v8hu)__A, __lshift);
1675   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1676 
1677   return (__m128i)__result;
1678 }
1679 
1680 extern __inline __m128i
1681     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1682     _mm_sll_epi32(__m128i __A, __m128i __B) {
1683   __v4su __lshift;
1684   __vector __bool int __shmask;
1685   const __v4su __shmax = {32, 32, 32, 32};
1686   __v4su __result;
1687 #ifdef __LITTLE_ENDIAN__
1688   __lshift = vec_splat((__v4su)__B, 0);
1689 #else
1690   __lshift = vec_splat((__v4su)__B, 1);
1691 #endif
1692   __shmask = vec_cmplt(__lshift, __shmax);
1693   __result = vec_sl((__v4su)__A, __lshift);
1694   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1695 
1696   return (__m128i)__result;
1697 }
1698 
1699 #ifdef _ARCH_PWR8
1700 extern __inline __m128i
1701     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702     _mm_sll_epi64(__m128i __A, __m128i __B) {
1703   __v2du __lshift;
1704   __vector __bool long long __shmask;
1705   const __v2du __shmax = {64, 64};
1706   __v2du __result;
1707 
1708   __lshift = vec_splat((__v2du)__B, 0);
1709   __shmask = vec_cmplt(__lshift, __shmax);
1710   __result = vec_sl((__v2du)__A, __lshift);
1711   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1712 
1713   return (__m128i)__result;
1714 }
1715 #endif
1716 
1717 extern __inline __m128i
1718     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1719     _mm_sra_epi16(__m128i __A, __m128i __B) {
1720   const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1721   __v8hu __rshift;
1722   __v8hi __result;
1723 
1724 #ifdef __LITTLE_ENDIAN__
1725   __rshift = vec_splat((__v8hu)__B, 0);
1726 #else
1727   __rshift = vec_splat((__v8hu)__B, 3);
1728 #endif
1729   __rshift = vec_min(__rshift, __rshmax);
1730   __result = vec_sra((__v8hi)__A, __rshift);
1731 
1732   return (__m128i)__result;
1733 }
1734 
1735 extern __inline __m128i
1736     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1737     _mm_sra_epi32(__m128i __A, __m128i __B) {
1738   const __v4su __rshmax = {31, 31, 31, 31};
1739   __v4su __rshift;
1740   __v4si __result;
1741 
1742 #ifdef __LITTLE_ENDIAN__
1743   __rshift = vec_splat((__v4su)__B, 0);
1744 #else
1745   __rshift = vec_splat((__v4su)__B, 1);
1746 #endif
1747   __rshift = vec_min(__rshift, __rshmax);
1748   __result = vec_sra((__v4si)__A, __rshift);
1749 
1750   return (__m128i)__result;
1751 }
1752 
1753 extern __inline __m128i
1754     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1755     _mm_srl_epi16(__m128i __A, __m128i __B) {
1756   __v8hu __rshift;
1757   __vector __bool short __shmask;
1758   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1759   __v8hu __result;
1760 
1761 #ifdef __LITTLE_ENDIAN__
1762   __rshift = vec_splat((__v8hu)__B, 0);
1763 #else
1764   __rshift = vec_splat((__v8hu)__B, 3);
1765 #endif
1766   __shmask = vec_cmple(__rshift, __shmax);
1767   __result = vec_sr((__v8hu)__A, __rshift);
1768   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1769 
1770   return (__m128i)__result;
1771 }
1772 
1773 extern __inline __m128i
1774     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1775     _mm_srl_epi32(__m128i __A, __m128i __B) {
1776   __v4su __rshift;
1777   __vector __bool int __shmask;
1778   const __v4su __shmax = {32, 32, 32, 32};
1779   __v4su __result;
1780 
1781 #ifdef __LITTLE_ENDIAN__
1782   __rshift = vec_splat((__v4su)__B, 0);
1783 #else
1784   __rshift = vec_splat((__v4su)__B, 1);
1785 #endif
1786   __shmask = vec_cmplt(__rshift, __shmax);
1787   __result = vec_sr((__v4su)__A, __rshift);
1788   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1789 
1790   return (__m128i)__result;
1791 }
1792 
1793 #ifdef _ARCH_PWR8
1794 extern __inline __m128i
1795     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1796     _mm_srl_epi64(__m128i __A, __m128i __B) {
1797   __v2du __rshift;
1798   __vector __bool long long __shmask;
1799   const __v2du __shmax = {64, 64};
1800   __v2du __result;
1801 
1802   __rshift = vec_splat((__v2du)__B, 0);
1803   __shmask = vec_cmplt(__rshift, __shmax);
1804   __result = vec_sr((__v2du)__A, __rshift);
1805   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1806 
1807   return (__m128i)__result;
1808 }
1809 #endif
1810 
1811 extern __inline __m128d
1812     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813     _mm_and_pd(__m128d __A, __m128d __B) {
1814   return (vec_and((__v2df)__A, (__v2df)__B));
1815 }
1816 
1817 extern __inline __m128d
1818     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1819     _mm_andnot_pd(__m128d __A, __m128d __B) {
1820   return (vec_andc((__v2df)__B, (__v2df)__A));
1821 }
1822 
1823 extern __inline __m128d
1824     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1825     _mm_or_pd(__m128d __A, __m128d __B) {
1826   return (vec_or((__v2df)__A, (__v2df)__B));
1827 }
1828 
1829 extern __inline __m128d
1830     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1831     _mm_xor_pd(__m128d __A, __m128d __B) {
1832   return (vec_xor((__v2df)__A, (__v2df)__B));
1833 }
1834 
1835 extern __inline __m128i
1836     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1837     _mm_and_si128(__m128i __A, __m128i __B) {
1838   return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1839 }
1840 
1841 extern __inline __m128i
1842     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1843     _mm_andnot_si128(__m128i __A, __m128i __B) {
1844   return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1845 }
1846 
1847 extern __inline __m128i
1848     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1849     _mm_or_si128(__m128i __A, __m128i __B) {
1850   return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1851 }
1852 
1853 extern __inline __m128i
1854     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1855     _mm_xor_si128(__m128i __A, __m128i __B) {
1856   return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1857 }
1858 
1859 extern __inline __m128i
1860     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1861     _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1862   return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1863 }
1864 
1865 extern __inline __m128i
1866     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1867     _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1868   return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1869 }
1870 
1871 extern __inline __m128i
1872     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1873     _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1874   return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1875 }
1876 
1877 extern __inline __m128i
1878     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1879     _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1880   return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1881 }
1882 
1883 extern __inline __m128i
1884     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1885     _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1886   return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1887 }
1888 
1889 extern __inline __m128i
1890     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1891     _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1892   return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1893 }
1894 
1895 extern __inline __m128i
1896     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1897     _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1898   return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1899 }
1900 
1901 extern __inline __m128i
1902     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1903     _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1904   return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1905 }
1906 
1907 extern __inline __m128i
1908     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1909     _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1910   return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1911 }
1912 
1913 extern __inline int
1914     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1915     _mm_extract_epi16(__m128i const __A, int const __N) {
1916   return (unsigned short)((__v8hi)__A)[__N & 7];
1917 }
1918 
1919 extern __inline __m128i
1920     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1921     _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1922   __v8hi __result = (__v8hi)__A;
1923 
1924   __result[(__N & 7)] = __D;
1925 
1926   return (__m128i)__result;
1927 }
1928 
1929 extern __inline __m128i
1930     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1931     _mm_max_epi16(__m128i __A, __m128i __B) {
1932   return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1933 }
1934 
1935 extern __inline __m128i
1936     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1937     _mm_max_epu8(__m128i __A, __m128i __B) {
1938   return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1939 }
1940 
1941 extern __inline __m128i
1942     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1943     _mm_min_epi16(__m128i __A, __m128i __B) {
1944   return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1945 }
1946 
1947 extern __inline __m128i
1948     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1949     _mm_min_epu8(__m128i __A, __m128i __B) {
1950   return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1951 }
1952 
1953 #ifdef _ARCH_PWR8
1954 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1955 
1956 /* Return a mask created from the most significant bit of each 8-bit
1957    element in A.  */
1958 extern __inline int
1959     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1960     _mm_movemask_epi8(__m128i __A) {
1961 #ifdef _ARCH_PWR10
1962   return vec_extractm((__v16qu)__A);
1963 #else
1964   __vector unsigned long long __result;
1965   static const __vector unsigned char __perm_mask = {
1966       0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1967       0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1968 
1969   __result = ((__vector unsigned long long)vec_vbpermq(
1970       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1971 
1972 #ifdef __LITTLE_ENDIAN__
1973   return __result[1];
1974 #else
1975   return __result[0];
1976 #endif
1977 #endif /* !_ARCH_PWR10 */
1978 }
1979 #endif /* _ARCH_PWR8 */
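
/* Illustrative usage sketch: both the POWER10 vec_extractm path and the
   vec_vbpermq bit-gather path above collect the most significant bit of
   each byte into a 16-bit mask, matching the SSE2 semantics.  Assuming
   _mm_set1_epi8 from earlier in this header:

     _mm_movemask_epi8(_mm_set1_epi8(0))  == 0x0000
     _mm_movemask_epi8(_mm_set1_epi8(-1)) == 0xFFFF
*/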
1980 
1981 extern __inline __m128i
1982     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1983     _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1984   __v4su __w0, __w1;
1985   __v16qu __xform1 = {
1986 #ifdef __LITTLE_ENDIAN__
1987       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1988       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1989 #else
1990       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1991       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1992 #endif
1993   };
1994 
1995   __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1996   __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1997   return (__m128i)vec_perm(__w0, __w1, __xform1);
1998 }
1999 
2000 extern __inline __m128i
2001     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2002     _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2003   unsigned long __element_selector_98 = __mask & 0x03;
2004   unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2005   unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2006   unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2007   static const unsigned short __permute_selectors[4] = {
2008 #ifdef __LITTLE_ENDIAN__
2009       0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2010 #else
2011       0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2012 #endif
2013   };
2014   __v2du __pmask =
2015 #ifdef __LITTLE_ENDIAN__
2016       {0x1716151413121110UL, 0UL};
2017 #else
2018       {0x1011121314151617UL, 0UL};
2019 #endif
2020   __m64_union __t;
2021   __v2du __a, __r;
2022 
2023   __t.as_short[0] = __permute_selectors[__element_selector_98];
2024   __t.as_short[1] = __permute_selectors[__element_selector_BA];
2025   __t.as_short[2] = __permute_selectors[__element_selector_DC];
2026   __t.as_short[3] = __permute_selectors[__element_selector_FE];
2027   __pmask[1] = __t.as_m64;
2028   __a = (__v2du)__A;
2029   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2030   return (__m128i)__r;
2031 }
2032 
2033 extern __inline __m128i
2034     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2035     _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2036   unsigned long __element_selector_10 = __mask & 0x03;
2037   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2038   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2039   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2040   static const unsigned short __permute_selectors[4] = {
2041 #ifdef __LITTLE_ENDIAN__
2042       0x0100, 0x0302, 0x0504, 0x0706
2043 #else
2044       0x0001, 0x0203, 0x0405, 0x0607
2045 #endif
2046   };
2047   __v2du __pmask =
2048 #ifdef __LITTLE_ENDIAN__
2049       {0UL, 0x1f1e1d1c1b1a1918UL};
2050 #else
2051       {0UL, 0x18191a1b1c1d1e1fUL};
2052 #endif
2053   __m64_union __t;
2054   __v2du __a, __r;
2055   __t.as_short[0] = __permute_selectors[__element_selector_10];
2056   __t.as_short[1] = __permute_selectors[__element_selector_32];
2057   __t.as_short[2] = __permute_selectors[__element_selector_54];
2058   __t.as_short[3] = __permute_selectors[__element_selector_76];
2059   __pmask[0] = __t.as_m64;
2060   __a = (__v2du)__A;
2061   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2062   return (__m128i)__r;
2063 }
2064 
2065 extern __inline __m128i
2066     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2067     _mm_shuffle_epi32(__m128i __A, const int __mask) {
2068   unsigned long __element_selector_10 = __mask & 0x03;
2069   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2070   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2071   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2072   static const unsigned int __permute_selectors[4] = {
2073 #ifdef __LITTLE_ENDIAN__
2074       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2075 #else
2076       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2077 #endif
2078   };
2079   __v4su __t;
2080 
2081   __t[0] = __permute_selectors[__element_selector_10];
2082   __t[1] = __permute_selectors[__element_selector_32];
2083   __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2084   __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2085   return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2086                            (__vector unsigned char)__t);
2087 }
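
/* Illustrative usage sketch: each two-bit field of __mask selects one
   source element, from the low result element upward, so the _MM_SHUFFLE
   macro from <xmmintrin.h> works as on x86.  Assuming _mm_set_epi32 from
   earlier in this header:

     __m128i __v = _mm_set_epi32(3, 2, 1, 0);
     __m128i __r = _mm_shuffle_epi32(__v, _MM_SHUFFLE(0, 1, 2, 3));

   reverses the elements, giving {3, 2, 1, 0} from low to high.
   _mm_shufflelo_epi16 and _mm_shufflehi_epi16 above apply the same mask
   encoding to the low and high four halfwords, respectively.  */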
2088 
2089 extern __inline void
2090     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2091     _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2092   __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2093   __v16qu __mask, __tmp;
2094   __m128i_u *__p = (__m128i_u *)__C;
2095 
2096   __tmp = (__v16qu)_mm_loadu_si128(__p);
2097   __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2098   __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2099   _mm_storeu_si128(__p, (__m128i)__tmp);
2100 }
2101 
2102 extern __inline __m128i
2103     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2104     _mm_avg_epu8(__m128i __A, __m128i __B) {
2105   return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2106 }
2107 
2108 extern __inline __m128i
2109     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2110     _mm_avg_epu16(__m128i __A, __m128i __B) {
2111   return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2112 }
2113 
2114 extern __inline __m128i
2115     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2116     _mm_sad_epu8(__m128i __A, __m128i __B) {
2117   __v16qu __a, __b;
2118   __v16qu __vabsdiff;
2119   __v4si __vsum;
2120   const __v4su __zero = {0, 0, 0, 0};
2121   __v4si __result;
2122 
2123   __a = (__v16qu)__A;
2124   __b = (__v16qu)__B;
2125 #ifndef _ARCH_PWR9
2126   __v16qu __vmin = vec_min(__a, __b);
2127   __v16qu __vmax = vec_max(__a, __b);
2128   __vabsdiff = vec_sub(__vmax, __vmin);
2129 #else
2130   __vabsdiff = vec_absd(__a, __b);
2131 #endif
2132   /* Sum four groups of bytes into integers.  */
2133   __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2134 #ifdef __LITTLE_ENDIAN__
2135   /* Sum across four integers with two integer results.  */
2136   __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian it inserts
     vector shifts that this use case does not need.  That shift would move
     the 32-bit partial sums from elements [0] and [2] to [1] and [3], only
     for them to be moved back again, since the desired results are the two
     64-bit values [1]|[0] and [3]|[2].  Thus, no shift is performed.  */
2143 #else
2144   /* Sum across four integers with two integer results.  */
2145   __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2146   /* Rotate the sums into the correct position.  */
2147   __result = vec_sld(__result, __result, 6);
2148 #endif
2149   return (__m128i)__result;
2150 }
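
/* Illustrative note: the result holds two sums of absolute byte
   differences, one for each 8-byte half of the inputs, zero-extended into
   the two 64-bit elements.  Assuming _mm_set1_epi8 from earlier in this
   header:

     __m128i __s = _mm_sad_epu8(_mm_set1_epi8(3), _mm_set1_epi8(1));

   leaves 16 (eight bytes times |3 - 1|) in each 64-bit element of __s.  */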
2151 
2152 extern __inline void
2153     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2154     _mm_stream_si32(int *__A, int __B) {
2155   /* Use the data cache block touch for store transient.  */
2156   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2157   *__A = __B;
2158 }
2159 
2160 extern __inline void
2161     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2162     _mm_stream_si64(long long int *__A, long long int __B) {
2163   /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2165   *__A = __B;
2166 }
2167 
2168 extern __inline void
2169     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2170     _mm_stream_si128(__m128i *__A, __m128i __B) {
2171   /* Use the data cache block touch for store transient.  */
2172   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2173   *__A = __B;
2174 }
2175 
2176 extern __inline void
2177     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2178     _mm_stream_pd(double *__A, __m128d __B) {
2179   /* Use the data cache block touch for store transient.  */
2180   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2181   *(__m128d *)__A = __B;
2182 }
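
/* Illustrative note: PowerISA has no direct equivalent of the x86
   non-temporal stores, so the _mm_stream_* wrappers above issue a dcbtstt
   cache hint and then perform an ordinary store.  Code ported from x86 can
   keep its usual pattern (the names __dst and __data are placeholders):

     _mm_stream_si128((__m128i *)__dst, __data);
     _mm_sfence();

   with _mm_sfence provided by <xmmintrin.h>.  */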
2183 
2184 extern __inline void
2185     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2186     _mm_clflush(void const *__A) {
2187   /* Use the data cache block flush.  */
2188   __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2189 }
2190 
2191 extern __inline void
2192     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2193     _mm_lfence(void) {
2194   /* Use light weight sync for load to load ordering.  */
2195   __atomic_thread_fence(__ATOMIC_RELEASE);
2196 }
2197 
2198 extern __inline void
2199     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2200     _mm_mfence(void) {
2201   /* Use heavy weight sync for any to any ordering.  */
2202   __atomic_thread_fence(__ATOMIC_SEQ_CST);
2203 }
2204 
2205 extern __inline __m128i
2206     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2207     _mm_cvtsi32_si128(int __A) {
2208   return _mm_set_epi32(0, 0, 0, __A);
2209 }
2210 
2211 extern __inline __m128i
2212     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2213     _mm_cvtsi64_si128(long long __A) {
2214   return __extension__(__m128i)(__v2di){__A, 0LL};
2215 }
2216 
2217 /* Microsoft intrinsic.  */
2218 extern __inline __m128i
2219     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2220     _mm_cvtsi64x_si128(long long __A) {
2221   return __extension__(__m128i)(__v2di){__A, 0LL};
2222 }
2223 
2224 /* Casts between various SP, DP, INT vector types.  Note that these do no
2225    conversion of values, they just change the type.  */
2226 extern __inline __m128
2227     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2228     _mm_castpd_ps(__m128d __A) {
2229   return (__m128)__A;
2230 }
2231 
2232 extern __inline __m128i
2233     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2234     _mm_castpd_si128(__m128d __A) {
2235   return (__m128i)__A;
2236 }
2237 
2238 extern __inline __m128d
2239     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2240     _mm_castps_pd(__m128 __A) {
2241   return (__m128d)__A;
2242 }
2243 
2244 extern __inline __m128i
2245     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2246     _mm_castps_si128(__m128 __A) {
2247   return (__m128i)__A;
2248 }
2249 
2250 extern __inline __m128
2251     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2252     _mm_castsi128_ps(__m128i __A) {
2253   return (__m128)__A;
2254 }
2255 
2256 extern __inline __m128d
2257     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2258     _mm_castsi128_pd(__m128i __A) {
2259   return (__m128d)__A;
2260 }
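
/* Illustrative note: the casts above only reinterpret the 128-bit value;
   the bits are unchanged.  Contrast with the value-converting intrinsics,
   assuming _mm_set1_ps from <xmmintrin.h> and _mm_cvtps_epi32 from earlier
   in this header:

     _mm_castps_si128(_mm_set1_ps(1.0f))   32-bit elements hold 0x3F800000
     _mm_cvtps_epi32(_mm_set1_ps(1.0f))    32-bit elements hold 1
*/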
2261 
2262 #else
2263 #include_next <emmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) */
2266 
2267 #endif /* EMMINTRIN_H_ */
2268