1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15    explicitly from x86_64 to powerpc64/powerpc64le.
16 
   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are
   differences in the data format and placement of double scalars in the
   vector register, which require extra steps to match SSE2 scalar
   double semantics on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers.  It's recommended
   to use portable <fenv.h> instead of accessing the MXCSR directly.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
31 */
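
/* For example, code that adjusts the rounding mode or tests exception
   flags through the MXCSR can usually do so portably with the standard
   C99 <fenv.h> interfaces, which map onto the PowerISA FPSCR.  A rough
   sketch (handle_invalid() is a hypothetical user function):

     #include <fenv.h>
     #pragma STDC FENV_ACCESS ON

     fesetround (FE_TOWARDZERO);     // select round-toward-zero
     feclearexcept (FE_ALL_EXCEPT);  // clear the sticky exception flags
     ...                             // floating-point work
     if (fetestexcept (FE_INVALID))  // did an invalid operation occur?
       handle_invalid ();
*/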
32 #error                                                                         \
33     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
34 #endif
35 
36 #ifndef EMMINTRIN_H_
37 #define EMMINTRIN_H_
38 
39 #if defined(__ppc64__) &&                                                      \
40     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
41 
42 #include <altivec.h>
43 
44 /* We need definitions from the SSE header files.  */
45 #include <xmmintrin.h>
46 
47 /* SSE2 */
48 typedef __vector double __v2df;
49 typedef __vector long long __v2di;
50 typedef __vector unsigned long long __v2du;
51 typedef __vector int __v4si;
52 typedef __vector unsigned int __v4su;
53 typedef __vector short __v8hi;
54 typedef __vector unsigned short __v8hu;
55 typedef __vector signed char __v16qi;
56 typedef __vector unsigned char __v16qu;
57 
58 /* The Intel API is flexible enough that we must allow aliasing with other
59    vector types, and their scalar components.  */
60 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
61 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
62 
63 /* Unaligned version of the same types.  */
64 typedef long long __m128i_u
65     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
66 typedef double __m128d_u
67     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
68 
69 /* Define two value permute mask.  */
70 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
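
/* For example, _MM_SHUFFLE2 (1, 0) is 0x2.  When used with _mm_shuffle_pd
   below, the low mask bit selects which element of the first operand
   becomes element 0 of the result, and the high mask bit selects which
   element of the second operand becomes element 1.  */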
71 
72 /* Create a vector with element 0 as F and the rest zero.  */
73 extern __inline __m128d
74     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75     _mm_set_sd(double __F) {
76   return __extension__(__m128d){__F, 0.0};
77 }
78 
79 /* Create a vector with both elements equal to F.  */
80 extern __inline __m128d
81     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
82     _mm_set1_pd(double __F) {
83   return __extension__(__m128d){__F, __F};
84 }
85 
86 extern __inline __m128d
87     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88     _mm_set_pd1(double __F) {
89   return _mm_set1_pd(__F);
90 }
91 
92 /* Create a vector with the lower value X and upper value W.  */
93 extern __inline __m128d
94     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95     _mm_set_pd(double __W, double __X) {
96   return __extension__(__m128d){__X, __W};
97 }
98 
99 /* Create a vector with the lower value W and upper value X.  */
100 extern __inline __m128d
101     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102     _mm_setr_pd(double __W, double __X) {
103   return __extension__(__m128d){__W, __X};
104 }
105 
106 /* Create an undefined vector.  */
107 extern __inline __m128d
108     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
109     _mm_undefined_pd(void) {
110   __m128d __Y = __Y;
111   return __Y;
112 }
113 
114 /* Create a vector of zeros.  */
115 extern __inline __m128d
116     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117     _mm_setzero_pd(void) {
118   return (__m128d)vec_splats(0);
119 }
120 
121 /* Sets the low DPFP value of A from the low value of B.  */
122 extern __inline __m128d
123     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124     _mm_move_sd(__m128d __A, __m128d __B) {
125   __v2df __result = (__v2df)__A;
126   __result[0] = ((__v2df)__B)[0];
127   return (__m128d)__result;
128 }
129 
130 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
131 extern __inline __m128d
132     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133     _mm_load_pd(double const *__P) {
134   return ((__m128d)vec_ld(0, (__v16qu *)__P));
135 }
136 
137 /* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
138 extern __inline __m128d
139     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140     _mm_loadu_pd(double const *__P) {
141   return (vec_vsx_ld(0, __P));
142 }
143 
/* Create a vector with both elements equal to *P.  */
145 extern __inline __m128d
146     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147     _mm_load1_pd(double const *__P) {
148   return (vec_splats(*__P));
149 }
150 
151 /* Create a vector with element 0 as *P and the rest zero.  */
152 extern __inline __m128d
153     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154     _mm_load_sd(double const *__P) {
155   return _mm_set_sd(*__P);
156 }
157 
158 extern __inline __m128d
159     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160     _mm_load_pd1(double const *__P) {
161   return _mm_load1_pd(__P);
162 }
163 
164 /* Load two DPFP values in reverse order.  The address must be aligned.  */
165 extern __inline __m128d
166     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167     _mm_loadr_pd(double const *__P) {
168   __v2df __tmp = _mm_load_pd(__P);
169   return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
170 }
171 
172 /* Store two DPFP values.  The address must be 16-byte aligned.  */
173 extern __inline void
174     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175     _mm_store_pd(double *__P, __m128d __A) {
176   vec_st((__v16qu)__A, 0, (__v16qu *)__P);
177 }
178 
179 /* Store two DPFP values.  The address need not be 16-byte aligned.  */
180 extern __inline void
181     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182     _mm_storeu_pd(double *__P, __m128d __A) {
183   *(__m128d_u *)__P = __A;
184 }
185 
186 /* Stores the lower DPFP value.  */
187 extern __inline void
188     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189     _mm_store_sd(double *__P, __m128d __A) {
190   *__P = ((__v2df)__A)[0];
191 }
192 
193 extern __inline double
194     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
195     _mm_cvtsd_f64(__m128d __A) {
196   return ((__v2df)__A)[0];
197 }
198 
199 extern __inline void
200     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201     _mm_storel_pd(double *__P, __m128d __A) {
202   _mm_store_sd(__P, __A);
203 }
204 
205 /* Stores the upper DPFP value.  */
206 extern __inline void
207     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208     _mm_storeh_pd(double *__P, __m128d __A) {
209   *__P = ((__v2df)__A)[1];
210 }

/* Store the lower DPFP value across two words.
212    The address must be 16-byte aligned.  */
213 extern __inline void
214     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215     _mm_store1_pd(double *__P, __m128d __A) {
216   _mm_store_pd(__P, vec_splat(__A, 0));
217 }
218 
219 extern __inline void
220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221     _mm_store_pd1(double *__P, __m128d __A) {
222   _mm_store1_pd(__P, __A);
223 }
224 
225 /* Store two DPFP values in reverse order.  The address must be aligned.  */
226 extern __inline void
227     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228     _mm_storer_pd(double *__P, __m128d __A) {
229   _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
230 }
231 
232 /* Intel intrinsic.  */
233 extern __inline long long
234     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235     _mm_cvtsi128_si64(__m128i __A) {
236   return ((__v2di)__A)[0];
237 }
238 
239 /* Microsoft intrinsic.  */
240 extern __inline long long
241     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
242     _mm_cvtsi128_si64x(__m128i __A) {
243   return ((__v2di)__A)[0];
244 }
245 
246 extern __inline __m128d
247     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248     _mm_add_pd(__m128d __A, __m128d __B) {
249   return (__m128d)((__v2df)__A + (__v2df)__B);
250 }
251 
252 /* Add the lower double-precision (64-bit) floating-point element in
253    a and b, store the result in the lower element of dst, and copy
254    the upper element from a to the upper element of dst. */
255 extern __inline __m128d
256     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257     _mm_add_sd(__m128d __A, __m128d __B) {
258   __A[0] = __A[0] + __B[0];
259   return (__A);
260 }
261 
262 extern __inline __m128d
263     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
264     _mm_sub_pd(__m128d __A, __m128d __B) {
265   return (__m128d)((__v2df)__A - (__v2df)__B);
266 }
267 
268 extern __inline __m128d
269     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270     _mm_sub_sd(__m128d __A, __m128d __B) {
271   __A[0] = __A[0] - __B[0];
272   return (__A);
273 }
274 
275 extern __inline __m128d
276     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
277     _mm_mul_pd(__m128d __A, __m128d __B) {
278   return (__m128d)((__v2df)__A * (__v2df)__B);
279 }
280 
281 extern __inline __m128d
282     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283     _mm_mul_sd(__m128d __A, __m128d __B) {
284   __A[0] = __A[0] * __B[0];
285   return (__A);
286 }
287 
288 extern __inline __m128d
289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290     _mm_div_pd(__m128d __A, __m128d __B) {
291   return (__m128d)((__v2df)__A / (__v2df)__B);
292 }
293 
294 extern __inline __m128d
295     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296     _mm_div_sd(__m128d __A, __m128d __B) {
297   __A[0] = __A[0] / __B[0];
298   return (__A);
299 }
300 
301 extern __inline __m128d
302     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303     _mm_sqrt_pd(__m128d __A) {
304   return (vec_sqrt(__A));
305 }
306 
307 /* Return pair {sqrt (B[0]), A[1]}.  */
308 extern __inline __m128d
309     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310     _mm_sqrt_sd(__m128d __A, __m128d __B) {
311   __v2df __c;
312   __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
313   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
314 }
315 
316 extern __inline __m128d
317     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
318     _mm_min_pd(__m128d __A, __m128d __B) {
319   return (vec_min(__A, __B));
320 }
321 
322 extern __inline __m128d
323     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
324     _mm_min_sd(__m128d __A, __m128d __B) {
325   __v2df __a, __b, __c;
326   __a = vec_splats(__A[0]);
327   __b = vec_splats(__B[0]);
328   __c = vec_min(__a, __b);
329   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
330 }
331 
332 extern __inline __m128d
333     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
334     _mm_max_pd(__m128d __A, __m128d __B) {
335   return (vec_max(__A, __B));
336 }
337 
338 extern __inline __m128d
339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
340     _mm_max_sd(__m128d __A, __m128d __B) {
341   __v2df __a, __b, __c;
342   __a = vec_splats(__A[0]);
343   __b = vec_splats(__B[0]);
344   __c = vec_max(__a, __b);
345   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
346 }
347 
348 extern __inline __m128d
349     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350     _mm_cmpeq_pd(__m128d __A, __m128d __B) {
351   return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
352 }
353 
354 extern __inline __m128d
355     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356     _mm_cmplt_pd(__m128d __A, __m128d __B) {
357   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
358 }
359 
360 extern __inline __m128d
361     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362     _mm_cmple_pd(__m128d __A, __m128d __B) {
363   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
364 }
365 
366 extern __inline __m128d
367     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368     _mm_cmpgt_pd(__m128d __A, __m128d __B) {
369   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
370 }
371 
372 extern __inline __m128d
373     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
374     _mm_cmpge_pd(__m128d __A, __m128d __B) {
375   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
376 }
377 
378 extern __inline __m128d
379     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380     _mm_cmpneq_pd(__m128d __A, __m128d __B) {
381   __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
382   return ((__m128d)vec_nor(__temp, __temp));
383 }
384 
385 extern __inline __m128d
386     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387     _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
388   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
389 }
390 
391 extern __inline __m128d
392     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
393     _mm_cmpnle_pd(__m128d __A, __m128d __B) {
394   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
395 }
396 
397 extern __inline __m128d
398     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
399     _mm_cmpngt_pd(__m128d __A, __m128d __B) {
400   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
401 }
402 
403 extern __inline __m128d
404     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
405     _mm_cmpnge_pd(__m128d __A, __m128d __B) {
406   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
407 }
408 
409 extern __inline __m128d
410     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
411     _mm_cmpord_pd(__m128d __A, __m128d __B) {
412   __v2du __c, __d;
  /* Comparing a value with itself returns false (0's) if it is a NaN.  */
414   __c = (__v2du)vec_cmpeq(__A, __A);
415   __d = (__v2du)vec_cmpeq(__B, __B);
416   /* A != NAN and B != NAN.  */
417   return ((__m128d)vec_and(__c, __d));
418 }
419 
420 extern __inline __m128d
421     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422     _mm_cmpunord_pd(__m128d __A, __m128d __B) {
423 #if _ARCH_PWR8
424   __v2du __c, __d;
  /* Comparing a value with itself returns false (0's) if it is a NaN.  */
426   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
427   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NaN OR B == NaN converts to:
     NOT(A != NaN) OR NOT(B != NaN), i.e. ~__c | ~__d.  Since
     vec_orc(x, y) computes x | ~y, only __c needs an explicit NOR.  */
430   __c = vec_nor(__c, __c);
431   return ((__m128d)vec_orc(__c, __d));
432 #else
433   __v2du __c, __d;
  /* Comparing a value with itself returns false (0's) if it is a NaN.  */
435   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
436   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) indicates a NaN.  */
438   __c = vec_nor(__c, __c);
439   __d = vec_nor(__d, __d);
440   return ((__m128d)vec_or(__c, __d));
441 #endif
442 }
443 
444 extern __inline __m128d
445     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
446     _mm_cmpeq_sd(__m128d __A, __m128d __B) {
447   __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
452   __a = vec_splats(__A[0]);
453   __b = vec_splats(__B[0]);
454   __c = (__v2df)vec_cmpeq(__a, __b);
455   /* Then we merge the lower double result with the original upper
456      double from __A.  */
457   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
458 }
459 
460 extern __inline __m128d
461     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
462     _mm_cmplt_sd(__m128d __A, __m128d __B) {
463   __v2df __a, __b, __c;
464   __a = vec_splats(__A[0]);
465   __b = vec_splats(__B[0]);
466   __c = (__v2df)vec_cmplt(__a, __b);
467   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
468 }
469 
470 extern __inline __m128d
471     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
472     _mm_cmple_sd(__m128d __A, __m128d __B) {
473   __v2df __a, __b, __c;
474   __a = vec_splats(__A[0]);
475   __b = vec_splats(__B[0]);
476   __c = (__v2df)vec_cmple(__a, __b);
477   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
478 }
479 
480 extern __inline __m128d
481     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482     _mm_cmpgt_sd(__m128d __A, __m128d __B) {
483   __v2df __a, __b, __c;
484   __a = vec_splats(__A[0]);
485   __b = vec_splats(__B[0]);
486   __c = (__v2df)vec_cmpgt(__a, __b);
487   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
488 }
489 
490 extern __inline __m128d
491     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492     _mm_cmpge_sd(__m128d __A, __m128d __B) {
493   __v2df __a, __b, __c;
494   __a = vec_splats(__A[0]);
495   __b = vec_splats(__B[0]);
496   __c = (__v2df)vec_cmpge(__a, __b);
497   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
498 }
499 
500 extern __inline __m128d
501     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502     _mm_cmpneq_sd(__m128d __A, __m128d __B) {
503   __v2df __a, __b, __c;
504   __a = vec_splats(__A[0]);
505   __b = vec_splats(__B[0]);
506   __c = (__v2df)vec_cmpeq(__a, __b);
507   __c = vec_nor(__c, __c);
508   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
509 }
510 
511 extern __inline __m128d
512     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513     _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
514   __v2df __a, __b, __c;
515   __a = vec_splats(__A[0]);
516   __b = vec_splats(__B[0]);
517   /* Not less than is just greater than or equal.  */
518   __c = (__v2df)vec_cmpge(__a, __b);
519   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
520 }
521 
522 extern __inline __m128d
523     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524     _mm_cmpnle_sd(__m128d __A, __m128d __B) {
525   __v2df __a, __b, __c;
526   __a = vec_splats(__A[0]);
527   __b = vec_splats(__B[0]);
528   /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
530   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
531 }
532 
533 extern __inline __m128d
534     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535     _mm_cmpngt_sd(__m128d __A, __m128d __B) {
536   __v2df __a, __b, __c;
537   __a = vec_splats(__A[0]);
538   __b = vec_splats(__B[0]);
539   /* Not greater than is just less than or equal.  */
540   __c = (__v2df)vec_cmple(__a, __b);
541   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
542 }
543 
544 extern __inline __m128d
545     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
546     _mm_cmpnge_sd(__m128d __A, __m128d __B) {
547   __v2df __a, __b, __c;
548   __a = vec_splats(__A[0]);
549   __b = vec_splats(__B[0]);
550   /* Not greater than or equal is just less than.  */
551   __c = (__v2df)vec_cmplt(__a, __b);
552   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
553 }
554 
555 extern __inline __m128d
556     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
557     _mm_cmpord_sd(__m128d __A, __m128d __B) {
558   __v2df __r;
559   __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
560   return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
561 }
562 
563 extern __inline __m128d
564     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
565     _mm_cmpunord_sd(__m128d __A, __m128d __B) {
566   __v2df __r;
567   __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
568   return (__m128d)_mm_setr_pd(__r[0], __A[1]);
569 }
570 
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
578 extern __inline int
579     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
580     _mm_comieq_sd(__m128d __A, __m128d __B) {
581   return (__A[0] == __B[0]);
582 }
583 
584 extern __inline int
585     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
586     _mm_comilt_sd(__m128d __A, __m128d __B) {
587   return (__A[0] < __B[0]);
588 }
589 
590 extern __inline int
591     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592     _mm_comile_sd(__m128d __A, __m128d __B) {
593   return (__A[0] <= __B[0]);
594 }
595 
596 extern __inline int
597     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
598     _mm_comigt_sd(__m128d __A, __m128d __B) {
599   return (__A[0] > __B[0]);
600 }
601 
602 extern __inline int
603     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
604     _mm_comige_sd(__m128d __A, __m128d __B) {
605   return (__A[0] >= __B[0]);
606 }
607 
608 extern __inline int
609     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610     _mm_comineq_sd(__m128d __A, __m128d __B) {
611   return (__A[0] != __B[0]);
612 }
613 
614 extern __inline int
615     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616     _mm_ucomieq_sd(__m128d __A, __m128d __B) {
617   return (__A[0] == __B[0]);
618 }
619 
620 extern __inline int
621     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
622     _mm_ucomilt_sd(__m128d __A, __m128d __B) {
623   return (__A[0] < __B[0]);
624 }
625 
626 extern __inline int
627     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628     _mm_ucomile_sd(__m128d __A, __m128d __B) {
629   return (__A[0] <= __B[0]);
630 }
631 
632 extern __inline int
633     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
634     _mm_ucomigt_sd(__m128d __A, __m128d __B) {
635   return (__A[0] > __B[0]);
636 }
637 
638 extern __inline int
639     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
640     _mm_ucomige_sd(__m128d __A, __m128d __B) {
641   return (__A[0] >= __B[0]);
642 }
643 
644 extern __inline int
645     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646     _mm_ucomineq_sd(__m128d __A, __m128d __B) {
647   return (__A[0] != __B[0]);
648 }
649 
650 /* Create a vector of Qi, where i is the element number.  */
651 extern __inline __m128i
652     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
653     _mm_set_epi64x(long long __q1, long long __q0) {
654   return __extension__(__m128i)(__v2di){__q0, __q1};
655 }
656 
657 extern __inline __m128i
658     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659     _mm_set_epi64(__m64 __q1, __m64 __q0) {
660   return _mm_set_epi64x((long long)__q1, (long long)__q0);
661 }
662 
663 extern __inline __m128i
664     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
665     _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
666   return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
667 }
668 
669 extern __inline __m128i
670     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
671     _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
672                   short __q2, short __q1, short __q0) {
673   return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
674                                         __q4, __q5, __q6, __q7};
675 }
676 
677 extern __inline __m128i
678     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
679     _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
680                  char __q10, char __q09, char __q08, char __q07, char __q06,
681                  char __q05, char __q04, char __q03, char __q02, char __q01,
682                  char __q00) {
683   return __extension__(__m128i)(__v16qi){
684       __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
685       __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
686 }
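
/* For example, _mm_set_epi32 (3, 2, 1, 0) produces the vector {0, 1, 2, 3}
   in element order: the last argument lands in element 0 (the least
   significant 32 bits) and the first argument in element 3.  */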
687 
688 /* Set all of the elements of the vector to A.  */
689 extern __inline __m128i
690     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
691     _mm_set1_epi64x(long long __A) {
692   return _mm_set_epi64x(__A, __A);
693 }
694 
695 extern __inline __m128i
696     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697     _mm_set1_epi64(__m64 __A) {
698   return _mm_set_epi64(__A, __A);
699 }
700 
701 extern __inline __m128i
702     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703     _mm_set1_epi32(int __A) {
704   return _mm_set_epi32(__A, __A, __A, __A);
705 }
706 
707 extern __inline __m128i
708     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709     _mm_set1_epi16(short __A) {
710   return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
711 }
712 
713 extern __inline __m128i
714     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715     _mm_set1_epi8(char __A) {
716   return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
717                       __A, __A, __A, __A, __A);
718 }
719 
720 /* Create a vector of Qi, where i is the element number.
721    The parameter order is reversed from the _mm_set_epi* functions.  */
722 extern __inline __m128i
723     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
724     _mm_setr_epi64(__m64 __q0, __m64 __q1) {
725   return _mm_set_epi64(__q1, __q0);
726 }
727 
728 extern __inline __m128i
729     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730     _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
731   return _mm_set_epi32(__q3, __q2, __q1, __q0);
732 }
733 
734 extern __inline __m128i
735     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736     _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
737                    short __q5, short __q6, short __q7) {
738   return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
739 }
740 
741 extern __inline __m128i
742     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743     _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
744                   char __q05, char __q06, char __q07, char __q08, char __q09,
745                   char __q10, char __q11, char __q12, char __q13, char __q14,
746                   char __q15) {
747   return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
748                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
749 }
750 
751 /* Create a vector with element 0 as *P and the rest zero.  */
752 extern __inline __m128i
753     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
754     _mm_load_si128(__m128i const *__P) {
755   return *__P;
756 }
757 
758 extern __inline __m128i
759     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760     _mm_loadu_si128(__m128i_u const *__P) {
761   return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
762 }
763 
764 extern __inline __m128i
765     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766     _mm_loadl_epi64(__m128i_u const *__P) {
767   return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
768 }
769 
770 extern __inline void
771     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772     _mm_store_si128(__m128i *__P, __m128i __B) {
773   vec_st((__v16qu)__B, 0, (__v16qu *)__P);
774 }
775 
776 extern __inline void
777     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778     _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
779   *__P = __B;
780 }
781 
782 extern __inline void
783     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
784     _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
785   *(long long *)__P = ((__v2di)__B)[0];
786 }
787 
788 extern __inline __m64
789     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
790     _mm_movepi64_pi64(__m128i_u __B) {
791   return (__m64)((__v2di)__B)[0];
792 }
793 
794 extern __inline __m128i
795     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796     _mm_movpi64_epi64(__m64 __A) {
797   return _mm_set_epi64((__m64)0LL, __A);
798 }
799 
800 extern __inline __m128i
801     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
802     _mm_move_epi64(__m128i __A) {
803   return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
804 }
805 
806 /* Create an undefined vector.  */
807 extern __inline __m128i
808     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809     _mm_undefined_si128(void) {
810   __m128i __Y = __Y;
811   return __Y;
812 }
813 
814 /* Create a vector of zeros.  */
815 extern __inline __m128i
816     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817     _mm_setzero_si128(void) {
818   return __extension__(__m128i)(__v4si){0, 0, 0, 0};
819 }
820 
821 #ifdef _ARCH_PWR8
822 extern __inline __m128d
823     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824     _mm_cvtepi32_pd(__m128i __A) {
825   __v2di __val;
  /* For LE we need Vector Unpack Low Signed Word, which is what
     vec_unpackh generates here.  */
828   __val = (__v2di)vec_unpackh((__v4si)__A);
829 
830   return (__m128d)vec_ctf(__val, 0);
831 }
832 #endif
833 
834 extern __inline __m128
835     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836     _mm_cvtepi32_ps(__m128i __A) {
837   return ((__m128)vec_ctf((__v4si)__A, 0));
838 }
839 
840 extern __inline __m128i
841     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842     _mm_cvtpd_epi32(__m128d __A) {
843   __v2df __rounded = vec_rint(__A);
844   __v4si __result, __temp;
845   const __v4si __vzero = {0, 0, 0, 0};
846 
847   /* VSX Vector truncate Double-Precision to integer and Convert to
848    Signed Integer Word format with Saturate.  */
849   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
850 
851 #ifdef _ARCH_PWR8
852 #ifdef __LITTLE_ENDIAN__
853   __temp = vec_mergeo(__temp, __temp);
854 #else
855   __temp = vec_mergee(__temp, __temp);
856 #endif
857   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
858                                  (__vector long long)__vzero);
859 #else
860   {
861     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
862                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
863     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
864   }
865 #endif
866   return (__m128i)__result;
867 }
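
/* For example, with the default round-to-nearest-even mode,
   _mm_cvtpd_epi32 applied to {1.5, -2.5} yields {2, -2, 0, 0}: the two
   converted words occupy the low 64 bits and the upper 64 bits are
   zeroed.  The _mm_cvttpd_epi32 variant below truncates toward zero
   instead, yielding {1, -2, 0, 0} for the same input.  */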
868 
869 extern __inline __m64
870     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871     _mm_cvtpd_pi32(__m128d __A) {
872   __m128i __result = _mm_cvtpd_epi32(__A);
873 
874   return (__m64)__result[0];
875 }
876 
877 extern __inline __m128
878     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
879     _mm_cvtpd_ps(__m128d __A) {
880   __v4sf __result;
881   __v4si __temp;
882   const __v4si __vzero = {0, 0, 0, 0};
883 
884   __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
885 
886 #ifdef _ARCH_PWR8
887 #ifdef __LITTLE_ENDIAN__
888   __temp = vec_mergeo(__temp, __temp);
889 #else
890   __temp = vec_mergee(__temp, __temp);
891 #endif
892   __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
893                                  (__vector long long)__vzero);
894 #else
895   {
896     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
897                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
898     __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
899   }
900 #endif
901   return ((__m128)__result);
902 }
903 
904 extern __inline __m128i
905     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906     _mm_cvttpd_epi32(__m128d __A) {
907   __v4si __result;
908   __v4si __temp;
909   const __v4si __vzero = {0, 0, 0, 0};
910 
911   /* VSX Vector truncate Double-Precision to integer and Convert to
912    Signed Integer Word format with Saturate.  */
913   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
914 
915 #ifdef _ARCH_PWR8
916 #ifdef __LITTLE_ENDIAN__
917   __temp = vec_mergeo(__temp, __temp);
918 #else
919   __temp = vec_mergee(__temp, __temp);
920 #endif
921   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
922                                  (__vector long long)__vzero);
923 #else
924   {
925     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
926                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
927     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
928   }
929 #endif
930 
931   return ((__m128i)__result);
932 }
933 
934 extern __inline __m64
935     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
936     _mm_cvttpd_pi32(__m128d __A) {
937   __m128i __result = _mm_cvttpd_epi32(__A);
938 
939   return (__m64)__result[0];
940 }
941 
942 extern __inline int
943     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
944     _mm_cvtsi128_si32(__m128i __A) {
945   return ((__v4si)__A)[0];
946 }
947 
948 #ifdef _ARCH_PWR8
949 extern __inline __m128d
950     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951     _mm_cvtpi32_pd(__m64 __A) {
952   __v4si __temp;
953   __v2di __tmp2;
954   __v2df __result;
955 
956   __temp = (__v4si)vec_splats(__A);
957   __tmp2 = (__v2di)vec_unpackl(__temp);
958   __result = vec_ctf((__vector signed long long)__tmp2, 0);
959   return (__m128d)__result;
960 }
961 #endif
962 
963 extern __inline __m128i
964     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
965     _mm_cvtps_epi32(__m128 __A) {
966   __v4sf __rounded;
967   __v4si __result;
968 
969   __rounded = vec_rint((__v4sf)__A);
970   __result = vec_cts(__rounded, 0);
971   return (__m128i)__result;
972 }
973 
974 extern __inline __m128i
975     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976     _mm_cvttps_epi32(__m128 __A) {
977   __v4si __result;
978 
979   __result = vec_cts((__v4sf)__A, 0);
980   return (__m128i)__result;
981 }
982 
983 extern __inline __m128d
984     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985     _mm_cvtps_pd(__m128 __A) {
986   /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
987 #ifdef vec_doubleh
988   return (__m128d)vec_doubleh((__v4sf)__A);
989 #else
  /* Otherwise the compiler is older and we need to generate the
     equivalent code.  */
992   __v4sf __a = (__v4sf)__A;
993   __v4sf __temp;
994   __v2df __result;
995 #ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
1000   __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1001   __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1002 #else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we merge the
     vector with itself (Vector Merge High Word) to get the elements
     lined up.  */
1007   __temp = vec_vmrghw(__a, __a);
1008 #endif
1009   __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1010   return (__m128d)__result;
1011 #endif
1012 }
1013 
1014 extern __inline int
1015     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016     _mm_cvtsd_si32(__m128d __A) {
1017   __v2df __rounded = vec_rint((__v2df)__A);
1018   int __result = ((__v2df)__rounded)[0];
1019 
1020   return __result;
1021 }

/* Intel intrinsic.  */
1023 extern __inline long long
1024     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025     _mm_cvtsd_si64(__m128d __A) {
1026   __v2df __rounded = vec_rint((__v2df)__A);
1027   long long __result = ((__v2df)__rounded)[0];
1028 
1029   return __result;
1030 }
1031 
1032 /* Microsoft intrinsic.  */
1033 extern __inline long long
1034     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1035     _mm_cvtsd_si64x(__m128d __A) {
1036   return _mm_cvtsd_si64((__v2df)__A);
1037 }
1038 
1039 extern __inline int
1040     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041     _mm_cvttsd_si32(__m128d __A) {
1042   int __result = ((__v2df)__A)[0];
1043 
1044   return __result;
1045 }
1046 
1047 /* Intel intrinsic.  */
1048 extern __inline long long
1049     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050     _mm_cvttsd_si64(__m128d __A) {
1051   long long __result = ((__v2df)__A)[0];
1052 
1053   return __result;
1054 }
1055 
1056 /* Microsoft intrinsic.  */
1057 extern __inline long long
1058     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059     _mm_cvttsd_si64x(__m128d __A) {
1060   return _mm_cvttsd_si64(__A);
1061 }
1062 
1063 extern __inline __m128
1064     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065     _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1066   __v4sf __result = (__v4sf)__A;
1067 
1068 #ifdef __LITTLE_ENDIAN__
1069   __v4sf __temp_s;
1070   /* Copy double element[0] to element [1] for conversion.  */
1071   __v2df __temp_b = vec_splat((__v2df)__B, 0);
1072 
1073   /* Pre-rotate __A left 3 (logically right 1) elements.  */
1074   __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1075   /* Convert double to single float scalar in a vector.  */
1076   __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1077   /* Shift the resulting scalar into vector element [0].  */
1078   __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1079 #else
1080   __result[0] = ((__v2df)__B)[0];
1081 #endif
1082   return (__m128)__result;
1083 }
1084 
1085 extern __inline __m128d
1086     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087     _mm_cvtsi32_sd(__m128d __A, int __B) {
1088   __v2df __result = (__v2df)__A;
1089   double __db = __B;
1090   __result[0] = __db;
1091   return (__m128d)__result;
1092 }
1093 
1094 /* Intel intrinsic.  */
1095 extern __inline __m128d
1096     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097     _mm_cvtsi64_sd(__m128d __A, long long __B) {
1098   __v2df __result = (__v2df)__A;
1099   double __db = __B;
1100   __result[0] = __db;
1101   return (__m128d)__result;
1102 }
1103 
1104 /* Microsoft intrinsic.  */
1105 extern __inline __m128d
1106     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1107     _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1108   return _mm_cvtsi64_sd(__A, __B);
1109 }
1110 
1111 extern __inline __m128d
1112     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1113     _mm_cvtss_sd(__m128d __A, __m128 __B) {
1114 #ifdef __LITTLE_ENDIAN__
1115   /* Use splat to move element [0] into position for the convert. */
1116   __v4sf __temp = vec_splat((__v4sf)__B, 0);
1117   __v2df __res;
1118   /* Convert single float scalar to double in a vector.  */
1119   __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1120   return (__m128d)vec_mergel(__res, (__v2df)__A);
1121 #else
1122   __v2df __res = (__v2df)__A;
1123   __res[0] = ((__v4sf)__B)[0];
1124   return (__m128d)__res;
1125 #endif
1126 }
1127 
1128 extern __inline __m128d
1129     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130     _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1131   __vector double __result;
1132   const int __litmsk = __mask & 0x3;
1133 
1134   if (__litmsk == 0)
1135     __result = vec_mergeh(__A, __B);
1136 #if __GNUC__ < 6
1137   else if (__litmsk == 1)
1138     __result = vec_xxpermdi(__B, __A, 2);
1139   else if (__litmsk == 2)
1140     __result = vec_xxpermdi(__B, __A, 1);
1141 #else
1142   else if (__litmsk == 1)
1143     __result = vec_xxpermdi(__A, __B, 2);
1144   else if (__litmsk == 2)
1145     __result = vec_xxpermdi(__A, __B, 1);
1146 #endif
1147   else
1148     __result = vec_mergel(__A, __B);
1149 
1150   return __result;
1151 }
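
/* The two-bit mask follows the Intel semantics: mask 0 selects
   {__A[0], __B[0]}, 1 selects {__A[1], __B[0]}, 2 selects {__A[0], __B[1]},
   and 3 selects {__A[1], __B[1]}.  For example,
   _mm_shuffle_pd (__A, __B, _MM_SHUFFLE2 (0, 1)) returns {__A[1], __B[0]}.  */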
1152 
1153 extern __inline __m128d
1154     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1155     _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1156   return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1157 }
1158 
1159 extern __inline __m128d
1160     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161     _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1162   return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1163 }
1164 
1165 extern __inline __m128d
1166     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1167     _mm_loadh_pd(__m128d __A, double const *__B) {
1168   __v2df __result = (__v2df)__A;
1169   __result[1] = *__B;
1170   return (__m128d)__result;
1171 }
1172 
1173 extern __inline __m128d
1174     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175     _mm_loadl_pd(__m128d __A, double const *__B) {
1176   __v2df __result = (__v2df)__A;
1177   __result[0] = *__B;
1178   return (__m128d)__result;
1179 }
1180 
1181 #ifdef _ARCH_PWR8
1182 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1183 
1184 /* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
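/* For example, a vector whose element 0 (the low double) is negative and
   element 1 is positive yields 0x1, since bit i of the result is the sign
   bit of double element i.  */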
1185 extern __inline int
1186     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1187     _mm_movemask_pd(__m128d __A) {
1188 #ifdef _ARCH_PWR10
1189   return vec_extractm((__v2du)__A);
1190 #else
1191   __vector unsigned long long __result;
1192   static const __vector unsigned int __perm_mask = {
1193 #ifdef __LITTLE_ENDIAN__
1194       0x80800040, 0x80808080, 0x80808080, 0x80808080
1195 #else
1196       0x80808080, 0x80808080, 0x80808080, 0x80804000
1197 #endif
1198   };
1199 
1200   __result = ((__vector unsigned long long)vec_vbpermq(
1201       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1202 
1203 #ifdef __LITTLE_ENDIAN__
1204   return __result[1];
1205 #else
1206   return __result[0];
1207 #endif
1208 #endif /* !_ARCH_PWR10 */
1209 }
1210 #endif /* _ARCH_PWR8 */
1211 
1212 extern __inline __m128i
1213     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214     _mm_packs_epi16(__m128i __A, __m128i __B) {
1215   return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1216 }
1217 
1218 extern __inline __m128i
1219     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220     _mm_packs_epi32(__m128i __A, __m128i __B) {
1221   return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1222 }
1223 
1224 extern __inline __m128i
1225     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226     _mm_packus_epi16(__m128i __A, __m128i __B) {
1227   return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1228 }
1229 
1230 extern __inline __m128i
1231     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1232     _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1233   return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1234 }
1235 
1236 extern __inline __m128i
1237     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1238     _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1239   return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1240 }
1241 
1242 extern __inline __m128i
1243     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244     _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1245   return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1246 }
1247 
1248 extern __inline __m128i
1249     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250     _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1251   return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1252 }
1253 
1254 extern __inline __m128i
1255     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1256     _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1257   return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1258 }
1259 
1260 extern __inline __m128i
1261     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262     _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1263   return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1264 }
1265 
1266 extern __inline __m128i
1267     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268     _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1269   return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1270 }
1271 
1272 extern __inline __m128i
1273     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274     _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1275   return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1276 }
1277 
1278 extern __inline __m128i
1279     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280     _mm_add_epi8(__m128i __A, __m128i __B) {
1281   return (__m128i)((__v16qu)__A + (__v16qu)__B);
1282 }
1283 
1284 extern __inline __m128i
1285     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286     _mm_add_epi16(__m128i __A, __m128i __B) {
1287   return (__m128i)((__v8hu)__A + (__v8hu)__B);
1288 }
1289 
1290 extern __inline __m128i
1291     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292     _mm_add_epi32(__m128i __A, __m128i __B) {
1293   return (__m128i)((__v4su)__A + (__v4su)__B);
1294 }
1295 
1296 extern __inline __m128i
1297     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298     _mm_add_epi64(__m128i __A, __m128i __B) {
1299   return (__m128i)((__v2du)__A + (__v2du)__B);
1300 }
1301 
1302 extern __inline __m128i
1303     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304     _mm_adds_epi8(__m128i __A, __m128i __B) {
1305   return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1306 }
1307 
1308 extern __inline __m128i
1309     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310     _mm_adds_epi16(__m128i __A, __m128i __B) {
1311   return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1312 }
1313 
1314 extern __inline __m128i
1315     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316     _mm_adds_epu8(__m128i __A, __m128i __B) {
1317   return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1318 }
1319 
1320 extern __inline __m128i
1321     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322     _mm_adds_epu16(__m128i __A, __m128i __B) {
1323   return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1324 }
1325 
1326 extern __inline __m128i
1327     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328     _mm_sub_epi8(__m128i __A, __m128i __B) {
1329   return (__m128i)((__v16qu)__A - (__v16qu)__B);
1330 }
1331 
1332 extern __inline __m128i
1333     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1334     _mm_sub_epi16(__m128i __A, __m128i __B) {
1335   return (__m128i)((__v8hu)__A - (__v8hu)__B);
1336 }
1337 
1338 extern __inline __m128i
1339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340     _mm_sub_epi32(__m128i __A, __m128i __B) {
1341   return (__m128i)((__v4su)__A - (__v4su)__B);
1342 }
1343 
1344 extern __inline __m128i
1345     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346     _mm_sub_epi64(__m128i __A, __m128i __B) {
1347   return (__m128i)((__v2du)__A - (__v2du)__B);
1348 }
1349 
1350 extern __inline __m128i
1351     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352     _mm_subs_epi8(__m128i __A, __m128i __B) {
1353   return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1354 }
1355 
1356 extern __inline __m128i
1357     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358     _mm_subs_epi16(__m128i __A, __m128i __B) {
1359   return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1360 }
1361 
1362 extern __inline __m128i
1363     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364     _mm_subs_epu8(__m128i __A, __m128i __B) {
1365   return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1366 }
1367 
1368 extern __inline __m128i
1369     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370     _mm_subs_epu16(__m128i __A, __m128i __B) {
1371   return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1372 }
1373 
1374 extern __inline __m128i
1375     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376     _mm_madd_epi16(__m128i __A, __m128i __B) {
1377   __vector signed int __zero = {0, 0, 0, 0};
1378 
1379   return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1380 }
1381 
1382 extern __inline __m128i
1383     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384     _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1385   __vector signed int __w0, __w1;
1386 
1387   __vector unsigned char __xform1 = {
1388 #ifdef __LITTLE_ENDIAN__
1389       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1390       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1391 #else
1392       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1393       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1394 #endif
1395   };
1396 
1397   __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1398   __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1399   return (__m128i)vec_perm(__w0, __w1, __xform1);
1400 }
1401 
1402 extern __inline __m128i
1403     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1404     _mm_mullo_epi16(__m128i __A, __m128i __B) {
1405   return (__m128i)((__v8hi)__A * (__v8hi)__B);
1406 }
1407 
1408 extern __inline __m64
1409     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1410     _mm_mul_su32(__m64 __A, __m64 __B) {
1411   unsigned int __a = __A;
1412   unsigned int __b = __B;
1413 
1414   return ((__m64)__a * (__m64)__b);
1415 }
1416 
1417 #ifdef _ARCH_PWR8
1418 extern __inline __m128i
1419     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420     _mm_mul_epu32(__m128i __A, __m128i __B) {
1421 #if __GNUC__ < 8
1422   __v2du __result;
1423 
1424 #ifdef __LITTLE_ENDIAN__
1425   /* VMX Vector Multiply Odd Unsigned Word.  */
1426   __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1427 #else
1428   /* VMX Vector Multiply Even Unsigned Word.  */
1429   __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1430 #endif
1431   return (__m128i)__result;
1432 #else
1433   return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1434 #endif
1435 }
1436 #endif
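
/* _mm_mul_epu32 multiplies the low 32-bit element of each 64-bit lane:
   treating the operands as 32-bit elements a0..a3 and b0..b3, the result
   is the two 64-bit products {a0 * b0, a2 * b2}.  The LE/BE split above
   picks the multiply-odd or multiply-even word instruction that
   corresponds to those elements in register order.  */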
1437 
1438 extern __inline __m128i
1439     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1440     _mm_slli_epi16(__m128i __A, int __B) {
1441   __v8hu __lshift;
1442   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1443 
1444   if (__B >= 0 && __B < 16) {
1445     if (__builtin_constant_p(__B))
1446       __lshift = (__v8hu)vec_splat_s16(__B);
1447     else
1448       __lshift = vec_splats((unsigned short)__B);
1449 
1450     __result = vec_sl((__v8hi)__A, __lshift);
1451   }
1452 
1453   return (__m128i)__result;
1454 }
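
/* The range check above reflects the SSE2 semantics: shift counts outside
   0-15 produce an all-zero result, whereas the underlying vslh instruction
   would use the count modulo 16.  For example, _mm_slli_epi16 (__A, 16) is
   all zeros.  */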
1455 
1456 extern __inline __m128i
1457     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458     _mm_slli_epi32(__m128i __A, int __B) {
1459   __v4su __lshift;
1460   __v4si __result = {0, 0, 0, 0};
1461 
1462   if (__B >= 0 && __B < 32) {
1463     if (__builtin_constant_p(__B) && __B < 16)
1464       __lshift = (__v4su)vec_splat_s32(__B);
1465     else
1466       __lshift = vec_splats((unsigned int)__B);
1467 
1468     __result = vec_sl((__v4si)__A, __lshift);
1469   }
1470 
1471   return (__m128i)__result;
1472 }
1473 
1474 #ifdef _ARCH_PWR8
1475 extern __inline __m128i
1476     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1477     _mm_slli_epi64(__m128i __A, int __B) {
1478   __v2du __lshift;
1479   __v2di __result = {0, 0};
1480 
1481   if (__B >= 0 && __B < 64) {
1482     if (__builtin_constant_p(__B) && __B < 16)
1483       __lshift = (__v2du)vec_splat_s32(__B);
1484     else
1485       __lshift = (__v2du)vec_splats((unsigned int)__B);
1486 
1487     __result = vec_sl((__v2di)__A, __lshift);
1488   }
1489 
1490   return (__m128i)__result;
1491 }
1492 #endif
1493 
1494 extern __inline __m128i
1495     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1496     _mm_srai_epi16(__m128i __A, int __B) {
1497   __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1498   __v8hi __result;
1499 
1500   if (__B < 16) {
1501     if (__builtin_constant_p(__B))
1502       __rshift = (__v8hu)vec_splat_s16(__B);
1503     else
1504       __rshift = vec_splats((unsigned short)__B);
1505   }
1506   __result = vec_sra((__v8hi)__A, __rshift);
1507 
1508   return (__m128i)__result;
1509 }
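
/* Unlike the logical shifts, SSE2 arithmetic right shifts clamp the count:
   any count above 15 behaves as a shift by 15, filling each element with
   copies of its sign bit, which is why __rshift defaults to 15 above.  */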
1510 
1511 extern __inline __m128i
1512     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1513     _mm_srai_epi32(__m128i __A, int __B) {
1514   __v4su __rshift = {31, 31, 31, 31};
1515   __v4si __result;
1516 
1517   if (__B < 32) {
1518     if (__builtin_constant_p(__B)) {
1519       if (__B < 16)
1520         __rshift = (__v4su)vec_splat_s32(__B);
1521       else
1522         __rshift = (__v4su)vec_splats((unsigned int)__B);
1523     } else
1524       __rshift = vec_splats((unsigned int)__B);
1525   }
1526   __result = vec_sra((__v4si)__A, __rshift);
1527 
1528   return (__m128i)__result;
1529 }
1530 
1531 extern __inline __m128i
1532     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1533     _mm_bslli_si128(__m128i __A, const int __N) {
1534   __v16qu __result;
1535   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1536 
1537   if (__N < 16)
1538     __result = vec_sld((__v16qu)__A, __zeros, __N);
1539   else
1540     __result = __zeros;
1541 
1542   return (__m128i)__result;
1543 }
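
/* _mm_bslli_si128 shifts the whole 128-bit value left by __N bytes,
   shifting in zeros.  For example, if __A holds the bytes {0, 1, ..., 15}
   in element order, _mm_bslli_si128 (__A, 4) holds
   {0, 0, 0, 0, 0, 1, ..., 11}.  */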
1544 
1545 extern __inline __m128i
1546     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1547     _mm_bsrli_si128(__m128i __A, const int __N) {
1548   __v16qu __result;
1549   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1550 
1551   if (__N < 16)
1552 #ifdef __LITTLE_ENDIAN__
1553     if (__builtin_constant_p(__N))
1554       /* Would like to use Vector Shift Left Double by Octet
1555          Immediate here to use the immediate form and avoid
1556          load of __N * 8 value into a separate VR.  */
1557       __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1558     else
1559 #endif
1560     {
1561       __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1562 #ifdef __LITTLE_ENDIAN__
1563       __result = vec_sro((__v16qu)__A, __shift);
1564 #else
1565     __result = vec_slo((__v16qu)__A, __shift);
1566 #endif
1567     }
1568   else
1569     __result = __zeros;
1570 
1571   return (__m128i)__result;
1572 }
1573 
1574 extern __inline __m128i
1575     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1576     _mm_srli_si128(__m128i __A, const int __N) {
1577   return _mm_bsrli_si128(__A, __N);
1578 }
1579 
1580 extern __inline __m128i
1581     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld((__v16qu)__A, __zeros, __N);
#else
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1591 #endif
1592   else
1593     __result = __zeros;
1594 
1595   return (__m128i)__result;
1596 }
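
/* Illustrative byte-shift example for the *_si128 forms above (a sketch,
   not normative): the whole 128-bit value is shifted by __N bytes with
   zero fill, and any count of 16 or more yields zero.  Assuming
   _mm_set_epi32 from earlier in this header:

     __m128i __v = _mm_set_epi32(4, 3, 2, 1);
     __m128i __r = _mm_bsrli_si128(__v, 4);  // == _mm_set_epi32(0, 4, 3, 2)
     __m128i __l = _mm_slli_si128(__v, 4);   // == _mm_set_epi32(3, 2, 1, 0)
     __m128i __z = _mm_srli_si128(__v, 16);  // all zero
*/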
1597 
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi16(__m128i __A, int __B) {
1602   __v8hu __rshift;
1603   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1604 
1605   if (__B < 16) {
1606     if (__builtin_constant_p(__B))
1607       __rshift = (__v8hu)vec_splat_s16(__B);
1608     else
1609       __rshift = vec_splats((unsigned short)__B);
1610 
1611     __result = vec_sr((__v8hi)__A, __rshift);
1612   }
1613 
1614   return (__m128i)__result;
1615 }
1616 
1617 extern __inline __m128i
1618     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1619     _mm_srli_epi32(__m128i __A, int __B) {
1620   __v4su __rshift;
1621   __v4si __result = {0, 0, 0, 0};
1622 
1623   if (__B < 32) {
1624     if (__builtin_constant_p(__B)) {
1625       if (__B < 16)
1626         __rshift = (__v4su)vec_splat_s32(__B);
1627       else
1628         __rshift = (__v4su)vec_splats((unsigned int)__B);
1629     } else
1630       __rshift = vec_splats((unsigned int)__B);
1631 
1632     __result = vec_sr((__v4si)__A, __rshift);
1633   }
1634 
1635   return (__m128i)__result;
1636 }
1637 
1638 #ifdef _ARCH_PWR8
1639 extern __inline __m128i
1640     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1641     _mm_srli_epi64(__m128i __A, int __B) {
1642   __v2du __rshift;
1643   __v2di __result = {0, 0};
1644 
1645   if (__B < 64) {
1646     if (__builtin_constant_p(__B)) {
1647       if (__B < 16)
1648         __rshift = (__v2du)vec_splat_s32(__B);
1649       else
1650         __rshift = (__v2du)vec_splats((unsigned long long)__B);
1651     } else
1652       __rshift = (__v2du)vec_splats((unsigned int)__B);
1653 
1654     __result = vec_sr((__v2di)__A, __rshift);
1655   }
1656 
1657   return (__m128i)__result;
1658 }
1659 #endif
1660 
1661 extern __inline __m128i
1662     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1663     _mm_sll_epi16(__m128i __A, __m128i __B) {
1664   __v8hu __lshift;
1665   __vector __bool short __shmask;
1666   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1667   __v8hu __result;
1668 
1669 #ifdef __LITTLE_ENDIAN__
1670   __lshift = vec_splat((__v8hu)__B, 0);
1671 #else
1672   __lshift = vec_splat((__v8hu)__B, 3);
1673 #endif
1674   __shmask = vec_cmple(__lshift, __shmax);
1675   __result = vec_sl((__v8hu)__A, __lshift);
1676   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1677 
1678   return (__m128i)__result;
1679 }
1680 
1681 extern __inline __m128i
1682     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683     _mm_sll_epi32(__m128i __A, __m128i __B) {
1684   __v4su __lshift;
1685   __vector __bool int __shmask;
1686   const __v4su __shmax = {32, 32, 32, 32};
1687   __v4su __result;
1688 #ifdef __LITTLE_ENDIAN__
1689   __lshift = vec_splat((__v4su)__B, 0);
1690 #else
1691   __lshift = vec_splat((__v4su)__B, 1);
1692 #endif
1693   __shmask = vec_cmplt(__lshift, __shmax);
1694   __result = vec_sl((__v4su)__A, __lshift);
1695   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1696 
1697   return (__m128i)__result;
1698 }
1699 
1700 #ifdef _ARCH_PWR8
1701 extern __inline __m128i
1702     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1703     _mm_sll_epi64(__m128i __A, __m128i __B) {
1704   __v2du __lshift;
1705   __vector __bool long long __shmask;
1706   const __v2du __shmax = {64, 64};
1707   __v2du __result;
1708 
1709   __lshift = vec_splat((__v2du)__B, 0);
1710   __shmask = vec_cmplt(__lshift, __shmax);
1711   __result = vec_sl((__v2du)__A, __lshift);
1712   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1713 
1714   return (__m128i)__result;
1715 }
1716 #endif
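
/* Illustrative use of the vector-count shift forms above (a sketch, not
   normative): per the SSE2 definition the count comes from the low bits
   of __B, so it is normally built with _mm_cvtsi32_si128 (defined later
   in this header); counts of the element width or more yield zero.

     __m128i __v = _mm_set_epi32(8, 4, 2, 1);
     __m128i __c = _mm_cvtsi32_si128(3);
     __m128i __r = _mm_sll_epi32(__v, __c);  // == _mm_set_epi32(64, 32, 16, 8)
*/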
1717 
1718 extern __inline __m128i
1719     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720     _mm_sra_epi16(__m128i __A, __m128i __B) {
1721   const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1722   __v8hu __rshift;
1723   __v8hi __result;
1724 
1725 #ifdef __LITTLE_ENDIAN__
1726   __rshift = vec_splat((__v8hu)__B, 0);
1727 #else
1728   __rshift = vec_splat((__v8hu)__B, 3);
1729 #endif
1730   __rshift = vec_min(__rshift, __rshmax);
1731   __result = vec_sra((__v8hi)__A, __rshift);
1732 
1733   return (__m128i)__result;
1734 }
1735 
1736 extern __inline __m128i
1737     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738     _mm_sra_epi32(__m128i __A, __m128i __B) {
1739   const __v4su __rshmax = {31, 31, 31, 31};
1740   __v4su __rshift;
1741   __v4si __result;
1742 
1743 #ifdef __LITTLE_ENDIAN__
1744   __rshift = vec_splat((__v4su)__B, 0);
1745 #else
1746   __rshift = vec_splat((__v4su)__B, 1);
1747 #endif
1748   __rshift = vec_min(__rshift, __rshmax);
1749   __result = vec_sra((__v4si)__A, __rshift);
1750 
1751   return (__m128i)__result;
1752 }
1753 
1754 extern __inline __m128i
1755     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756     _mm_srl_epi16(__m128i __A, __m128i __B) {
1757   __v8hu __rshift;
1758   __vector __bool short __shmask;
1759   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1760   __v8hu __result;
1761 
1762 #ifdef __LITTLE_ENDIAN__
1763   __rshift = vec_splat((__v8hu)__B, 0);
1764 #else
1765   __rshift = vec_splat((__v8hu)__B, 3);
1766 #endif
1767   __shmask = vec_cmple(__rshift, __shmax);
1768   __result = vec_sr((__v8hu)__A, __rshift);
1769   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1770 
1771   return (__m128i)__result;
1772 }
1773 
1774 extern __inline __m128i
1775     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1776     _mm_srl_epi32(__m128i __A, __m128i __B) {
1777   __v4su __rshift;
1778   __vector __bool int __shmask;
1779   const __v4su __shmax = {32, 32, 32, 32};
1780   __v4su __result;
1781 
1782 #ifdef __LITTLE_ENDIAN__
1783   __rshift = vec_splat((__v4su)__B, 0);
1784 #else
1785   __rshift = vec_splat((__v4su)__B, 1);
1786 #endif
1787   __shmask = vec_cmplt(__rshift, __shmax);
1788   __result = vec_sr((__v4su)__A, __rshift);
1789   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1790 
1791   return (__m128i)__result;
1792 }
1793 
1794 #ifdef _ARCH_PWR8
1795 extern __inline __m128i
1796     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1797     _mm_srl_epi64(__m128i __A, __m128i __B) {
1798   __v2du __rshift;
1799   __vector __bool long long __shmask;
1800   const __v2du __shmax = {64, 64};
1801   __v2du __result;
1802 
1803   __rshift = vec_splat((__v2du)__B, 0);
1804   __shmask = vec_cmplt(__rshift, __shmax);
1805   __result = vec_sr((__v2du)__A, __rshift);
1806   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1807 
1808   return (__m128i)__result;
1809 }
1810 #endif
1811 
1812 extern __inline __m128d
1813     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1814     _mm_and_pd(__m128d __A, __m128d __B) {
1815   return (vec_and((__v2df)__A, (__v2df)__B));
1816 }
1817 
1818 extern __inline __m128d
1819     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1820     _mm_andnot_pd(__m128d __A, __m128d __B) {
1821   return (vec_andc((__v2df)__B, (__v2df)__A));
1822 }
1823 
1824 extern __inline __m128d
1825     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1826     _mm_or_pd(__m128d __A, __m128d __B) {
1827   return (vec_or((__v2df)__A, (__v2df)__B));
1828 }
1829 
1830 extern __inline __m128d
1831     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1832     _mm_xor_pd(__m128d __A, __m128d __B) {
1833   return (vec_xor((__v2df)__A, (__v2df)__B));
1834 }
1835 
1836 extern __inline __m128i
1837     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1838     _mm_and_si128(__m128i __A, __m128i __B) {
1839   return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1840 }
1841 
1842 extern __inline __m128i
1843     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1844     _mm_andnot_si128(__m128i __A, __m128i __B) {
1845   return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1846 }
1847 
1848 extern __inline __m128i
1849     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1850     _mm_or_si128(__m128i __A, __m128i __B) {
1851   return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1852 }
1853 
1854 extern __inline __m128i
1855     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1856     _mm_xor_si128(__m128i __A, __m128i __B) {
1857   return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1858 }
1859 
1860 extern __inline __m128i
1861     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1862     _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1863   return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1864 }
1865 
1866 extern __inline __m128i
1867     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1868     _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1869   return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1870 }
1871 
1872 extern __inline __m128i
1873     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1874     _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1875   return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1876 }
1877 
1878 extern __inline __m128i
1879     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1880     _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1881   return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1882 }
1883 
1884 extern __inline __m128i
1885     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1886     _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1887   return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1888 }
1889 
1890 extern __inline __m128i
1891     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1892     _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1893   return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1894 }
1895 
1896 extern __inline __m128i
1897     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1898     _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1899   return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1900 }
1901 
1902 extern __inline __m128i
1903     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1904     _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1905   return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1906 }
1907 
1908 extern __inline __m128i
1909     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1910     _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1911   return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1912 }
1913 
1914 extern __inline int
1915     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1916     _mm_extract_epi16(__m128i const __A, int const __N) {
1917   return (unsigned short)((__v8hi)__A)[__N & 7];
1918 }
1919 
1920 extern __inline __m128i
1921     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1922     _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1923   __v8hi __result = (__v8hi)__A;
1924 
1925   __result[(__N & 7)] = __D;
1926 
1927   return (__m128i)__result;
1928 }
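
/* Illustrative use of _mm_extract_epi16/_mm_insert_epi16 (a sketch, not
   normative): the lane index is reduced modulo 8 (__N & 7) and the
   extracted halfword is zero-extended.  Assuming _mm_set_epi16 from
   earlier in this header:

     __m128i __v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
     int __x = _mm_extract_epi16(__v, 0);        // 0xFFFF (65535), not -1
     int __y = _mm_extract_epi16(__v, 9);        // 9 & 7 == 1 -> lane 1 -> 1
     __m128i __w = _mm_insert_epi16(__v, 42, 3); // lane 3 becomes 42
*/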
1929 
1930 extern __inline __m128i
1931     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1932     _mm_max_epi16(__m128i __A, __m128i __B) {
1933   return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1934 }
1935 
1936 extern __inline __m128i
1937     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1938     _mm_max_epu8(__m128i __A, __m128i __B) {
1939   return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1940 }
1941 
1942 extern __inline __m128i
1943     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1944     _mm_min_epi16(__m128i __A, __m128i __B) {
1945   return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1946 }
1947 
1948 extern __inline __m128i
1949     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1950     _mm_min_epu8(__m128i __A, __m128i __B) {
1951   return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1952 }
1953 
1954 #ifdef _ARCH_PWR8
1955 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1956 
1957 /* Return a mask created from the most significant bit of each 8-bit
1958    element in A.  */
1959 extern __inline int
1960     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1961     _mm_movemask_epi8(__m128i __A) {
1962 #ifdef _ARCH_PWR10
1963   return vec_extractm((__v16qu)__A);
1964 #else
1965   __vector unsigned long long __result;
1966   static const __vector unsigned char __perm_mask = {
1967       0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1968       0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1969 
1970   __result = ((__vector unsigned long long)vec_vbpermq(
1971       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1972 
1973 #ifdef __LITTLE_ENDIAN__
1974   return __result[1];
1975 #else
1976   return __result[0];
1977 #endif
1978 #endif /* !_ARCH_PWR10 */
1979 }
1980 #endif /* _ARCH_PWR8 */
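
/* Illustrative use of _mm_movemask_epi8 above (requires _ARCH_PWR8; a
   sketch, not normative): bit i of the result is the most significant
   bit of byte i of __A.  Assuming _mm_set_epi8 from earlier in this
   header:

     __m128i __v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, -1, -1);
     int __m = _mm_movemask_epi8(__v);       // bytes 0 and 1 set -> 0x0003
*/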
1981 
1982 extern __inline __m128i
1983     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1984     _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1985   __v4su __w0, __w1;
1986   __v16qu __xform1 = {
1987 #ifdef __LITTLE_ENDIAN__
1988       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1989       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1990 #else
1991       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1992       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1993 #endif
1994   };
1995 
1996   __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1997   __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1998   return (__m128i)vec_perm(__w0, __w1, __xform1);
1999 }
2000 
2001 extern __inline __m128i
2002     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2003     _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2004   unsigned long __element_selector_98 = __mask & 0x03;
2005   unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2006   unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2007   unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2008   static const unsigned short __permute_selectors[4] = {
2009 #ifdef __LITTLE_ENDIAN__
2010       0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2011 #else
2012       0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2013 #endif
2014   };
2015   __v2du __pmask =
2016 #ifdef __LITTLE_ENDIAN__
2017       {0x1716151413121110UL, 0UL};
2018 #else
2019       {0x1011121314151617UL, 0UL};
2020 #endif
2021   __m64_union __t;
2022   __v2du __a, __r;
2023 
2024   __t.as_short[0] = __permute_selectors[__element_selector_98];
2025   __t.as_short[1] = __permute_selectors[__element_selector_BA];
2026   __t.as_short[2] = __permute_selectors[__element_selector_DC];
2027   __t.as_short[3] = __permute_selectors[__element_selector_FE];
2028   __pmask[1] = __t.as_m64;
2029   __a = (__v2du)__A;
2030   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2031   return (__m128i)__r;
2032 }
2033 
2034 extern __inline __m128i
2035     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2036     _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2037   unsigned long __element_selector_10 = __mask & 0x03;
2038   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2039   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2040   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2041   static const unsigned short __permute_selectors[4] = {
2042 #ifdef __LITTLE_ENDIAN__
2043       0x0100, 0x0302, 0x0504, 0x0706
2044 #else
2045       0x0001, 0x0203, 0x0405, 0x0607
2046 #endif
2047   };
2048   __v2du __pmask =
2049 #ifdef __LITTLE_ENDIAN__
2050       {0UL, 0x1f1e1d1c1b1a1918UL};
2051 #else
2052       {0UL, 0x18191a1b1c1d1e1fUL};
2053 #endif
2054   __m64_union __t;
2055   __v2du __a, __r;
2056   __t.as_short[0] = __permute_selectors[__element_selector_10];
2057   __t.as_short[1] = __permute_selectors[__element_selector_32];
2058   __t.as_short[2] = __permute_selectors[__element_selector_54];
2059   __t.as_short[3] = __permute_selectors[__element_selector_76];
2060   __pmask[0] = __t.as_m64;
2061   __a = (__v2du)__A;
2062   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2063   return (__m128i)__r;
2064 }
2065 
2066 extern __inline __m128i
2067     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2068     _mm_shuffle_epi32(__m128i __A, const int __mask) {
2069   unsigned long __element_selector_10 = __mask & 0x03;
2070   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2071   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2072   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2073   static const unsigned int __permute_selectors[4] = {
2074 #ifdef __LITTLE_ENDIAN__
2075       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2076 #else
2077       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2078 #endif
2079   };
2080   __v4su __t;
2081 
2082   __t[0] = __permute_selectors[__element_selector_10];
2083   __t[1] = __permute_selectors[__element_selector_32];
2084   __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2085   __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2086   return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2087                            (__vector unsigned char)__t);
2088 }
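
/* Illustrative use of the shuffle intrinsics above (a sketch, not
   normative): the 8-bit mask packs four 2-bit element selectors and is
   usually written with _MM_SHUFFLE from <xmmintrin.h>, where
   _MM_SHUFFLE(d, c, b, a) selects source elements a, b, c, d for result
   elements 0..3.  Assuming _mm_set_epi32 from earlier in this header:

     __m128i __v = _mm_set_epi32(3, 2, 1, 0);
     __m128i __r = _mm_shuffle_epi32(__v, _MM_SHUFFLE(0, 1, 2, 3));
     // __r == _mm_set_epi32(0, 1, 2, 3), i.e. the elements reversed
*/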
2089 
2090 extern __inline void
2091     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2092     _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2093   __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2094   __v16qu __mask, __tmp;
2095   __m128i_u *__p = (__m128i_u *)__C;
2096 
2097   __tmp = (__v16qu)_mm_loadu_si128(__p);
2098   __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2099   __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2100   _mm_storeu_si128(__p, (__m128i)__tmp);
2101 }
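
/* Note on _mm_maskmoveu_si128 above (a sketch of the behavior, not
   normative): bytes of __A are written to __C only where the matching
   byte of __B has its most significant bit set.  Unlike the x86
   instruction, this is realized as an unaligned load, vec_sel, and
   store of all 16 bytes, so the destination must be both readable and
   writable.  Assuming _mm_set1_epi8/_mm_set_epi8 from earlier in this
   header and a 16-byte writable buffer __buf:

     __m128i __data = _mm_set1_epi8(0x5A);
     __m128i __mask = _mm_set_epi8(-1, 0, 0, 0, 0, 0, 0, 0,
                                   0, 0, 0, 0, 0, 0, 0, -1);
     _mm_maskmoveu_si128(__data, __mask, __buf);  // writes bytes 0 and 15
*/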
2102 
2103 extern __inline __m128i
2104     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2105     _mm_avg_epu8(__m128i __A, __m128i __B) {
2106   return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2107 }
2108 
2109 extern __inline __m128i
2110     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2111     _mm_avg_epu16(__m128i __A, __m128i __B) {
2112   return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2113 }
2114 
2115 extern __inline __m128i
2116     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2117     _mm_sad_epu8(__m128i __A, __m128i __B) {
2118   __v16qu __a, __b;
2119   __v16qu __vabsdiff;
2120   __v4si __vsum;
2121   const __v4su __zero = {0, 0, 0, 0};
2122   __v4si __result;
2123 
2124   __a = (__v16qu)__A;
2125   __b = (__v16qu)__B;
2126 #ifndef _ARCH_PWR9
2127   __v16qu __vmin = vec_min(__a, __b);
2128   __v16qu __vmax = vec_max(__a, __b);
2129   __vabsdiff = vec_sub(__vmax, __vmin);
2130 #else
2131   __vabsdiff = vec_absd(__a, __b);
2132 #endif
2133   /* Sum four groups of bytes into integers.  */
2134   __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2135 #ifdef __LITTLE_ENDIAN__
2136   /* Sum across four integers with two integer results.  */
2137   __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian it adds
     vector shifts that are not needed for this use case.  It would
     move the 32-bit results (currently at elements [0] and [2]) to
     elements [1] and [3], and they would then have to be moved back
     because the desired results are two 64-bit integers ([1]|[0] and
     [3]|[2]).  Thus, no shift is performed.  */
2144 #else
2145   /* Sum across four integers with two integer results.  */
2146   __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2147   /* Rotate the sums into the correct position.  */
2148   __result = vec_sld(__result, __result, 6);
2149 #endif
2150   return (__m128i)__result;
2151 }
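
/* Illustrative use of _mm_sad_epu8 (a sketch, not normative): absolute
   differences of the 16 unsigned byte pairs are summed in two groups of
   eight; each sum lands in the low 16 bits of the corresponding 64-bit
   element with the upper bits zero.  Assuming _mm_set1_epi8 from
   earlier in this header:

     __m128i __a = _mm_set1_epi8(3);
     __m128i __b = _mm_set1_epi8(1);
     __m128i __r = _mm_sad_epu8(__a, __b);   // both 64-bit elements == 16
*/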
2152 
2153 extern __inline void
2154     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2155     _mm_stream_si32(int *__A, int __B) {
2156   /* Use the data cache block touch for store transient.  */
2157   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2158   *__A = __B;
2159 }
2160 
2161 extern __inline void
2162     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2163     _mm_stream_si64(long long int *__A, long long int __B) {
2164   /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2166   *__A = __B;
2167 }
2168 
2169 extern __inline void
2170     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2171     _mm_stream_si128(__m128i *__A, __m128i __B) {
2172   /* Use the data cache block touch for store transient.  */
2173   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2174   *__A = __B;
2175 }
2176 
2177 extern __inline void
2178     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2179     _mm_stream_pd(double *__A, __m128d __B) {
2180   /* Use the data cache block touch for store transient.  */
2181   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2182   *(__m128d *)__A = __B;
2183 }
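
/* Note on the streaming stores above (a sketch, not normative): dcbtstt
   is only a cache hint that marks the target block as transient; the
   assignment that follows is an ordinary cached store, so these do not
   bypass the cache the way x86 non-temporal stores may.  Assuming
   _mm_set1_epi32 from earlier in this header and a suitably aligned
   __m128i *__p:

     __m128i __v = _mm_set1_epi32(42);
     _mm_stream_si128(__p, __v);
*/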
2184 
2185 extern __inline void
2186     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2187     _mm_clflush(void const *__A) {
2188   /* Use the data cache block flush.  */
2189   __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2190 }
2191 
2192 extern __inline void
2193     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2194     _mm_lfence(void) {
2195   /* Use light weight sync for load to load ordering.  */
2196   __atomic_thread_fence(__ATOMIC_RELEASE);
2197 }
2198 
2199 extern __inline void
2200     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2201     _mm_mfence(void) {
2202   /* Use heavy weight sync for any to any ordering.  */
2203   __atomic_thread_fence(__ATOMIC_SEQ_CST);
2204 }
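
/* Note (a sketch, not normative): the fences above are expressed with
   the GNU/C11 atomic fences, which compilers for POWER typically lower
   to lwsync (release) and sync (sequentially consistent); there is no
   direct counterpart of the x86 LFENCE/MFENCE instructions.

     _mm_mfence();   // full barrier between the surrounding accesses
*/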
2205 
2206 extern __inline __m128i
2207     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2208     _mm_cvtsi32_si128(int __A) {
2209   return _mm_set_epi32(0, 0, 0, __A);
2210 }
2211 
2212 extern __inline __m128i
2213     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2214     _mm_cvtsi64_si128(long long __A) {
2215   return __extension__(__m128i)(__v2di){__A, 0LL};
2216 }
2217 
2218 /* Microsoft intrinsic.  */
2219 extern __inline __m128i
2220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2221     _mm_cvtsi64x_si128(long long __A) {
2222   return __extension__(__m128i)(__v2di){__A, 0LL};
2223 }
2224 
/* Casts between various SP, DP, INT vector types.  Note that these do not
   convert values; they simply reinterpret the type.  */
2227 extern __inline __m128
2228     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2229     _mm_castpd_ps(__m128d __A) {
2230   return (__m128)__A;
2231 }
2232 
2233 extern __inline __m128i
2234     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2235     _mm_castpd_si128(__m128d __A) {
2236   return (__m128i)__A;
2237 }
2238 
2239 extern __inline __m128d
2240     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2241     _mm_castps_pd(__m128 __A) {
2242   return (__m128d)__A;
2243 }
2244 
2245 extern __inline __m128i
2246     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2247     _mm_castps_si128(__m128 __A) {
2248   return (__m128i)__A;
2249 }
2250 
2251 extern __inline __m128
2252     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2253     _mm_castsi128_ps(__m128i __A) {
2254   return (__m128)__A;
2255 }
2256 
2257 extern __inline __m128d
2258     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2259     _mm_castsi128_pd(__m128i __A) {
2260   return (__m128d)__A;
2261 }
2262 
2263 #else
2264 #include_next <emmintrin.h>
2265 #endif /* defined(__ppc64__) &&
2266         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2267 
2268 #endif /* EMMINTRIN_H_ */
2269