/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help port code that uses Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for these vector SIMD
   operations.  However, scalar float operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   differences in the data format and placement of float scalars in the
   vector register, which require extra steps to match SSE2 scalar
   float semantics on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers.  It is
   recommended to use the portable <fenv.h> interface instead of
   accessing the MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
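
   For example, a scalar SSE2 sequence such as
     __m128d t = _mm_add_sd (_mm_set_sd (x), _mm_set_sd (y));
     double r = _mm_cvtsd_f64 (t);
   computes the same value as the plain C expression (x + y), which the
   compiler can usually schedule better on POWER.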
*/
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd(double __W, double __X) {
  return __extension__(__m128d){__X, __W};
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pd(double __W, double __X) {
  return __extension__(__m128d){__W, __X};
}

/* Create an undefined vector.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_pd(void) {
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_pd(void) {
  return (__m128d)vec_splats(0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_sd(__m128d __A, __m128d __B) {
  __v2df __result = (__v2df)__A;
  __result[0] = ((__v2df)__B)[0];
  return (__m128d)__result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd(double const *__P) {
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_pd(double const *__P) {
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_pd(double const *__P) {
  return (vec_splats(*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_sd(double const *__P) {
  return _mm_set_sd(*__P);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd1(double const *__P) {
  return _mm_load1_pd(__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_pd(double const *__P) {
  __v2df __tmp = _mm_load_pd(__P);
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd(double *__P, __m128d __A) {
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_pd(double *__P, __m128d __A) {
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_sd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[0];
}

extern __inline double
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_f64(__m128d __A) {
  return ((__v2df)__A)[0];
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pd(double *__P, __m128d __A) {
  _mm_store_sd(__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[1];
}
/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_splat(__A, 0));
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd1(double *__P, __m128d __A) {
  _mm_store1_pd(__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64(__m128i __A) {
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64x(__m128i __A) {
  return ((__v2di)__A)[0];
}

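/* Add the two DPFP values of A and B.  */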
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] + __B[0];
  return (__A);
}

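/* Subtract the two DPFP values of B from A.  */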
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] - __B[0];
  return (__A);
}

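/* Multiply the two DPFP values of A and B.  */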
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] * __B[0];
  return (__A);
}

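/* Divide the two DPFP values of A by B.  */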
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] / __B[0];
  return (__A);
}

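/* Return the square root of each DPFP value in A.  */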
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_pd(__m128d __A) {
  return (vec_sqrt(__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
  __v2df __c;
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

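/* Return the element-wise minimum of A and B.  */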
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pd(__m128d __A, __m128d __B) {
  return (vec_min(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_min(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

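/* Return the element-wise maximum of A and B.  */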
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pd(__m128d __A, __m128d __B) {
  return (vec_max(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_max(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

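/* Packed compares: each result element is all 1's if the relation holds
   for that element pair, otherwise all 0's.  */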
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Comparing a value against itself returns false (0's) if it is a NaN.  */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NaN and B != NaN.  */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Comparing a value against itself returns false (0's) if it is a NaN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NaN OR B == NaN converts to:
     NOT(A != NaN) OR NOT(B != NaN).  */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Comparing a value against itself returns false (0's) if it is a NaN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) indicates a NaN.  */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

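/* Scalar compares: the relation is evaluated on the lower DPFP values;
   element 0 of the result holds the compare mask and element 1 is
   copied from A.  */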
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal.  */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal.  */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than.  */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
                  short __q2, short __q1, short __q0) {
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
                                        __q4, __q5, __q6, __q7};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
                 char __q10, char __q09, char __q08, char __q07, char __q06,
                 char __q05, char __q04, char __q03, char __q02, char __q01,
                 char __q00) {
  return __extension__(__m128i)(__v16qi){
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64x(long long __A) {
  return _mm_set_epi64x(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64(__m64 __A) {
  return _mm_set_epi64(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi32(int __A) {
  return _mm_set_epi32(__A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi16(short __A) {
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi8(char __A) {
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
                      __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
                   short __q5, short __q6, short __q7) {
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
                  char __q05, char __q06, char __q07, char __q08, char __q09,
                  char __q10, char __q11, char __q12, char __q13, char __q14,
                  char __q15) {
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data from 16-byte aligned memory.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_si128(__m128i const *__P) {
  return *__P;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_si128(__m128i_u const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_epi64(__m128i_u const *__P) {
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_si128(__m128i *__P, __m128i __B) {
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  *__P = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movepi64_pi64(__m128i_u __B) {
  return (__m64)((__v2di)__B)[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movpi64_epi64(__m64 __A) {
  return _mm_set_epi64((__m64)0LL, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_epi64(__m128i __A) {
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}

#ifdef _ARCH_PWR8
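/* Convert the lower two signed 32-bit integer elements of A to DPFP values.  */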
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_pd(__m128i __A) {
  __v2di __val;
  /* For LE we need Vector Unpack Low Signed Word, which is generated
     from unpackh.  */
  __val = (__v2di)vec_unpackh((__v4si)__A);

  return (__m128d)vec_ctf(__val, 0);
}
#endif

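/* Convert the four signed 32-bit integer elements of A to SPFP values.  */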
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_ps(__m128i __A) {
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

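/* Convert the two DPFP values of A to signed 32-bit integers using the
   current rounding mode; the two upper result elements are zeroed.  */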
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_epi32(__m128d __A) {
  __v2df __rounded = vec_rint(__A);
  __v4si __result, __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return (__m128i)__result;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64)__result[0];
}

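/* Convert the two DPFP values of A to SPFP values in the two lower result
   elements; the two upper result elements are zeroed.  */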
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_ps(__m128d __A) {
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

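/* Convert the two DPFP values of A to signed 32-bit integers using
   truncation; the two upper result elements are zeroed.  */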
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_epi32(__m128d __A) {
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
   Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif

  return ((__m128i)__result);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvttpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si32(__m128i __A) {
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
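/* Convert the two signed 32-bit integer elements of A (an __m64) to DPFP
   values.  */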
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_pd(__m64 __A) {
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats(__A);
  __tmp2 = (__v2di)vec_unpackl(__temp);
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
  return (__m128d)__result;
}
#endif

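/* Convert the four SPFP values of A to signed 32-bit integers using the
   current rounding mode.  */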
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_epi32(__m128 __A) {
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf)__A);
  __result = vec_cts(__rounded, 0);
  return (__m128i)__result;
}

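/* Convert the four SPFP values of A to signed 32-bit integers using
   truncation.  */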
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_epi32(__m128 __A) {
  __v4si __result;

  __result = vec_cts((__v4sf)__A, 0);
  return (__m128i)__result;
}

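/* Convert the lower two SPFP values of A to DPFP values.  */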
983*12c85518Srobert extern __inline __m128d
984*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd(__m128 __A)985*12c85518Srobert     _mm_cvtps_pd(__m128 __A) {
986e5dd7070Spatrick   /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
987e5dd7070Spatrick #ifdef vec_doubleh
988e5dd7070Spatrick   return (__m128d)vec_doubleh((__v4sf)__A);
989e5dd7070Spatrick #else
990e5dd7070Spatrick   /* Otherwise the compiler is not current and so need to generate the
991e5dd7070Spatrick      equivalent code.  */
992*12c85518Srobert   __v4sf __a = (__v4sf)__A;
993*12c85518Srobert   __v4sf __temp;
994*12c85518Srobert   __v2df __result;
995e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
996e5dd7070Spatrick   /* The input float values are in elements {[0], [1]} but the convert
997e5dd7070Spatrick      instruction needs them in elements {[1], [3]}, So we use two
998e5dd7070Spatrick      shift left double vector word immediates to get the elements
999e5dd7070Spatrick      lined up.  */
1000*12c85518Srobert   __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1001*12c85518Srobert   __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1002e5dd7070Spatrick #else
1003e5dd7070Spatrick   /* The input float values are in elements {[0], [1]} but the convert
1004e5dd7070Spatrick      instruction needs them in elements {[0], [2]}, so we use a merge
1005e5dd7070Spatrick      high word to get the elements lined up.  */
1007*12c85518Srobert   __temp = vec_vmrghw(__a, __a);
1008e5dd7070Spatrick #endif
1009*12c85518Srobert   __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1010*12c85518Srobert   return (__m128d)__result;
1011e5dd7070Spatrick #endif
1012e5dd7070Spatrick }
1013e5dd7070Spatrick 
1014*12c85518Srobert extern __inline int
1015*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016*12c85518Srobert     _mm_cvtsd_si32(__m128d __A) {
1017*12c85518Srobert   __v2df __rounded = vec_rint((__v2df)__A);
1018*12c85518Srobert   int __result = ((__v2df)__rounded)[0];
1019e5dd7070Spatrick 
1020*12c85518Srobert   return __result;
1021e5dd7070Spatrick }
1022e5dd7070Spatrick /* Intel intrinsic.  */
1023*12c85518Srobert extern __inline long long
1024*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025*12c85518Srobert     _mm_cvtsd_si64(__m128d __A) {
1026*12c85518Srobert   __v2df __rounded = vec_rint((__v2df)__A);
1027*12c85518Srobert   long long __result = ((__v2df)__rounded)[0];
1028e5dd7070Spatrick 
1029*12c85518Srobert   return __result;
1030e5dd7070Spatrick }
1031e5dd7070Spatrick 
1032e5dd7070Spatrick /* Microsoft intrinsic.  */
1033*12c85518Srobert extern __inline long long
1034*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1035*12c85518Srobert     _mm_cvtsd_si64x(__m128d __A) {
1036e5dd7070Spatrick   return _mm_cvtsd_si64((__v2df)__A);
1037e5dd7070Spatrick }
1038e5dd7070Spatrick 
1039*12c85518Srobert extern __inline int
1040*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041*12c85518Srobert     _mm_cvttsd_si32(__m128d __A) {
1042*12c85518Srobert   int __result = ((__v2df)__A)[0];
1043e5dd7070Spatrick 
1044*12c85518Srobert   return __result;
1045e5dd7070Spatrick }
1046e5dd7070Spatrick 
1047e5dd7070Spatrick /* Intel intrinsic.  */
1048*12c85518Srobert extern __inline long long
1049*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050*12c85518Srobert     _mm_cvttsd_si64(__m128d __A) {
1051*12c85518Srobert   long long __result = ((__v2df)__A)[0];
1052e5dd7070Spatrick 
1053*12c85518Srobert   return __result;
1054e5dd7070Spatrick }
1055e5dd7070Spatrick 
1056e5dd7070Spatrick /* Microsoft intrinsic.  */
1057*12c85518Srobert extern __inline long long
1058*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059*12c85518Srobert     _mm_cvttsd_si64x(__m128d __A) {
1060e5dd7070Spatrick   return _mm_cvttsd_si64(__A);
1061e5dd7070Spatrick }
1062e5dd7070Spatrick 
1063*12c85518Srobert extern __inline __m128
1064*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065*12c85518Srobert     _mm_cvtsd_ss(__m128 __A, __m128d __B) {
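  /* Convert the low double of __B to single precision and insert it as
     element [0] of __A; elements [1]-[3] of __A pass through unchanged.  */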
1066*12c85518Srobert   __v4sf __result = (__v4sf)__A;
1067e5dd7070Spatrick 
1068e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1069*12c85518Srobert   __v4sf __temp_s;
1070e5dd7070Spatrick   /* Copy double element[0] to element [1] for conversion.  */
1071*12c85518Srobert   __v2df __temp_b = vec_splat((__v2df)__B, 0);
1072e5dd7070Spatrick 
1073e5dd7070Spatrick   /* Pre-rotate __A left 3 (logically right 1) elements.  */
1074*12c85518Srobert   __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1075e5dd7070Spatrick   /* Convert double to single float scalar in a vector.  */
1076*12c85518Srobert   __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1077e5dd7070Spatrick   /* Shift the resulting scalar into vector element [0].  */
1078*12c85518Srobert   __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1079e5dd7070Spatrick #else
1080*12c85518Srobert   __result[0] = ((__v2df)__B)[0];
1081e5dd7070Spatrick #endif
1082*12c85518Srobert   return (__m128)__result;
1083e5dd7070Spatrick }
1084e5dd7070Spatrick 
1085*12c85518Srobert extern __inline __m128d
1086*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087*12c85518Srobert     _mm_cvtsi32_sd(__m128d __A, int __B) {
1088*12c85518Srobert   __v2df __result = (__v2df)__A;
1089*12c85518Srobert   double __db = __B;
1090*12c85518Srobert   __result[0] = __db;
1091*12c85518Srobert   return (__m128d)__result;
1092e5dd7070Spatrick }
1093e5dd7070Spatrick 
1094e5dd7070Spatrick /* Intel intrinsic.  */
1095*12c85518Srobert extern __inline __m128d
1096*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097*12c85518Srobert     _mm_cvtsi64_sd(__m128d __A, long long __B) {
1098*12c85518Srobert   __v2df __result = (__v2df)__A;
1099*12c85518Srobert   double __db = __B;
1100*12c85518Srobert   __result[0] = __db;
1101*12c85518Srobert   return (__m128d)__result;
1102e5dd7070Spatrick }
1103e5dd7070Spatrick 
1104e5dd7070Spatrick /* Microsoft intrinsic.  */
1105*12c85518Srobert extern __inline __m128d
1106*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1107*12c85518Srobert     _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1108e5dd7070Spatrick   return _mm_cvtsi64_sd(__A, __B);
1109e5dd7070Spatrick }
1110e5dd7070Spatrick 
1111*12c85518Srobert extern __inline __m128d
1112*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1113*12c85518Srobert     _mm_cvtss_sd(__m128d __A, __m128 __B) {
1114e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1115e5dd7070Spatrick   /* Use splat to move element [0] into position for the convert. */
1116*12c85518Srobert   __v4sf __temp = vec_splat((__v4sf)__B, 0);
1117*12c85518Srobert   __v2df __res;
1118e5dd7070Spatrick   /* Convert single float scalar to double in a vector.  */
1119*12c85518Srobert   __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1120*12c85518Srobert   return (__m128d)vec_mergel(__res, (__v2df)__A);
1121e5dd7070Spatrick #else
1122*12c85518Srobert   __v2df __res = (__v2df)__A;
1123*12c85518Srobert   __res[0] = ((__v4sf)__B)[0];
1124*12c85518Srobert   return (__m128d)__res;
1125e5dd7070Spatrick #endif
1126e5dd7070Spatrick }
1127e5dd7070Spatrick 
1128*12c85518Srobert extern __inline __m128d
1129*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130*12c85518Srobert     _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
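  /* Bit 0 of __mask selects which double of __A becomes result element
     [0]; bit 1 selects which double of __B becomes element [1].  The
     __GNUC__ test below compensates for the vec_xxpermdi operand order
     used before GCC 6.  */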
1131*12c85518Srobert   __vector double __result;
1132*12c85518Srobert   const int __litmsk = __mask & 0x3;
1133e5dd7070Spatrick 
1134*12c85518Srobert   if (__litmsk == 0)
1135*12c85518Srobert     __result = vec_mergeh(__A, __B);
1136e5dd7070Spatrick #if __GNUC__ < 6
1137*12c85518Srobert   else if (__litmsk == 1)
1138*12c85518Srobert     __result = vec_xxpermdi(__B, __A, 2);
1139*12c85518Srobert   else if (__litmsk == 2)
1140*12c85518Srobert     __result = vec_xxpermdi(__B, __A, 1);
1141e5dd7070Spatrick #else
1142*12c85518Srobert   else if (__litmsk == 1)
1143*12c85518Srobert     __result = vec_xxpermdi(__A, __B, 2);
1144*12c85518Srobert   else if (__litmsk == 2)
1145*12c85518Srobert     __result = vec_xxpermdi(__A, __B, 1);
1146e5dd7070Spatrick #endif
1147e5dd7070Spatrick   else
1148*12c85518Srobert     __result = vec_mergel(__A, __B);
1149e5dd7070Spatrick 
1150*12c85518Srobert   return __result;
1151e5dd7070Spatrick }
1152e5dd7070Spatrick 
1153*12c85518Srobert extern __inline __m128d
1154*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1155*12c85518Srobert     _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1156e5dd7070Spatrick   return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1157e5dd7070Spatrick }
1158e5dd7070Spatrick 
1159*12c85518Srobert extern __inline __m128d
1160*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161*12c85518Srobert     _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1162e5dd7070Spatrick   return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1163e5dd7070Spatrick }
1164e5dd7070Spatrick 
1165*12c85518Srobert extern __inline __m128d
1166*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1167*12c85518Srobert     _mm_loadh_pd(__m128d __A, double const *__B) {
1168*12c85518Srobert   __v2df __result = (__v2df)__A;
1169*12c85518Srobert   __result[1] = *__B;
1170*12c85518Srobert   return (__m128d)__result;
1171e5dd7070Spatrick }
1172e5dd7070Spatrick 
1173*12c85518Srobert extern __inline __m128d
1174*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175*12c85518Srobert     _mm_loadl_pd(__m128d __A, double const *__B) {
1176*12c85518Srobert   __v2df __result = (__v2df)__A;
1177*12c85518Srobert   __result[0] = *__B;
1178*12c85518Srobert   return (__m128d)__result;
1179e5dd7070Spatrick }
1180e5dd7070Spatrick 
1181e5dd7070Spatrick #ifdef _ARCH_PWR8
1182e5dd7070Spatrick /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1183e5dd7070Spatrick 
1184e5dd7070Spatrick /* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1185*12c85518Srobert extern __inline int
1186*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1187*12c85518Srobert     _mm_movemask_pd(__m128d __A) {
1188*12c85518Srobert #ifdef _ARCH_PWR10
1189*12c85518Srobert   return vec_extractm((__v2du)__A);
1190*12c85518Srobert #else
1191*12c85518Srobert   __vector unsigned long long __result;
1192*12c85518Srobert   static const __vector unsigned int __perm_mask = {
1193e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1194e5dd7070Spatrick       0x80800040, 0x80808080, 0x80808080, 0x80808080
1195e5dd7070Spatrick #else
1196e5dd7070Spatrick       0x80808080, 0x80808080, 0x80808080, 0x80804000
1197e5dd7070Spatrick #endif
1198e5dd7070Spatrick   };
1199e5dd7070Spatrick 
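  /* vec_vbpermq gathers the bits of __A selected by __perm_mask (here
     the two sign bits) into the low-order bits of one doubleword of the
     result.  */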
1200*12c85518Srobert   __result = ((__vector unsigned long long)vec_vbpermq(
1201*12c85518Srobert       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1202e5dd7070Spatrick 
1203e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1204*12c85518Srobert   return __result[1];
1205e5dd7070Spatrick #else
1206*12c85518Srobert   return __result[0];
1207e5dd7070Spatrick #endif
1208*12c85518Srobert #endif /* !_ARCH_PWR10 */
1209e5dd7070Spatrick }
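/* Illustrative use (not part of this header): _mm_movemask_pd collects
   the sign bits of the two doubles into bits 0 and 1 of the result,
   e.g. with _mm_set_pd from earlier in this header:

     __m128d __v = _mm_set_pd(-1.0, 2.0);
     int __signs = _mm_movemask_pd(__v);  // 0x2: only element [1] is negative
*/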
1210e5dd7070Spatrick #endif /* _ARCH_PWR8 */
1211e5dd7070Spatrick 
1212*12c85518Srobert extern __inline __m128i
1213*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214*12c85518Srobert     _mm_packs_epi16(__m128i __A, __m128i __B) {
1215e5dd7070Spatrick   return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1216e5dd7070Spatrick }
1217e5dd7070Spatrick 
1218*12c85518Srobert extern __inline __m128i
1219*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220*12c85518Srobert     _mm_packs_epi32(__m128i __A, __m128i __B) {
1221e5dd7070Spatrick   return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1222e5dd7070Spatrick }
1223e5dd7070Spatrick 
1224*12c85518Srobert extern __inline __m128i
1225*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226*12c85518Srobert     _mm_packus_epi16(__m128i __A, __m128i __B) {
1227e5dd7070Spatrick   return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1228e5dd7070Spatrick }
1229e5dd7070Spatrick 
1230*12c85518Srobert extern __inline __m128i
1231*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1232*12c85518Srobert     _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1233e5dd7070Spatrick   return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1234e5dd7070Spatrick }
1235e5dd7070Spatrick 
1236*12c85518Srobert extern __inline __m128i
1237*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1238*12c85518Srobert     _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1239e5dd7070Spatrick   return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1240e5dd7070Spatrick }
1241e5dd7070Spatrick 
1242*12c85518Srobert extern __inline __m128i
1243*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244*12c85518Srobert     _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1245e5dd7070Spatrick   return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1246e5dd7070Spatrick }
1247e5dd7070Spatrick 
1248*12c85518Srobert extern __inline __m128i
1249*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250*12c85518Srobert     _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1251*12c85518Srobert   return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1252e5dd7070Spatrick }
1253e5dd7070Spatrick 
1254*12c85518Srobert extern __inline __m128i
1255*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1256*12c85518Srobert     _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1257e5dd7070Spatrick   return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1258e5dd7070Spatrick }
1259e5dd7070Spatrick 
1260*12c85518Srobert extern __inline __m128i
1261*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262*12c85518Srobert     _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1263e5dd7070Spatrick   return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1264e5dd7070Spatrick }
1265e5dd7070Spatrick 
1266*12c85518Srobert extern __inline __m128i
1267*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268*12c85518Srobert     _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1269e5dd7070Spatrick   return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1270e5dd7070Spatrick }
1271e5dd7070Spatrick 
1272*12c85518Srobert extern __inline __m128i
1273*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274*12c85518Srobert     _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1275*12c85518Srobert   return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1276e5dd7070Spatrick }
1277e5dd7070Spatrick 
1278*12c85518Srobert extern __inline __m128i
1279*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280*12c85518Srobert     _mm_add_epi8(__m128i __A, __m128i __B) {
1281e5dd7070Spatrick   return (__m128i)((__v16qu)__A + (__v16qu)__B);
1282e5dd7070Spatrick }
1283e5dd7070Spatrick 
1284*12c85518Srobert extern __inline __m128i
1285*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286*12c85518Srobert     _mm_add_epi16(__m128i __A, __m128i __B) {
1287e5dd7070Spatrick   return (__m128i)((__v8hu)__A + (__v8hu)__B);
1288e5dd7070Spatrick }
1289e5dd7070Spatrick 
1290*12c85518Srobert extern __inline __m128i
1291*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292*12c85518Srobert     _mm_add_epi32(__m128i __A, __m128i __B) {
1293e5dd7070Spatrick   return (__m128i)((__v4su)__A + (__v4su)__B);
1294e5dd7070Spatrick }
1295e5dd7070Spatrick 
1296*12c85518Srobert extern __inline __m128i
1297*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298*12c85518Srobert     _mm_add_epi64(__m128i __A, __m128i __B) {
1299e5dd7070Spatrick   return (__m128i)((__v2du)__A + (__v2du)__B);
1300e5dd7070Spatrick }
1301e5dd7070Spatrick 
1302*12c85518Srobert extern __inline __m128i
1303*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304*12c85518Srobert     _mm_adds_epi8(__m128i __A, __m128i __B) {
1305e5dd7070Spatrick   return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1306e5dd7070Spatrick }
1307e5dd7070Spatrick 
1308*12c85518Srobert extern __inline __m128i
1309*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310*12c85518Srobert     _mm_adds_epi16(__m128i __A, __m128i __B) {
1311e5dd7070Spatrick   return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1312e5dd7070Spatrick }
1313e5dd7070Spatrick 
1314*12c85518Srobert extern __inline __m128i
1315*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316*12c85518Srobert     _mm_adds_epu8(__m128i __A, __m128i __B) {
1317e5dd7070Spatrick   return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1318e5dd7070Spatrick }
1319e5dd7070Spatrick 
1320*12c85518Srobert extern __inline __m128i
1321*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322*12c85518Srobert     _mm_adds_epu16(__m128i __A, __m128i __B) {
1323e5dd7070Spatrick   return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1324e5dd7070Spatrick }
1325e5dd7070Spatrick 
1326*12c85518Srobert extern __inline __m128i
1327*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328*12c85518Srobert     _mm_sub_epi8(__m128i __A, __m128i __B) {
1329e5dd7070Spatrick   return (__m128i)((__v16qu)__A - (__v16qu)__B);
1330e5dd7070Spatrick }
1331e5dd7070Spatrick 
1332*12c85518Srobert extern __inline __m128i
1333*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1334*12c85518Srobert     _mm_sub_epi16(__m128i __A, __m128i __B) {
1335e5dd7070Spatrick   return (__m128i)((__v8hu)__A - (__v8hu)__B);
1336e5dd7070Spatrick }
1337e5dd7070Spatrick 
1338*12c85518Srobert extern __inline __m128i
1339*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340*12c85518Srobert     _mm_sub_epi32(__m128i __A, __m128i __B) {
1341e5dd7070Spatrick   return (__m128i)((__v4su)__A - (__v4su)__B);
1342e5dd7070Spatrick }
1343e5dd7070Spatrick 
1344*12c85518Srobert extern __inline __m128i
1345*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346*12c85518Srobert     _mm_sub_epi64(__m128i __A, __m128i __B) {
1347e5dd7070Spatrick   return (__m128i)((__v2du)__A - (__v2du)__B);
1348e5dd7070Spatrick }
1349e5dd7070Spatrick 
1350*12c85518Srobert extern __inline __m128i
1351*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352*12c85518Srobert     _mm_subs_epi8(__m128i __A, __m128i __B) {
1353e5dd7070Spatrick   return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1354e5dd7070Spatrick }
1355e5dd7070Spatrick 
1356*12c85518Srobert extern __inline __m128i
1357*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358*12c85518Srobert     _mm_subs_epi16(__m128i __A, __m128i __B) {
1359e5dd7070Spatrick   return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1360e5dd7070Spatrick }
1361e5dd7070Spatrick 
1362*12c85518Srobert extern __inline __m128i
1363*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364*12c85518Srobert     _mm_subs_epu8(__m128i __A, __m128i __B) {
1365e5dd7070Spatrick   return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1366e5dd7070Spatrick }
1367e5dd7070Spatrick 
1368*12c85518Srobert extern __inline __m128i
1369*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370*12c85518Srobert     _mm_subs_epu16(__m128i __A, __m128i __B) {
1371e5dd7070Spatrick   return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1372e5dd7070Spatrick }
1373e5dd7070Spatrick 
1374*12c85518Srobert extern __inline __m128i
1375*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376*12c85518Srobert     _mm_madd_epi16(__m128i __A, __m128i __B) {
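  /* Multiply corresponding signed 16-bit elements and add each adjacent
     pair of 32-bit products; vec_vmsumshm does both steps using a zero
     accumulator.  */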
1377*12c85518Srobert   __vector signed int __zero = {0, 0, 0, 0};
1378e5dd7070Spatrick 
1379*12c85518Srobert   return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1380e5dd7070Spatrick }
1381e5dd7070Spatrick 
1382*12c85518Srobert extern __inline __m128i
1383*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384*12c85518Srobert     _mm_mulhi_epi16(__m128i __A, __m128i __B) {
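  /* vec_vmulesh/vec_vmulosh form the full 32-bit products of the even
     and odd 16-bit elements; the permute below keeps only the high
     halfword of each product, in the original element order.  */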
1385*12c85518Srobert   __vector signed int __w0, __w1;
1386e5dd7070Spatrick 
1387*12c85518Srobert   __vector unsigned char __xform1 = {
1388e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1389*12c85518Srobert       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1390*12c85518Srobert       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1391e5dd7070Spatrick #else
1392*12c85518Srobert       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1393*12c85518Srobert       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1394e5dd7070Spatrick #endif
1395e5dd7070Spatrick   };
1396e5dd7070Spatrick 
1397*12c85518Srobert   __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1398*12c85518Srobert   __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1399*12c85518Srobert   return (__m128i)vec_perm(__w0, __w1, __xform1);
1400e5dd7070Spatrick }
1401e5dd7070Spatrick 
1402*12c85518Srobert extern __inline __m128i
1403*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1404*12c85518Srobert     _mm_mullo_epi16(__m128i __A, __m128i __B) {
1405e5dd7070Spatrick   return (__m128i)((__v8hi)__A * (__v8hi)__B);
1406e5dd7070Spatrick }
1407e5dd7070Spatrick 
1408*12c85518Srobert extern __inline __m64
1409*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1410*12c85518Srobert     _mm_mul_su32(__m64 __A, __m64 __B) {
1411*12c85518Srobert   unsigned int __a = __A;
1412*12c85518Srobert   unsigned int __b = __B;
1413e5dd7070Spatrick 
1414*12c85518Srobert   return ((__m64)__a * (__m64)__b);
1415e5dd7070Spatrick }
1416e5dd7070Spatrick 
1417*12c85518Srobert #ifdef _ARCH_PWR8
1418*12c85518Srobert extern __inline __m128i
1419*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420*12c85518Srobert     _mm_mul_epu32(__m128i __A, __m128i __B) {
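  /* Multiply the unsigned 32-bit elements 0 and 2 of __A and __B,
     producing two unsigned 64-bit products (the x86 pmuludq
     operation).  */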
1421e5dd7070Spatrick #if __GNUC__ < 8
1422*12c85518Srobert   __v2du __result;
1423e5dd7070Spatrick 
1424e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1425e5dd7070Spatrick   /* VMX Vector Multiply Odd Unsigned Word.  */
1426*12c85518Srobert   __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1427e5dd7070Spatrick #else
1428e5dd7070Spatrick   /* VMX Vector Multiply Even Unsigned Word.  */
1429*12c85518Srobert   __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1430e5dd7070Spatrick #endif
1431*12c85518Srobert   return (__m128i)__result;
1432e5dd7070Spatrick #else
1433e5dd7070Spatrick   return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1434e5dd7070Spatrick #endif
1435e5dd7070Spatrick }
1436*12c85518Srobert #endif
1437e5dd7070Spatrick 
1438*12c85518Srobert extern __inline __m128i
1439*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1440*12c85518Srobert     _mm_slli_epi16(__m128i __A, int __B) {
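  /* Shift counts outside 0-15 yield zero (the x86 psllw semantics),
     which is why __result starts out as all zeros.  */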
1441*12c85518Srobert   __v8hu __lshift;
1442*12c85518Srobert   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1443e5dd7070Spatrick 
1444*12c85518Srobert   if (__B >= 0 && __B < 16) {
1445e5dd7070Spatrick     if (__builtin_constant_p(__B))
1446*12c85518Srobert       __lshift = (__v8hu)vec_splat_s16(__B);
1447e5dd7070Spatrick     else
1448*12c85518Srobert       __lshift = vec_splats((unsigned short)__B);
1449e5dd7070Spatrick 
1450*12c85518Srobert     __result = vec_sl((__v8hi)__A, __lshift);
1451e5dd7070Spatrick   }
1452e5dd7070Spatrick 
1453*12c85518Srobert   return (__m128i)__result;
1454e5dd7070Spatrick }
1455e5dd7070Spatrick 
1456*12c85518Srobert extern __inline __m128i
1457*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458*12c85518Srobert     _mm_slli_epi32(__m128i __A, int __B) {
1459*12c85518Srobert   __v4su __lshift;
1460*12c85518Srobert   __v4si __result = {0, 0, 0, 0};
1461e5dd7070Spatrick 
1462*12c85518Srobert   if (__B >= 0 && __B < 32) {
1463e5dd7070Spatrick     if (__builtin_constant_p(__B) && __B < 16)
1464*12c85518Srobert       __lshift = (__v4su)vec_splat_s32(__B);
1465e5dd7070Spatrick     else
1466*12c85518Srobert       __lshift = vec_splats((unsigned int)__B);
1467e5dd7070Spatrick 
1468*12c85518Srobert     __result = vec_sl((__v4si)__A, __lshift);
1469e5dd7070Spatrick   }
1470e5dd7070Spatrick 
1471*12c85518Srobert   return (__m128i)__result;
1472e5dd7070Spatrick }
1473e5dd7070Spatrick 
1474e5dd7070Spatrick #ifdef _ARCH_PWR8
1475*12c85518Srobert extern __inline __m128i
1476*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1477*12c85518Srobert     _mm_slli_epi64(__m128i __A, int __B) {
1478*12c85518Srobert   __v2du __lshift;
1479*12c85518Srobert   __v2di __result = {0, 0};
1480e5dd7070Spatrick 
1481*12c85518Srobert   if (__B >= 0 && __B < 64) {
1482e5dd7070Spatrick     if (__builtin_constant_p(__B) && __B < 16)
1483*12c85518Srobert       __lshift = (__v2du)vec_splat_s32(__B);
1484e5dd7070Spatrick     else
1485*12c85518Srobert       __lshift = (__v2du)vec_splats((unsigned int)__B);
1486e5dd7070Spatrick 
1487*12c85518Srobert     __result = vec_sl((__v2di)__A, __lshift);
1488e5dd7070Spatrick   }
1489e5dd7070Spatrick 
1490*12c85518Srobert   return (__m128i)__result;
1491e5dd7070Spatrick }
1492e5dd7070Spatrick #endif
1493e5dd7070Spatrick 
1494*12c85518Srobert extern __inline __m128i
1495*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1496*12c85518Srobert     _mm_srai_epi16(__m128i __A, int __B) {
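  /* For arithmetic right shifts a count of 16 or more behaves like a
     shift by 15 (each element becomes its sign bit), hence the default
     __rshift of 15.  */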
1497*12c85518Srobert   __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1498*12c85518Srobert   __v8hi __result;
1499e5dd7070Spatrick 
1500*12c85518Srobert   if (__B < 16) {
1501e5dd7070Spatrick     if (__builtin_constant_p(__B))
1502*12c85518Srobert       __rshift = (__v8hu)vec_splat_s16(__B);
1503e5dd7070Spatrick     else
1504*12c85518Srobert       __rshift = vec_splats((unsigned short)__B);
1505e5dd7070Spatrick   }
1506*12c85518Srobert   __result = vec_sra((__v8hi)__A, __rshift);
1507e5dd7070Spatrick 
1508*12c85518Srobert   return (__m128i)__result;
1509e5dd7070Spatrick }
1510e5dd7070Spatrick 
1511*12c85518Srobert extern __inline __m128i
1512*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1513*12c85518Srobert     _mm_srai_epi32(__m128i __A, int __B) {
1514*12c85518Srobert   __v4su __rshift = {31, 31, 31, 31};
1515*12c85518Srobert   __v4si __result;
1516e5dd7070Spatrick 
1517*12c85518Srobert   if (__B < 32) {
1518*12c85518Srobert     if (__builtin_constant_p(__B)) {
1519e5dd7070Spatrick       if (__B < 16)
1520*12c85518Srobert         __rshift = (__v4su)vec_splat_s32(__B);
1521e5dd7070Spatrick       else
1522*12c85518Srobert         __rshift = (__v4su)vec_splats((unsigned int)__B);
1523*12c85518Srobert     } else
1524*12c85518Srobert       __rshift = vec_splats((unsigned int)__B);
1525e5dd7070Spatrick   }
1526*12c85518Srobert   __result = vec_sra((__v4si)__A, __rshift);
1527e5dd7070Spatrick 
1528*12c85518Srobert   return (__m128i)__result;
1529e5dd7070Spatrick }
1530e5dd7070Spatrick 
1531*12c85518Srobert extern __inline __m128i
1532*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1533*12c85518Srobert     _mm_bslli_si128(__m128i __A, const int __N) {
1534*12c85518Srobert   __v16qu __result;
1535*12c85518Srobert   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1536e5dd7070Spatrick 
1537e5dd7070Spatrick   if (__N < 16)
1538*12c85518Srobert     __result = vec_sld((__v16qu)__A, __zeros, __N);
1539e5dd7070Spatrick   else
1540*12c85518Srobert     __result = __zeros;
1541e5dd7070Spatrick 
1542*12c85518Srobert   return (__m128i)__result;
1543e5dd7070Spatrick }
1544e5dd7070Spatrick 
1545*12c85518Srobert extern __inline __m128i
1546*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1547*12c85518Srobert     _mm_bsrli_si128(__m128i __A, const int __N) {
1548*12c85518Srobert   __v16qu __result;
1549*12c85518Srobert   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1550e5dd7070Spatrick 
1551e5dd7070Spatrick   if (__N < 16)
1552e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1553e5dd7070Spatrick     if (__builtin_constant_p(__N))
1554e5dd7070Spatrick       /* We would like to use Vector Shift Left Double by Octet
1555e5dd7070Spatrick          Immediate here to use the immediate form and avoid loading
1556e5dd7070Spatrick          the __N * 8 value into a separate VR.  */
1557*12c85518Srobert       __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1558e5dd7070Spatrick     else
1559e5dd7070Spatrick #endif
1560e5dd7070Spatrick     {
1561*12c85518Srobert       __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1562e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1563*12c85518Srobert       __result = vec_sro((__v16qu)__A, __shift);
1564e5dd7070Spatrick #else
1565*12c85518Srobert     __result = vec_slo((__v16qu)__A, __shift);
1566e5dd7070Spatrick #endif
1567e5dd7070Spatrick     }
1568e5dd7070Spatrick   else
1569*12c85518Srobert     __result = __zeros;
1570e5dd7070Spatrick 
1571*12c85518Srobert   return (__m128i)__result;
1572e5dd7070Spatrick }
1573e5dd7070Spatrick 
1574*12c85518Srobert extern __inline __m128i
1575*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1576*12c85518Srobert     _mm_srli_si128(__m128i __A, const int __N) {
1577e5dd7070Spatrick   return _mm_bsrli_si128(__A, __N);
1578e5dd7070Spatrick }
1579e5dd7070Spatrick 
1580*12c85518Srobert extern __inline __m128i
1581*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582*12c85518Srobert     _mm_slli_si128(__m128i __A, const int _imm5) {
1583*12c85518Srobert   __v16qu __result;
1584*12c85518Srobert   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1585e5dd7070Spatrick 
1586e5dd7070Spatrick   if (_imm5 < 16)
1587e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1588*12c85518Srobert     __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1589e5dd7070Spatrick #else
1590*12c85518Srobert     __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1591e5dd7070Spatrick #endif
1592e5dd7070Spatrick   else
1593*12c85518Srobert     __result = __zeros;
1594e5dd7070Spatrick 
1595*12c85518Srobert   return (__m128i)__result;
1596e5dd7070Spatrick }
1597e5dd7070Spatrick 
1598*12c85518Srobert extern __inline __m128i
1599*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601*12c85518Srobert     _mm_srli_epi16(__m128i __A, int __B) {
1602*12c85518Srobert   __v8hu __rshift;
1603*12c85518Srobert   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1604e5dd7070Spatrick 
1605*12c85518Srobert   if (__B < 16) {
1606e5dd7070Spatrick     if (__builtin_constant_p(__B))
1607*12c85518Srobert       __rshift = (__v8hu)vec_splat_s16(__B);
1608e5dd7070Spatrick     else
1609*12c85518Srobert       __rshift = vec_splats((unsigned short)__B);
1610e5dd7070Spatrick 
1611*12c85518Srobert     __result = vec_sr((__v8hi)__A, __rshift);
1612e5dd7070Spatrick   }
1613e5dd7070Spatrick 
1614*12c85518Srobert   return (__m128i)__result;
1615e5dd7070Spatrick }
1616e5dd7070Spatrick 
1617*12c85518Srobert extern __inline __m128i
1618*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1619*12c85518Srobert     _mm_srli_epi32(__m128i __A, int __B) {
1620*12c85518Srobert   __v4su __rshift;
1621*12c85518Srobert   __v4si __result = {0, 0, 0, 0};
1622e5dd7070Spatrick 
1623*12c85518Srobert   if (__B < 32) {
1624*12c85518Srobert     if (__builtin_constant_p(__B)) {
1625e5dd7070Spatrick       if (__B < 16)
1626*12c85518Srobert         __rshift = (__v4su)vec_splat_s32(__B);
1627e5dd7070Spatrick       else
1628*12c85518Srobert         __rshift = (__v4su)vec_splats((unsigned int)__B);
1629*12c85518Srobert     } else
1630*12c85518Srobert       __rshift = vec_splats((unsigned int)__B);
1631e5dd7070Spatrick 
1632*12c85518Srobert     __result = vec_sr((__v4si)__A, __rshift);
1633e5dd7070Spatrick   }
1634e5dd7070Spatrick 
1635*12c85518Srobert   return (__m128i)__result;
1636e5dd7070Spatrick }
1637e5dd7070Spatrick 
1638e5dd7070Spatrick #ifdef _ARCH_PWR8
1639*12c85518Srobert extern __inline __m128i
1640*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1641*12c85518Srobert     _mm_srli_epi64(__m128i __A, int __B) {
1642*12c85518Srobert   __v2du __rshift;
1643*12c85518Srobert   __v2di __result = {0, 0};
1644e5dd7070Spatrick 
1645*12c85518Srobert   if (__B < 64) {
1646*12c85518Srobert     if (__builtin_constant_p(__B)) {
1647e5dd7070Spatrick       if (__B < 16)
1648*12c85518Srobert         __rshift = (__v2du)vec_splat_s32(__B);
1649e5dd7070Spatrick       else
1650*12c85518Srobert         __rshift = (__v2du)vec_splats((unsigned long long)__B);
1651*12c85518Srobert     } else
1652*12c85518Srobert       __rshift = (__v2du)vec_splats((unsigned int)__B);
1653e5dd7070Spatrick 
1654*12c85518Srobert     __result = vec_sr((__v2di)__A, __rshift);
1655e5dd7070Spatrick   }
1656e5dd7070Spatrick 
1657*12c85518Srobert   return (__m128i)__result;
1658e5dd7070Spatrick }
1659e5dd7070Spatrick #endif
1660e5dd7070Spatrick 
1661*12c85518Srobert extern __inline __m128i
1662*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1663*12c85518Srobert     _mm_sll_epi16(__m128i __A, __m128i __B) {
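  /* vec_sl uses the shift count modulo 16, but x86 psllw requires
     counts greater than 15 to produce zero; the compare/select below
     zeroes the result in that case.  */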
1664*12c85518Srobert   __v8hu __lshift;
1665*12c85518Srobert   __vector __bool short __shmask;
1666*12c85518Srobert   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1667*12c85518Srobert   __v8hu __result;
1668e5dd7070Spatrick 
1669e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1670*12c85518Srobert   __lshift = vec_splat((__v8hu)__B, 0);
1671e5dd7070Spatrick #else
1672*12c85518Srobert   __lshift = vec_splat((__v8hu)__B, 3);
1673e5dd7070Spatrick #endif
1674*12c85518Srobert   __shmask = vec_cmple(__lshift, __shmax);
1675*12c85518Srobert   __result = vec_sl((__v8hu)__A, __lshift);
1676*12c85518Srobert   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1677e5dd7070Spatrick 
1678*12c85518Srobert   return (__m128i)__result;
1679e5dd7070Spatrick }
1680e5dd7070Spatrick 
1681*12c85518Srobert extern __inline __m128i
1682*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683*12c85518Srobert     _mm_sll_epi32(__m128i __A, __m128i __B) {
1684*12c85518Srobert   __v4su __lshift;
1685*12c85518Srobert   __vector __bool int __shmask;
1686*12c85518Srobert   const __v4su __shmax = {32, 32, 32, 32};
1687*12c85518Srobert   __v4su __result;
1688e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1689*12c85518Srobert   __lshift = vec_splat((__v4su)__B, 0);
1690e5dd7070Spatrick #else
1691*12c85518Srobert   __lshift = vec_splat((__v4su)__B, 1);
1692e5dd7070Spatrick #endif
1693*12c85518Srobert   __shmask = vec_cmplt(__lshift, __shmax);
1694*12c85518Srobert   __result = vec_sl((__v4su)__A, __lshift);
1695*12c85518Srobert   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1696e5dd7070Spatrick 
1697*12c85518Srobert   return (__m128i)__result;
1698e5dd7070Spatrick }
1699e5dd7070Spatrick 
1700e5dd7070Spatrick #ifdef _ARCH_PWR8
1701*12c85518Srobert extern __inline __m128i
1702*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1703*12c85518Srobert     _mm_sll_epi64(__m128i __A, __m128i __B) {
1704*12c85518Srobert   __v2du __lshift;
1705*12c85518Srobert   __vector __bool long long __shmask;
1706*12c85518Srobert   const __v2du __shmax = {64, 64};
1707*12c85518Srobert   __v2du __result;
1708e5dd7070Spatrick 
1709*12c85518Srobert   __lshift = vec_splat((__v2du)__B, 0);
1710*12c85518Srobert   __shmask = vec_cmplt(__lshift, __shmax);
1711*12c85518Srobert   __result = vec_sl((__v2du)__A, __lshift);
1712*12c85518Srobert   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1713e5dd7070Spatrick 
1714*12c85518Srobert   return (__m128i)__result;
1715e5dd7070Spatrick }
1716e5dd7070Spatrick #endif
1717e5dd7070Spatrick 
1718*12c85518Srobert extern __inline __m128i
1719*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720*12c85518Srobert     _mm_sra_epi16(__m128i __A, __m128i __B) {
1721*12c85518Srobert   const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1722*12c85518Srobert   __v8hu __rshift;
1723*12c85518Srobert   __v8hi __result;
1724e5dd7070Spatrick 
1725e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1726*12c85518Srobert   __rshift = vec_splat((__v8hu)__B, 0);
1727e5dd7070Spatrick #else
1728*12c85518Srobert   __rshift = vec_splat((__v8hu)__B, 3);
1729e5dd7070Spatrick #endif
1730*12c85518Srobert   __rshift = vec_min(__rshift, __rshmax);
1731*12c85518Srobert   __result = vec_sra((__v8hi)__A, __rshift);
1732e5dd7070Spatrick 
1733*12c85518Srobert   return (__m128i)__result;
1734e5dd7070Spatrick }
1735e5dd7070Spatrick 
1736*12c85518Srobert extern __inline __m128i
1737*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738*12c85518Srobert     _mm_sra_epi32(__m128i __A, __m128i __B) {
1739*12c85518Srobert   const __v4su __rshmax = {31, 31, 31, 31};
1740*12c85518Srobert   __v4su __rshift;
1741*12c85518Srobert   __v4si __result;
1742e5dd7070Spatrick 
1743e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1744*12c85518Srobert   __rshift = vec_splat((__v4su)__B, 0);
1745e5dd7070Spatrick #else
1746*12c85518Srobert   __rshift = vec_splat((__v4su)__B, 1);
1747e5dd7070Spatrick #endif
1748*12c85518Srobert   __rshift = vec_min(__rshift, __rshmax);
1749*12c85518Srobert   __result = vec_sra((__v4si)__A, __rshift);
1750e5dd7070Spatrick 
1751*12c85518Srobert   return (__m128i)__result;
1752e5dd7070Spatrick }
1753e5dd7070Spatrick 
1754*12c85518Srobert extern __inline __m128i
1755*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756*12c85518Srobert     _mm_srl_epi16(__m128i __A, __m128i __B) {
1757*12c85518Srobert   __v8hu __rshift;
1758*12c85518Srobert   __vector __bool short __shmask;
1759*12c85518Srobert   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1760*12c85518Srobert   __v8hu __result;
1761e5dd7070Spatrick 
1762e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1763*12c85518Srobert   __rshift = vec_splat((__v8hu)__B, 0);
1764e5dd7070Spatrick #else
1765*12c85518Srobert   __rshift = vec_splat((__v8hu)__B, 3);
1766e5dd7070Spatrick #endif
1767*12c85518Srobert   __shmask = vec_cmple(__rshift, __shmax);
1768*12c85518Srobert   __result = vec_sr((__v8hu)__A, __rshift);
1769*12c85518Srobert   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1770e5dd7070Spatrick 
1771*12c85518Srobert   return (__m128i)__result;
1772e5dd7070Spatrick }
1773e5dd7070Spatrick 
1774*12c85518Srobert extern __inline __m128i
1775*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1776*12c85518Srobert     _mm_srl_epi32(__m128i __A, __m128i __B) {
1777*12c85518Srobert   __v4su __rshift;
1778*12c85518Srobert   __vector __bool int __shmask;
1779*12c85518Srobert   const __v4su __shmax = {32, 32, 32, 32};
1780*12c85518Srobert   __v4su __result;
1781e5dd7070Spatrick 
1782e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1783*12c85518Srobert   __rshift = vec_splat((__v4su)__B, 0);
1784e5dd7070Spatrick #else
1785*12c85518Srobert   __rshift = vec_splat((__v4su)__B, 1);
1786e5dd7070Spatrick #endif
1787*12c85518Srobert   __shmask = vec_cmplt(__rshift, __shmax);
1788*12c85518Srobert   __result = vec_sr((__v4su)__A, __rshift);
1789*12c85518Srobert   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1790e5dd7070Spatrick 
1791*12c85518Srobert   return (__m128i)__result;
1792e5dd7070Spatrick }
1793e5dd7070Spatrick 
1794e5dd7070Spatrick #ifdef _ARCH_PWR8
1795*12c85518Srobert extern __inline __m128i
1796*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1797*12c85518Srobert     _mm_srl_epi64(__m128i __A, __m128i __B) {
1798*12c85518Srobert   __v2du __rshift;
1799*12c85518Srobert   __vector __bool long long __shmask;
1800*12c85518Srobert   const __v2du __shmax = {64, 64};
1801*12c85518Srobert   __v2du __result;
1802e5dd7070Spatrick 
1803*12c85518Srobert   __rshift = vec_splat((__v2du)__B, 0);
1804*12c85518Srobert   __shmask = vec_cmplt(__rshift, __shmax);
1805*12c85518Srobert   __result = vec_sr((__v2du)__A, __rshift);
1806*12c85518Srobert   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1807e5dd7070Spatrick 
1808*12c85518Srobert   return (__m128i)__result;
1809e5dd7070Spatrick }
1810e5dd7070Spatrick #endif
1811e5dd7070Spatrick 
1812*12c85518Srobert extern __inline __m128d
1813*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1814*12c85518Srobert     _mm_and_pd(__m128d __A, __m128d __B) {
1815e5dd7070Spatrick   return (vec_and((__v2df)__A, (__v2df)__B));
1816e5dd7070Spatrick }
1817e5dd7070Spatrick 
1818*12c85518Srobert extern __inline __m128d
1819*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1820*12c85518Srobert     _mm_andnot_pd(__m128d __A, __m128d __B) {
1821e5dd7070Spatrick   return (vec_andc((__v2df)__B, (__v2df)__A));
1822e5dd7070Spatrick }
1823e5dd7070Spatrick 
1824*12c85518Srobert extern __inline __m128d
1825*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1826*12c85518Srobert     _mm_or_pd(__m128d __A, __m128d __B) {
1827e5dd7070Spatrick   return (vec_or((__v2df)__A, (__v2df)__B));
1828e5dd7070Spatrick }
1829e5dd7070Spatrick 
1830*12c85518Srobert extern __inline __m128d
1831*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1832*12c85518Srobert     _mm_xor_pd(__m128d __A, __m128d __B) {
1833e5dd7070Spatrick   return (vec_xor((__v2df)__A, (__v2df)__B));
1834e5dd7070Spatrick }
1835e5dd7070Spatrick 
1836*12c85518Srobert extern __inline __m128i
1837*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1838*12c85518Srobert     _mm_and_si128(__m128i __A, __m128i __B) {
1839e5dd7070Spatrick   return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1840e5dd7070Spatrick }
1841e5dd7070Spatrick 
1842*12c85518Srobert extern __inline __m128i
1843*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1844*12c85518Srobert     _mm_andnot_si128(__m128i __A, __m128i __B) {
1845e5dd7070Spatrick   return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1846e5dd7070Spatrick }
1847e5dd7070Spatrick 
1848*12c85518Srobert extern __inline __m128i
1849*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1850*12c85518Srobert     _mm_or_si128(__m128i __A, __m128i __B) {
1851e5dd7070Spatrick   return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1852e5dd7070Spatrick }
1853e5dd7070Spatrick 
1854*12c85518Srobert extern __inline __m128i
1855*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1856*12c85518Srobert     _mm_xor_si128(__m128i __A, __m128i __B) {
1857e5dd7070Spatrick   return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1858e5dd7070Spatrick }
1859e5dd7070Spatrick 
1860*12c85518Srobert extern __inline __m128i
1861*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1862*12c85518Srobert     _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1863e5dd7070Spatrick   return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1864e5dd7070Spatrick }
1865e5dd7070Spatrick 
1866*12c85518Srobert extern __inline __m128i
1867*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1868*12c85518Srobert     _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1869e5dd7070Spatrick   return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1870e5dd7070Spatrick }
1871e5dd7070Spatrick 
1872*12c85518Srobert extern __inline __m128i
1873*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1874*12c85518Srobert     _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1875e5dd7070Spatrick   return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1876e5dd7070Spatrick }
1877e5dd7070Spatrick 
1878*12c85518Srobert extern __inline __m128i
1879*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1880*12c85518Srobert     _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1881e5dd7070Spatrick   return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1882e5dd7070Spatrick }
1883e5dd7070Spatrick 
1884*12c85518Srobert extern __inline __m128i
1885*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1886*12c85518Srobert     _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1887e5dd7070Spatrick   return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1888e5dd7070Spatrick }
1889e5dd7070Spatrick 
1890*12c85518Srobert extern __inline __m128i
1891*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1892*12c85518Srobert     _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1893e5dd7070Spatrick   return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1894e5dd7070Spatrick }
1895e5dd7070Spatrick 
1896*12c85518Srobert extern __inline __m128i
1897*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1898*12c85518Srobert     _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1899e5dd7070Spatrick   return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1900e5dd7070Spatrick }
1901e5dd7070Spatrick 
1902*12c85518Srobert extern __inline __m128i
1903*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1904*12c85518Srobert     _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1905e5dd7070Spatrick   return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1906e5dd7070Spatrick }
1907e5dd7070Spatrick 
1908*12c85518Srobert extern __inline __m128i
1909*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1910*12c85518Srobert     _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1911e5dd7070Spatrick   return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1912e5dd7070Spatrick }
1913e5dd7070Spatrick 
1914*12c85518Srobert extern __inline int
1915*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1916*12c85518Srobert     _mm_extract_epi16(__m128i const __A, int const __N) {
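  /* The unsigned short cast zero-extends the selected element, matching
     the x86 pextrw behavior.  */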
1917e5dd7070Spatrick   return (unsigned short)((__v8hi)__A)[__N & 7];
1918e5dd7070Spatrick }
1919e5dd7070Spatrick 
1920*12c85518Srobert extern __inline __m128i
1921*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1922*12c85518Srobert     _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1923*12c85518Srobert   __v8hi __result = (__v8hi)__A;
1924e5dd7070Spatrick 
1925*12c85518Srobert   __result[(__N & 7)] = __D;
1926e5dd7070Spatrick 
1927*12c85518Srobert   return (__m128i)__result;
1928e5dd7070Spatrick }
1929e5dd7070Spatrick 
1930*12c85518Srobert extern __inline __m128i
1931*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1932*12c85518Srobert     _mm_max_epi16(__m128i __A, __m128i __B) {
1933e5dd7070Spatrick   return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1934e5dd7070Spatrick }
1935e5dd7070Spatrick 
1936*12c85518Srobert extern __inline __m128i
1937*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1938*12c85518Srobert     _mm_max_epu8(__m128i __A, __m128i __B) {
1939e5dd7070Spatrick   return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1940e5dd7070Spatrick }
1941e5dd7070Spatrick 
1942*12c85518Srobert extern __inline __m128i
1943*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1944*12c85518Srobert     _mm_min_epi16(__m128i __A, __m128i __B) {
1945e5dd7070Spatrick   return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1946e5dd7070Spatrick }
1947e5dd7070Spatrick 
1948*12c85518Srobert extern __inline __m128i
1949*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1950*12c85518Srobert     _mm_min_epu8(__m128i __A, __m128i __B) {
1951e5dd7070Spatrick   return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1952e5dd7070Spatrick }
1953e5dd7070Spatrick 
1954e5dd7070Spatrick #ifdef _ARCH_PWR8
1955e5dd7070Spatrick /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1956e5dd7070Spatrick 
1957*12c85518Srobert /* Return a mask created from the most significant bit of each 8-bit
1958*12c85518Srobert    element in A.  */
1959*12c85518Srobert extern __inline int
1960*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1961*12c85518Srobert     _mm_movemask_epi8(__m128i __A) {
1962*12c85518Srobert #ifdef _ARCH_PWR10
1963*12c85518Srobert   return vec_extractm((__v16qu)__A);
1964*12c85518Srobert #else
1965*12c85518Srobert   __vector unsigned long long __result;
1966*12c85518Srobert   static const __vector unsigned char __perm_mask = {
1967e5dd7070Spatrick       0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1968*12c85518Srobert       0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1969e5dd7070Spatrick 
1970*12c85518Srobert   __result = ((__vector unsigned long long)vec_vbpermq(
1971*12c85518Srobert       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1972e5dd7070Spatrick 
1973e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1974*12c85518Srobert   return __result[1];
1975e5dd7070Spatrick #else
1976*12c85518Srobert   return __result[0];
1977e5dd7070Spatrick #endif
1978*12c85518Srobert #endif /* !_ARCH_PWR10 */
1979e5dd7070Spatrick }
1980e5dd7070Spatrick #endif /* _ARCH_PWR8 */
1981e5dd7070Spatrick 
1982*12c85518Srobert extern __inline __m128i
1983*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1984*12c85518Srobert     _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1985*12c85518Srobert   __v4su __w0, __w1;
1986*12c85518Srobert   __v16qu __xform1 = {
1987e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
1988*12c85518Srobert       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1989*12c85518Srobert       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1990e5dd7070Spatrick #else
1991*12c85518Srobert       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1992*12c85518Srobert       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1993e5dd7070Spatrick #endif
1994e5dd7070Spatrick   };
1995e5dd7070Spatrick 
1996*12c85518Srobert   __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1997*12c85518Srobert   __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1998*12c85518Srobert   return (__m128i)vec_perm(__w0, __w1, __xform1);
1999e5dd7070Spatrick }
2000e5dd7070Spatrick 
2001*12c85518Srobert extern __inline __m128i
2002*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2003*12c85518Srobert     _mm_shufflehi_epi16(__m128i __A, const int __mask) {
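  /* Shuffle the four high 16-bit elements of __A as selected by __mask
     (two bits per destination element); the four low elements are
     copied through unchanged.  */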
2004*12c85518Srobert   unsigned long __element_selector_98 = __mask & 0x03;
2005*12c85518Srobert   unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2006*12c85518Srobert   unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2007*12c85518Srobert   unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2008*12c85518Srobert   static const unsigned short __permute_selectors[4] = {
2009e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
2010e5dd7070Spatrick       0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2011e5dd7070Spatrick #else
2012e5dd7070Spatrick       0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2013e5dd7070Spatrick #endif
2014e5dd7070Spatrick   };
2015*12c85518Srobert   __v2du __pmask =
2016e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
2017e5dd7070Spatrick       {0x1716151413121110UL, 0UL};
2018e5dd7070Spatrick #else
2019e5dd7070Spatrick       {0x1011121314151617UL, 0UL};
2020e5dd7070Spatrick #endif
2021*12c85518Srobert   __m64_union __t;
2022*12c85518Srobert   __v2du __a, __r;
2023e5dd7070Spatrick 
2024*12c85518Srobert   __t.as_short[0] = __permute_selectors[__element_selector_98];
2025*12c85518Srobert   __t.as_short[1] = __permute_selectors[__element_selector_BA];
2026*12c85518Srobert   __t.as_short[2] = __permute_selectors[__element_selector_DC];
2027*12c85518Srobert   __t.as_short[3] = __permute_selectors[__element_selector_FE];
2028*12c85518Srobert   __pmask[1] = __t.as_m64;
2029*12c85518Srobert   __a = (__v2du)__A;
2030*12c85518Srobert   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2031*12c85518Srobert   return (__m128i)__r;
2032e5dd7070Spatrick }
2033e5dd7070Spatrick 
2034*12c85518Srobert extern __inline __m128i
2035*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2036*12c85518Srobert     _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2037*12c85518Srobert   unsigned long __element_selector_10 = __mask & 0x03;
2038*12c85518Srobert   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2039*12c85518Srobert   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2040*12c85518Srobert   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2041*12c85518Srobert   static const unsigned short __permute_selectors[4] = {
2042e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
2043e5dd7070Spatrick       0x0100, 0x0302, 0x0504, 0x0706
2044e5dd7070Spatrick #else
2045e5dd7070Spatrick       0x0001, 0x0203, 0x0405, 0x0607
2046e5dd7070Spatrick #endif
2047e5dd7070Spatrick   };
2048*12c85518Srobert   __v2du __pmask =
2049e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
2050e5dd7070Spatrick       {0UL, 0x1f1e1d1c1b1a1918UL};
2051e5dd7070Spatrick #else
2052e5dd7070Spatrick       {0UL, 0x18191a1b1c1d1e1fUL};
2053e5dd7070Spatrick #endif
2054*12c85518Srobert   __m64_union __t;
2055*12c85518Srobert   __v2du __a, __r;
2056*12c85518Srobert   __t.as_short[0] = __permute_selectors[__element_selector_10];
2057*12c85518Srobert   __t.as_short[1] = __permute_selectors[__element_selector_32];
2058*12c85518Srobert   __t.as_short[2] = __permute_selectors[__element_selector_54];
2059*12c85518Srobert   __t.as_short[3] = __permute_selectors[__element_selector_76];
2060*12c85518Srobert   __pmask[0] = __t.as_m64;
2061*12c85518Srobert   __a = (__v2du)__A;
2062*12c85518Srobert   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2063*12c85518Srobert   return (__m128i)__r;
2064e5dd7070Spatrick }
2065e5dd7070Spatrick 
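/* Shuffle the four 32-bit elements of __A according to the 2-bit fields
   of __mask, again by building a vec_perm byte-permute control from a
   selector table.  */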
2066*12c85518Srobert extern __inline __m128i
2067*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2068*12c85518Srobert     _mm_shuffle_epi32(__m128i __A, const int __mask) {
2069*12c85518Srobert   unsigned long __element_selector_10 = __mask & 0x03;
2070*12c85518Srobert   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2071*12c85518Srobert   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2072*12c85518Srobert   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2073*12c85518Srobert   static const unsigned int __permute_selectors[4] = {
2074e5dd7070Spatrick #ifdef __LITTLE_ENDIAN__
2075e5dd7070Spatrick       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2076e5dd7070Spatrick #else
2077e5dd7070Spatrick       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2078e5dd7070Spatrick #endif
2079e5dd7070Spatrick   };
2080*12c85518Srobert   __v4su __t;
2081e5dd7070Spatrick 
2082*12c85518Srobert   __t[0] = __permute_selectors[__element_selector_10];
2083*12c85518Srobert   __t[1] = __permute_selectors[__element_selector_32];
2084*12c85518Srobert   __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2085*12c85518Srobert   __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2086*12c85518Srobert   return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2087*12c85518Srobert                            (__vector unsigned char)__t);
2088e5dd7070Spatrick }
2089e5dd7070Spatrick 
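/* Conditionally store bytes of __A to the (possibly unaligned) address
   __C: a byte is written only where the corresponding byte of __B has
   its high bit set.  Implemented as a load, vec_sel merge, and store,
   so the untouched bytes at __C keep their previous values.  */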
2090*12c85518Srobert extern __inline void
2091*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2092*12c85518Srobert     _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2093*12c85518Srobert   __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2094*12c85518Srobert   __v16qu __mask, __tmp;
2095*12c85518Srobert   __m128i_u *__p = (__m128i_u *)__C;
2096e5dd7070Spatrick 
2097*12c85518Srobert   __tmp = (__v16qu)_mm_loadu_si128(__p);
2098*12c85518Srobert   __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2099*12c85518Srobert   __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2100*12c85518Srobert   _mm_storeu_si128(__p, (__m128i)__tmp);
2101e5dd7070Spatrick }
2102e5dd7070Spatrick 
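/* Average packed unsigned 8-bit integers, rounding halfway cases up.  */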
2103*12c85518Srobert extern __inline __m128i
2104*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2105*12c85518Srobert     _mm_avg_epu8(__m128i __A, __m128i __B) {
2106e5dd7070Spatrick   return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2107e5dd7070Spatrick }
2108e5dd7070Spatrick 
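/* Average packed unsigned 16-bit integers, rounding halfway cases up.  */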
2109*12c85518Srobert extern __inline __m128i
2110*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2111*12c85518Srobert     _mm_avg_epu16(__m128i __A, __m128i __B) {
2112e5dd7070Spatrick   return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2113e5dd7070Spatrick }
2114e5dd7070Spatrick 
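/* Compute sums of absolute differences of the unsigned bytes in __A and
   __B.  Each 64-bit half of the result holds the 16-bit sum for the
   corresponding eight bytes, zero-extended.  */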
2115*12c85518Srobert extern __inline __m128i
2116*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2117*12c85518Srobert     _mm_sad_epu8(__m128i __A, __m128i __B) {
2118*12c85518Srobert   __v16qu __a, __b;
2119*12c85518Srobert   __v16qu __vabsdiff;
2120*12c85518Srobert   __v4si __vsum;
2121*12c85518Srobert   const __v4su __zero = {0, 0, 0, 0};
2122*12c85518Srobert   __v4si __result;
2123e5dd7070Spatrick 
2124*12c85518Srobert   __a = (__v16qu)__A;
2125*12c85518Srobert   __b = (__v16qu)__B;
2126*12c85518Srobert #ifndef _ARCH_PWR9
2127*12c85518Srobert   __v16qu __vmin = vec_min(__a, __b);
2128*12c85518Srobert   __v16qu __vmax = vec_max(__a, __b);
2129*12c85518Srobert   __vabsdiff = vec_sub(__vmax, __vmin);
2130e5dd7070Spatrick #else
2131*12c85518Srobert   __vabsdiff = vec_absd(__a, __b);
2132e5dd7070Spatrick #endif
2133*12c85518Srobert   /* Sum four groups of bytes into integers.  */
2134*12c85518Srobert   __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2135*12c85518Srobert #ifdef __LITTLE_ENDIAN__
2136*12c85518Srobert   /* Sum across four integers with two integer results.  */
2137*12c85518Srobert   __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2138*12c85518Srobert   /* Note: vec_sum2s could be used here, but on little-endian it adds
2139*12c85518Srobert      vector shifts that are not needed for this use case.  The shift
2140*12c85518Srobert      would move the 32-bit results (currently at [0] and [2]) to [1]
2141*12c85518Srobert      and [3], only for them to be moved back again, since the desired
2142*12c85518Srobert      results are two 64-bit integers ([1]|[0] and [3]|[2]).  Thus, no
2143*12c85518Srobert      shift is performed.  */
2144*12c85518Srobert #else
2145*12c85518Srobert   /* Sum across four integers with two integer results.  */
2146*12c85518Srobert   __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2147e5dd7070Spatrick   /* Rotate the sums into the correct position.  */
2148*12c85518Srobert   __result = vec_sld(__result, __result, 6);
2149*12c85518Srobert #endif
2150*12c85518Srobert   return (__m128i)__result;
2151e5dd7070Spatrick }
2152e5dd7070Spatrick 
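/* The non-temporal store intrinsics below have no direct PowerISA
   equivalent; they are approximated by a "data cache block touch for
   store transient" (dcbtstt) hint followed by an ordinary store.  */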
2153*12c85518Srobert extern __inline void
2154*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2155*12c85518Srobert     _mm_stream_si32(int *__A, int __B) {
2156e5dd7070Spatrick   /* Use the data cache block touch for store transient.  */
2157*12c85518Srobert   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2158e5dd7070Spatrick   *__A = __B;
2159e5dd7070Spatrick }
2160e5dd7070Spatrick 
2161*12c85518Srobert extern __inline void
2162*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2163*12c85518Srobert     _mm_stream_si64(long long int *__A, long long int __B) {
2164e5dd7070Spatrick   /* Use the data cache block touch for store transient.  */
2165*12c85518Srobert   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2166e5dd7070Spatrick   *__A = __B;
2167e5dd7070Spatrick }
2168e5dd7070Spatrick 
2169*12c85518Srobert extern __inline void
2170*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2171*12c85518Srobert     _mm_stream_si128(__m128i *__A, __m128i __B) {
2172e5dd7070Spatrick   /* Use the data cache block touch for store transient.  */
2173*12c85518Srobert   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2174e5dd7070Spatrick   *__A = __B;
2175e5dd7070Spatrick }
2176e5dd7070Spatrick 
2177*12c85518Srobert extern __inline void
2178*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2179*12c85518Srobert     _mm_stream_pd(double *__A, __m128d __B) {
2180e5dd7070Spatrick   /* Use the data cache block touch for store transient.  */
2181*12c85518Srobert   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2182e5dd7070Spatrick   *(__m128d *)__A = __B;
2183e5dd7070Spatrick }
2184e5dd7070Spatrick 
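/* Flush the cache block containing __A; dcbf serves as the PowerISA
   counterpart of CLFLUSH here.  */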
2185*12c85518Srobert extern __inline void
2186*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2187*12c85518Srobert     _mm_clflush(void const *__A) {
2188e5dd7070Spatrick   /* Use the data cache block flush.  */
2189*12c85518Srobert   __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2190e5dd7070Spatrick }
2191e5dd7070Spatrick 
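/* The SSE2 fences are mapped to C11-style atomic thread fences; the
   comments in each function note the flavor of sync expected on POWER.  */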
2192*12c85518Srobert extern __inline void
2193*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2194*12c85518Srobert     _mm_lfence(void) {
2195e5dd7070Spatrick   /* Use lightweight sync for load-to-load ordering.  */
2196e5dd7070Spatrick   __atomic_thread_fence(__ATOMIC_RELEASE);
2197e5dd7070Spatrick }
2198e5dd7070Spatrick 
2199*12c85518Srobert extern __inline void
2200*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2201*12c85518Srobert     _mm_mfence(void) {
2202e5dd7070Spatrick   /* Use heavyweight sync for any-to-any ordering.  */
2203e5dd7070Spatrick   __atomic_thread_fence(__ATOMIC_SEQ_CST);
2204e5dd7070Spatrick }
2205e5dd7070Spatrick 
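/* Move a 32- or 64-bit scalar into the low element of a vector,
   zeroing the remaining elements.  */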
2206*12c85518Srobert extern __inline __m128i
2207*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2208*12c85518Srobert     _mm_cvtsi32_si128(int __A) {
2209e5dd7070Spatrick   return _mm_set_epi32(0, 0, 0, __A);
2210e5dd7070Spatrick }
2211e5dd7070Spatrick 
2212*12c85518Srobert extern __inline __m128i
2213*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2214*12c85518Srobert     _mm_cvtsi64_si128(long long __A) {
2215e5dd7070Spatrick   return __extension__(__m128i)(__v2di){__A, 0LL};
2216e5dd7070Spatrick }
2217e5dd7070Spatrick 
2218e5dd7070Spatrick /* Microsoft intrinsic.  */
2219*12c85518Srobert extern __inline __m128i
2220*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2221*12c85518Srobert     _mm_cvtsi64x_si128(long long __A) {
2222e5dd7070Spatrick   return __extension__(__m128i)(__v2di){__A, 0LL};
2223e5dd7070Spatrick }
2224e5dd7070Spatrick 
2225e5dd7070Spatrick /* Casts between various SP, DP, INT vector types.  Note that these do no
2226e5dd7070Spatrick    conversion of values; they just change the type.  */
2227*12c85518Srobert extern __inline __m128
2228*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2229*12c85518Srobert     _mm_castpd_ps(__m128d __A) {
2230e5dd7070Spatrick   return (__m128)__A;
2231e5dd7070Spatrick }
2232e5dd7070Spatrick 
2233*12c85518Srobert extern __inline __m128i
2234*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2235*12c85518Srobert     _mm_castpd_si128(__m128d __A) {
2236e5dd7070Spatrick   return (__m128i)__A;
2237e5dd7070Spatrick }
2238e5dd7070Spatrick 
2239*12c85518Srobert extern __inline __m128d
2240*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2241*12c85518Srobert     _mm_castps_pd(__m128 __A) {
2242e5dd7070Spatrick   return (__m128d)__A;
2243e5dd7070Spatrick }
2244e5dd7070Spatrick 
2245*12c85518Srobert extern __inline __m128i
2246*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2247*12c85518Srobert     _mm_castps_si128(__m128 __A) {
2248e5dd7070Spatrick   return (__m128i)__A;
2249e5dd7070Spatrick }
2250e5dd7070Spatrick 
2251*12c85518Srobert extern __inline __m128
2252*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2253*12c85518Srobert     _mm_castsi128_ps(__m128i __A) {
2254e5dd7070Spatrick   return (__m128)__A;
2255e5dd7070Spatrick }
2256e5dd7070Spatrick 
2257*12c85518Srobert extern __inline __m128d
2258*12c85518Srobert     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2259*12c85518Srobert     _mm_castsi128_pd(__m128i __A) {
2260e5dd7070Spatrick   return (__m128d)__A;
2261e5dd7070Spatrick }
2262e5dd7070Spatrick 
2263e5dd7070Spatrick #else
2264e5dd7070Spatrick #include_next <emmintrin.h>
2265*12c85518Srobert #endif /* defined(__powerpc64__) &&                                            \
2266*12c85518Srobert         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2267e5dd7070Spatrick 
2268e5dd7070Spatrick #endif /* EMMINTRIN_H_ */
2269