/* Copyright (C) 2002-2021 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are also important
   differences in data format and placement of float scalars in the
   vector register.  For PowerISA, scalar floats in FPRs (the leftmost
   64 bits of the low 32 VSRs) are kept in double format, while X86_64
   SSE uses the rightmost 32 bits of the XMM register.  These
   differences require extra steps on POWER to match the SSE scalar
   float semantics.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications
   (see the illustrative sketch following this block).

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
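
/* Illustrative sketch only (not part of the upstream header): the
   scalar-rewrite and <fenv.h> recommendations above, shown as plain C.
   The helper names below (add_lowest, set_round_toward_zero) are
   hypothetical examples, not APIs provided by this header.

     #include <fenv.h>

     // Instead of _mm_add_ss, operate on plain float scalars and let
     // the compiler vectorize where profitable.
     static inline float add_lowest (float a, float b)
     {
       return a + b;
     }

     // Instead of reading/writing the x86 MXCSR directly, use the
     // portable POSIX rounding-mode controls from <fenv.h>.
     static inline void set_round_toward_zero (void)
     {
       fesetround (FE_TOWARDZERO);
     }
*/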

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
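
/* Usage sketch (added for clarity, not in the upstream header): each
   two-bit field of the mask selects one source element, so
   _MM_SHUFFLE (3, 2, 1, 0) == 0xE4.  For example, given vectors
   a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}:

     __m128 r = _mm_shuffle_ps (a, b, _MM_SHUFFLE (3, 2, 1, 0));
     // r = { a0, a1, b2, b3 } -- low two elements from a, high two from b.
*/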

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
				 (defined(__STDC_VERSION__) &&	\
				  __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

#include <assert.h>

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
				       __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  return ((__m128)vec_ld(0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf   __tmp;
  __m128 result;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
	0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
  return result;
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  assert(((unsigned long)__P & 0xfUL) == 0UL);
  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf   __tmp;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
	0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  c = _mm_rcp_ps (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  c = vec_rsqrte (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_min (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = vec_max (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, m);
}

/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
//  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpeq ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  __v4sf temp = (__v4sf) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128)vec_nor (temp, temp));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  return ((__m128) vec_and (c, d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  return ((__m128) vec_or (c, d));
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  c = vec_nor (c, c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  c = vec_and (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  c = vec_or (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  int res;
#ifdef _ARCH_PWR8
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_rint(__A[0]);
#endif
  return (res);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  long long res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_llrint(__A[0]);
#endif
  return (res);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Current PowerPC implementations ignore the hint parameter.  */
  __builtin_prefetch (__P);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint(temp);
  result = (__vector unsigned long long) vec_cts (rounded, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 32-bit integer and return.  */
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return temp;
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf temp;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  result = (__vector unsigned long long) vec_cts (temp, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int vm1;
  __vector float vf1;

  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  vf1 = (__vector float) vec_ctf (vm1, 0);

  return ((__m128) (__vector unsigned long long)
    { ((__vector unsigned long long)vf1) [0],
	((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
  vi4 = vec_vupklsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
  vi4 = (__vector unsigned int) vec_mergel
#ifdef __LITTLE_ENDIAN__
                                           (vs8, zero);
#else
                                           (zero, vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char vc16;
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
  vs8 = vec_vupkhsb (vc16);
  vi4 = vec_vupkhsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
  const __vector unsigned char zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char vc16;
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
  vi4 = (__vector unsigned int) vec_mergeh (vs8,
					    (__vector unsigned short) zero);
#else
  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
                                            vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
  __vector signed int vi4;
  __vector float vf4;

  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
  vf4 = (__vector float) vec_ctf (vi4, 0);
  return (__m128) vf4;
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int temp;
  __vector unsigned long long result;

  rounded = vec_rint(__A);
  temp = vec_cts (rounded, 0);
  result = (__vector unsigned long long) vec_pack (temp, temp);

  return (__m64) ((__vector long long) result)[0];
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int tmp_i;
  static const __vector signed int zero = {0, 0, 0, 0};
  __vector signed short tmp_s;
  __vector signed char res_v;

  rounded = vec_rint(__A);
  tmp_i = vec_cts (rounded, 0);
  tmp_s = vec_pack (tmp_i, zero);
  res_v = vec_pack (tmp_s, tmp_s);
  return (__m64) ((__vector long long) res_v)[0];
}

/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __vector unsigned int t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [1] = __p [1];

  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[1];
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergel ((__vector unsigned long long)__B,
			      (__vector unsigned long long)__A);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
			      (__vector unsigned long long)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a [0] = __p [0];

  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[0];
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
#ifdef _ARCH_PWR10
  return vec_extractm ((__vector unsigned int) __A);
#else
  __vector unsigned long long result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };

  result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
1380 
1381 /* Create a vector with all four elements equal to *P.  */
1382 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps(float const * __P)1383 _mm_load1_ps (float const *__P)
1384 {
1385   return _mm_set1_ps (*__P);
1386 }
1387 
1388 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1(float const * __P)1389 _mm_load_ps1 (float const *__P)
1390 {
1391   return _mm_load1_ps (__P);
1392 }
1393 
1394 /* Extracts one of the four words of A.  The selector N must be immediate.  */
1395 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16(__m64 const __A,int const __N)1396 _mm_extract_pi16 (__m64 const __A, int const __N)
1397 {
1398   unsigned int shiftr = __N & 3;
1399 #ifdef __BIG_ENDIAN__
1400   shiftr = 3 - shiftr;
1401 #endif
1402 
1403   return ((__A >> (shiftr * 16)) & 0xffff);
1404 }
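/* Worked example for illustration only (not from the original header); the
   constant is hypothetical and a little-endian target is assumed:

     __m64 __a = (__m64) 0x0123456789ABCDEFUL;
     int __w = _mm_extract_pi16 (__a, 1);    __w == 0x89AB (halfword 1)
*/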
1405 
1406 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407 _m_pextrw (__m64 const __A, int const __N)
1408 {
1409   return _mm_extract_pi16 (__A, __N);
1410 }
1411 
1412 /* Inserts word D into one of four words of A.  The selector N must be
1413    immediate.  */
1414 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1415 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1416 {
1417   const int shiftl = (__N & 3) * 16;
1418   const __m64 shiftD = (const __m64) __D << shiftl;
1419   const __m64 mask = 0xffffUL << shiftl;
1420   __m64 result = (__A & (~mask)) | (shiftD & mask);
1421 
1422   return (result);
1423 }
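/* Worked example for illustration only (not from the original header); the
   values are hypothetical and a little-endian target is assumed.  The mask
   and shift above replace only the selected halfword:

     __m64 __r = _mm_insert_pi16 ((__m64) 0UL, 0xABCD, 2);
     now __r == 0x0000ABCD00000000
*/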
1424 
1425 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1426 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1427 {
1428   return _mm_insert_pi16 (__A, __D, __N);
1429 }
1430 
1431 /* Compute the element-wise maximum of signed 16-bit values.  */
1432 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1433 
1434 _mm_max_pi16 (__m64 __A, __m64 __B)
1435 {
1436 #if _ARCH_PWR8
1437   __vector signed short a, b, r;
1438   __vector __bool short c;
1439 
1440   a = (__vector signed short)vec_splats (__A);
1441   b = (__vector signed short)vec_splats (__B);
1442   c = (__vector __bool short)vec_cmpgt (a, b);
1443   r = vec_sel (b, a, c);
1444   return (__m64) ((__vector long long) r)[0];
1445 #else
1446   __m64_union m1, m2, res;
1447 
1448   m1.as_m64 = __A;
1449   m2.as_m64 = __B;
1450 
1451   res.as_short[0] =
1452       (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1453   res.as_short[1] =
1454       (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1455   res.as_short[2] =
1456       (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1457   res.as_short[3] =
1458       (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1459 
1460   return (__m64) res.as_m64;
1461 #endif
1462 }
1463 
1464 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465 _m_pmaxsw (__m64 __A, __m64 __B)
1466 {
1467   return _mm_max_pi16 (__A, __B);
1468 }
1469 
1470 /* Compute the element-wise maximum of unsigned 8-bit values.  */
1471 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1472 _mm_max_pu8 (__m64 __A, __m64 __B)
1473 {
1474 #if _ARCH_PWR8
1475   __vector unsigned char a, b, r;
1476   __vector __bool char c;
1477 
1478   a = (__vector unsigned char)vec_splats (__A);
1479   b = (__vector unsigned char)vec_splats (__B);
1480   c = (__vector __bool char)vec_cmpgt (a, b);
1481   r = vec_sel (b, a, c);
1482   return (__m64) ((__vector long long) r)[0];
1483 #else
1484   __m64_union m1, m2, res;
1485   long i;
1486 
1487   m1.as_m64 = __A;
1488   m2.as_m64 = __B;
1489 
1490 
1491   for (i = 0; i < 8; i++)
1492     res.as_char[i] =
1493       ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1494 	  m1.as_char[i] : m2.as_char[i];
1495 
1496   return (__m64) res.as_m64;
1497 #endif
1498 }
1499 
1500 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1501 _m_pmaxub (__m64 __A, __m64 __B)
1502 {
1503   return _mm_max_pu8 (__A, __B);
1504 }
1505 
1506 /* Compute the element-wise minimum of signed 16-bit values.  */
1507 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1508 _mm_min_pi16 (__m64 __A, __m64 __B)
1509 {
1510 #if _ARCH_PWR8
1511   __vector signed short a, b, r;
1512   __vector __bool short c;
1513 
1514   a = (__vector signed short)vec_splats (__A);
1515   b = (__vector signed short)vec_splats (__B);
1516   c = (__vector __bool short)vec_cmplt (a, b);
1517   r = vec_sel (b, a, c);
1518   return (__m64) ((__vector long long) r)[0];
1519 #else
1520   __m64_union m1, m2, res;
1521 
1522   m1.as_m64 = __A;
1523   m2.as_m64 = __B;
1524 
1525   res.as_short[0] =
1526       (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1527   res.as_short[1] =
1528       (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1529   res.as_short[2] =
1530       (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1531   res.as_short[3] =
1532       (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1533 
1534   return (__m64) res.as_m64;
1535 #endif
1536 }
1537 
1538 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1539 _m_pminsw (__m64 __A, __m64 __B)
1540 {
1541   return _mm_min_pi16 (__A, __B);
1542 }
1543 
1544 /* Compute the element-wise minimum of unsigned 8-bit values.  */
1545 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1546 _mm_min_pu8 (__m64 __A, __m64 __B)
1547 {
1548 #if _ARCH_PWR8
1549   __vector unsigned char a, b, r;
1550   __vector __bool char c;
1551 
1552   a = (__vector unsigned char)vec_splats (__A);
1553   b = (__vector unsigned char)vec_splats (__B);
1554   c = (__vector __bool char)vec_cmplt (a, b);
1555   r = vec_sel (b, a, c);
1556   return (__m64) ((__vector long long) r)[0];
1557 #else
1558   __m64_union m1, m2, res;
1559   long i;
1560 
1561   m1.as_m64 = __A;
1562   m2.as_m64 = __B;
1563 
1564 
1565   for (i = 0; i < 8; i++)
1566     res.as_char[i] =
1567       ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1568 	  m1.as_char[i] : m2.as_char[i];
1569 
1570   return (__m64) res.as_m64;
1571 #endif
1572 }
1573 
1574 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575 _m_pminub (__m64 __A, __m64 __B)
1576 {
1577   return _mm_min_pu8 (__A, __B);
1578 }
1579 
1580 /* Create an 8-bit mask of the signs of 8-bit values.  */
1581 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582 _mm_movemask_pi8 (__m64 __A)
1583 {
1584 #ifdef __powerpc64__
1585   unsigned long long p =
1586 #ifdef __LITTLE_ENDIAN__
1587                          0x0008101820283038UL; // permute control for sign bits
1588 #else
1589                          0x3830282018100800UL; // permute control for sign bits
1590 #endif
1591   return __builtin_bpermd (p, __A);
1592 #else
1593 #ifdef __LITTLE_ENDIAN__
1594   unsigned int mask = 0x20283038UL;
1595   unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
1596   unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
1597 #else
1598   unsigned int mask = 0x38302820UL;
1599   unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf;
1600   unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf;
1601 #endif
1602   return (r2 << 4) | r1;
1603 #endif
1604 }
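/* Worked example for illustration only (not from the original header); the
   input is hypothetical and a little-endian target is assumed.  The permute
   control above hands __builtin_bpermd the bit index of each byte's sign
   bit:

     __m64 __a = (__m64) 0xFF00FF00FF00FF00UL;
     int __m = _mm_movemask_pi8 (__a);    __m == 0xAA (bytes 1,3,5,7 set)
*/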
1605 
1606 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1607 _m_pmovmskb (__m64 __A)
1608 {
1609   return _mm_movemask_pi8 (__A);
1610 }
1611 
1612 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1613    in B and produce the high 16 bits of the 32-bit results.  */
1614 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1615 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1616 {
1617   __vector unsigned short a, b;
1618   __vector unsigned short c;
1619   __vector unsigned int w0, w1;
1620   __vector unsigned char xform1 = {
1621 #ifdef __LITTLE_ENDIAN__
1622       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1623       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1624 #else
1625       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
1626       0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
1627 #endif
1628     };
1629 
1630   a = (__vector unsigned short)vec_splats (__A);
1631   b = (__vector unsigned short)vec_splats (__B);
1632 
1633   w0 = vec_vmuleuh (a, b);
1634   w1 = vec_vmulouh (a, b);
1635   c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1636 
1637   return (__m64) ((__vector long long) c)[0];
1638 }
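/* Worked example for illustration only (not from the original header), for
   one hypothetical lane: the even/odd multiplies above form 32-bit products
   and the permute keeps each product's upper halfword, so a lane holding
   0x8000 times a lane holding 0x0003 gives 0x00018000 and the result lane
   is 0x0001.  */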
1639 
1640 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1641 _m_pmulhuw (__m64 __A, __m64 __B)
1642 {
1643   return _mm_mulhi_pu16 (__A, __B);
1644 }
1645 
1646 /* Return a combination of the four 16-bit values in A.  The selector
1647    must be an immediate.  */
1648 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1649 _mm_shuffle_pi16 (__m64 __A, int const __N)
1650 {
1651   unsigned long element_selector_10 = __N & 0x03;
1652   unsigned long element_selector_32 = (__N >> 2) & 0x03;
1653   unsigned long element_selector_54 = (__N >> 4) & 0x03;
1654   unsigned long element_selector_76 = (__N >> 6) & 0x03;
1655   static const unsigned short permute_selectors[4] =
1656     {
1657 #ifdef __LITTLE_ENDIAN__
1658 	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1659 #else
1660 	      0x0607, 0x0405, 0x0203, 0x0001
1661 #endif
1662     };
1663   __m64_union t;
1664   __vector unsigned long long a, p, r;
1665 
1666 #ifdef __LITTLE_ENDIAN__
1667   t.as_short[0] = permute_selectors[element_selector_10];
1668   t.as_short[1] = permute_selectors[element_selector_32];
1669   t.as_short[2] = permute_selectors[element_selector_54];
1670   t.as_short[3] = permute_selectors[element_selector_76];
1671 #else
1672   t.as_short[3] = permute_selectors[element_selector_10];
1673   t.as_short[2] = permute_selectors[element_selector_32];
1674   t.as_short[1] = permute_selectors[element_selector_54];
1675   t.as_short[0] = permute_selectors[element_selector_76];
1676 #endif
1677   p = vec_splats (t.as_m64);
1678   a = vec_splats (__A);
1679   r = vec_perm (a, a, (__vector unsigned char)p);
1680   return (__m64) ((__vector long long) r)[0];
1681 }
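/* Usage sketch for illustration only (not from the original header); __a is
   hypothetical.  The selector is encoded as for _MM_SHUFFLE, so

     __m64 __r = _mm_shuffle_pi16 (__a, _MM_SHUFFLE (0, 1, 2, 3));

   places halfword 3 of __a in halfword 0 of __r, halfword 2 in halfword 1,
   and so on, reversing the four halfwords.  */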
1682 
1683 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684 _m_pshufw (__m64 __A, int const __N)
1685 {
1686   return _mm_shuffle_pi16 (__A, __N);
1687 }
1688 
1689 /* Conditionally store byte elements of A into P.  The high bit of each
1690    byte in the selector N determines whether the corresponding byte from
1691    A is stored.  */
1692 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1693 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1694 {
1695   __m64 hibit = 0x8080808080808080UL;
1696   __m64 mask, tmp;
1697   __m64 *p = (__m64*)__P;
1698 
1699   tmp = *p;
1700   mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1701   tmp = (tmp & (~mask)) | (__A & mask);
1702   *p = tmp;
1703 }
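/* Usage sketch for illustration only (not from the original header); the
   values are hypothetical and a little-endian target is assumed.  Only
   bytes whose mask byte has its high bit set are written:

     char __buf[8] __attribute__ ((__aligned__ (8))) = { 0 };
     __m64 __data = (__m64) 0x1122334455667788UL;
     __m64 __sel  = (__m64) 0x00000000000000FFUL;    selects byte 0 only
     _mm_maskmove_si64 (__data, __sel, __buf);       buf[0] == 0x88, rest 0
*/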
1704 
1705 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1706 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1707 {
1708   _mm_maskmove_si64 (__A, __N, __P);
1709 }
1710 
1711 /* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
1712 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1713 _mm_avg_pu8 (__m64 __A, __m64 __B)
1714 {
1715   __vector unsigned char a, b, c;
1716 
1717   a = (__vector unsigned char)vec_splats (__A);
1718   b = (__vector unsigned char)vec_splats (__B);
1719   c = vec_avg (a, b);
1720   return (__m64) ((__vector long long) c)[0];
1721 }
1722 
1723 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1724 _m_pavgb (__m64 __A, __m64 __B)
1725 {
1726   return _mm_avg_pu8 (__A, __B);
1727 }
1728 
1729 /* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
1730 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1731 _mm_avg_pu16 (__m64 __A, __m64 __B)
1732 {
1733   __vector unsigned short a, b, c;
1734 
1735   a = (__vector unsigned short)vec_splats (__A);
1736   b = (__vector unsigned short)vec_splats (__B);
1737   c = vec_avg (a, b);
1738   return (__m64) ((__vector long long) c)[0];
1739 }
1740 
1741 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1742 _m_pavgw (__m64 __A, __m64 __B)
1743 {
1744   return _mm_avg_pu16 (__A, __B);
1745 }
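/* Note for illustration only (not from the original header): both averages
   round up, computing (a + b + 1) >> 1 per element, so _mm_avg_pu8 of 1
   and 2 yields 2.  */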
1746 
1747 /* Compute the sum of the absolute differences of the unsigned 8-bit
1748    values in A and B.  Return the value in the lower 16-bit word; the
1749    upper words are cleared.  */
1750 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1751 _mm_sad_pu8 (__m64  __A, __m64  __B)
1752 {
1753   __vector unsigned char a, b;
1754   __vector unsigned char vmin, vmax, vabsdiff;
1755   __vector signed int vsum;
1756   const __vector unsigned int zero =
1757     { 0, 0, 0, 0 };
1758   __m64_union result = {0};
1759 
1760   a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1761   b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1762   vmin = vec_min (a, b);
1763   vmax = vec_max (a, b);
1764   vabsdiff = vec_sub (vmax, vmin);
1765   /* Sum four groups of bytes into integers.  */
1766   vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1767   /* Sum across four integers with integer result.  */
1768   vsum = vec_sums (vsum, (__vector signed int) zero);
1769   /* The sum is in the right most 32-bits of the vector result.
1770      Transfer to a GPR and truncate to 16 bits.  */
1771   result.as_short[0] = vsum[3];
1772   return result.as_m64;
1773 }
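/* Worked example for illustration only (not from the original header), with
   hypothetical inputs: if every byte of A is 0x03 and every byte of B is
   0x01, each absolute difference is 2 and the result is the __m64 value
   16 (8 * 2) with the upper 48 bits zero.  */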
1774 
1775 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1776 _m_psadbw (__m64 __A, __m64 __B)
1777 {
1778   return _mm_sad_pu8 (__A, __B);
1779 }
1780 
1781 /* Stores the data in A to the address P without polluting the caches.  */
1782 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1783 _mm_stream_pi (__m64 *__P, __m64 __A)
1784 {
1785   /* Use the data cache block touch for store transient.  */
1786   __asm__ (
1787     "	dcbtstt	0,%0"
1788     :
1789     : "b" (__P)
1790     : "memory"
1791   );
1792   *__P = __A;
1793 }
1794 
1795 /* Likewise.  The address must be 16-byte aligned.  */
1796 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1797 _mm_stream_ps (float *__P, __m128 __A)
1798 {
1799   /* Use the data cache block touch for store transient.  */
1800   __asm__ (
1801     "	dcbtstt	0,%0"
1802     :
1803     : "b" (__P)
1804     : "memory"
1805   );
1806   _mm_store_ps (__P, __A);
1807 }
1808 
1809 /* Guarantees that every preceding store is globally visible before
1810    any subsequent store.  */
1811 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1812 _mm_sfence (void)
1813 {
1814   /* Generate a light weight sync.  */
1815   __atomic_thread_fence (__ATOMIC_RELEASE);
1816 }
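/* Usage sketch for illustration only (not from the original header); the
   variables are hypothetical.  A typical producer pattern this barrier
   supports:

     _mm_store_ps (__shared_data, __v);    publish the payload
     _mm_sfence ();                        order the stores
     *__ready_flag = 1;                    then publish the flag
*/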
1817 
1818 /* The execution of the next instruction is delayed by an
1819    implementation-specific amount of time.  The instruction does not
1820    modify the architectural state.  (In the x86 header this intrinsic
1821    follows the pop_options pragma because it does not require SSE
1822    support; the encoding is a nop on processors that lack it.)  */
1823 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1824 _mm_pause (void)
1825 {
1826   /* There is no exact match with this construct, but the following is
1827      close to the desired effect.  */
1828 #if _ARCH_PWR8
1829   /* On power8 and later processors we can depend on Program Priority
1830      (PRI) and the associated "very low" PRI setting.  Since we don't know
1831      what PRI this thread is running at, we: 1) save the current PRI
1832      from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1833      via the special or 31,31,31 encoding, 3) issue an "isync" to
1834      ensure the PRI change takes effect before we execute any more
1835      instructions.
1836      Now we can execute an lwsync (release barrier) while we execute
1837      this thread at "very low" PRI.  Finally we restore the original
1838      PRI and continue execution.  */
1839   unsigned long __PPR;
1840 
1841   __asm__ volatile (
1842     "	mfppr	%0;"
1843     "   or 31,31,31;"
1844     "   isync;"
1845     "   lwsync;"
1846     "   isync;"
1847     "   mtppr	%0;"
1848     : "=r" (__PPR)
1849     :
1850     : "memory"
1851   );
1852 #else
1853   /* For older processor where we may not even have Program Priority
1854      controls we can only depend on Heavy Weight Sync.  */
1855   __atomic_thread_fence (__ATOMIC_SEQ_CST);
1856 #endif
1857 }
1858 
1859 /* Transpose the 4x4 matrix composed of row[0-3].  */
1860 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
1861 do {									\
1862   __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
1863   __v4sf __t0 = vec_vmrghw (__r0, __r1);			\
1864   __v4sf __t1 = vec_vmrghw (__r2, __r3);			\
1865   __v4sf __t2 = vec_vmrglw (__r0, __r1);			\
1866   __v4sf __t3 = vec_vmrglw (__r2, __r3);			\
1867   (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, 	\
1868 			       (__vector long long)__t1);	\
1869   (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,	\
1870 			       (__vector long long)__t1);	\
1871   (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,	\
1872 			       (__vector long long)__t3);	\
1873   (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,	\
1874 			       (__vector long long)__t3);	\
1875 } while (0)
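/* Usage sketch for illustration only (not from the original header); the
   rows are hypothetical.  Transposing a 4x4 matrix in place:

     __m128 __r0 = _mm_set_ps ( 3.0f,  2.0f,  1.0f,  0.0f);
     __m128 __r1 = _mm_set_ps ( 7.0f,  6.0f,  5.0f,  4.0f);
     __m128 __r2 = _mm_set_ps (11.0f, 10.0f,  9.0f,  8.0f);
     __m128 __r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);
     now __r0 = {0,4,8,12}, __r1 = {1,5,9,13}, __r2 = {2,6,10,14},
     __r3 = {3,7,11,15}
*/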
1876 
1877 /* For backward source compatibility.  */
1878 //# include <emmintrin.h>
1879 
1880 #endif /* _XMMINTRIN_H_INCLUDED */
1881