/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
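
/* Union used by the scalar (non-vector) code paths below to access the
   individual bytes, halfwords and words of a 64-bit __m64 value.  */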

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
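  /* vec_packs saturates each signed 16-bit element to a signed 8-bit
     value; packing vm1 with itself leaves the eight bytes of interest
     in the doubleword extracted below, with __m1 supplying the low four
     result bytes and __m2 the high four.  */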
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
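  /* Reinterpreting a negative 16-bit input as unsigned would make
     vec_packs saturate it to 0xff, while x86 packuswb clamps negative
     inputs to zero.  The compare/select below zeroes the result bytes
     that came from negative elements to match that behavior.  */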
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (vm1, __zero);
  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
  __vector __bool char packsel = vec_pack (__select, __select);
  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
  return (__m64) ((__vector long long) r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */
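
/* Many of the intrinsics below provide a scalar fallback path (via
   __m64_union) for targets without the POWER8 vector and direct-move
   support described in the comment at the top of this file.  */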

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
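  /* vec_splats places a full copy of the 64-bit argument in each half
     of the vector, so a single vec_mergel interleaves all eight bytes
     of __m1 with all eight bytes of __m2; doubleword [1] of the result
     holds the interleave of the two high halves.  */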
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}
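
/* Because __m64 is a plain 64-bit scalar on PowerPC, the _si64
   arithmetic, shift and logical operations here map directly to single
   64-bit integer instructions.  */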

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
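  /* The POWER6 cmpb instruction compares the two doublewords byte by
     byte, setting each result byte to 0xff where the bytes are equal
     and to 0x00 where they differ, which matches pcmpeqb exactly.  */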
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
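  /* vec_vmsumshm multiplies corresponding signed halfwords into 32-bit
     products and adds each adjacent pair of products (plus the zero
     accumulator), matching the pmaddwd operation.  */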
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
#endif
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

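  /* vec_vmulesh and vec_vmulosh form the full 32-bit products of the
     even and odd halfword pairs; the permute below gathers the high 16
     bits of each product back into element order.  */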
  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

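  /* A count greater than 15 shifts out all bits; the x86 semantics for
     these packed shifts require a zero result in that case, hence the
     explicit range check.  */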
  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */