1 /*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
/* This header file helps port code that uses Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector
   type, we typedef __m64 to a 64-bit unsigned long long in the MMX
   intrinsics, which works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register
   move instructions, which help make such implementations more
   efficient.

   It's the user's responsibility to determine whether the results of
   such a port are acceptable or whether further changes are needed.
   Please note that much code using Intel intrinsics CAN BE REWRITTEN
   in more portable and efficient standard C or GNU C extensions using
   64-bit scalar operations or 128-bit SSE/Altivec operations, which
   is the recommended approach. */
31 #error                                                                         \
32     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
34 
35 #ifndef _MMINTRIN_H_INCLUDED
36 #define _MMINTRIN_H_INCLUDED
37 
38 #include <altivec.h>
39 /* The Intel API is flexible enough that we must allow aliasing with other
40    vector types, and their scalar components.  */
41 typedef __attribute__((__aligned__(8))) unsigned long long __m64;
42 
43 typedef __attribute__((__aligned__(8))) union {
44   __m64 as_m64;
45   char as_char[8];
46   signed char as_signed_char[8];
47   short as_short[4];
48   int as_int[2];
49   long long as_long_long;
50   float as_float[2];
51   double as_double;
52 } __m64_union;
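
/* Example (an illustrative sketch, not part of the original header): the
   union above lets scalar code view one __m64 value lane-wise.  Here `v'
   is a hypothetical __m64 value:

     __m64_union u;
     u.as_m64 = v;
     u.as_short[2] += 1;    bumps one 16-bit lane in place
     v = u.as_m64;
*/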
53 
54 /* Empty the multimedia state.  */
55 extern __inline void
56     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
57     _mm_empty(void) {
58   /* nothing to do on PowerPC.  */
59 }
60 
61 extern __inline void
62     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
63     _m_empty(void) {
64   /* nothing to do on PowerPC.  */
65 }
66 
/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
68 extern __inline __m64
69     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70     _mm_cvtsi32_si64(int __i) {
71   return (__m64)(unsigned int)__i;
72 }
73 
74 extern __inline __m64
75     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76     _m_from_int(int __i) {
77   return _mm_cvtsi32_si64(__i);
78 }
79 
80 /* Convert the lower 32 bits of the __m64 object into an integer.  */
81 extern __inline int
82     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83     _mm_cvtsi64_si32(__m64 __i) {
84   return ((int)__i);
85 }
86 
87 extern __inline int
88     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89     _m_to_int(__m64 __i) {
90   return _mm_cvtsi64_si32(__i);
91 }
92 
93 /* Convert I to a __m64 object.  */
94 
95 /* Intel intrinsic.  */
96 extern __inline __m64
97     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98     _m_from_int64(long long __i) {
99   return (__m64)__i;
100 }
101 
102 extern __inline __m64
103     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104     _mm_cvtsi64_m64(long long __i) {
105   return (__m64)__i;
106 }
107 
108 /* Microsoft intrinsic.  */
109 extern __inline __m64
110     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
111     _mm_cvtsi64x_si64(long long __i) {
112   return (__m64)__i;
113 }
114 
115 extern __inline __m64
116     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117     _mm_set_pi64x(long long __i) {
118   return (__m64)__i;
119 }
120 
/* Convert the __m64 object to a 64-bit integer.  */
122 
123 /* Intel intrinsic.  */
124 extern __inline long long
125     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126     _m_to_int64(__m64 __i) {
127   return (long long)__i;
128 }
129 
130 extern __inline long long
131     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132     _mm_cvtm64_si64(__m64 __i) {
133   return (long long)__i;
134 }
135 
136 /* Microsoft intrinsic.  */
137 extern __inline long long
138     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139     _mm_cvtsi64_si64x(__m64 __i) {
140   return (long long)__i;
141 }
142 
143 #ifdef _ARCH_PWR8
144 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
145    the result, and the four 16-bit values from M2 into the upper four 8-bit
146    values of the result, all with signed saturation.  */
147 extern __inline __m64
148     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149     _mm_packs_pi16(__m64 __m1, __m64 __m2) {
150   __vector signed short vm1;
151   __vector signed char vresult;
152 
153   vm1 = (__vector signed short)(__vector unsigned long long)
154 #ifdef __LITTLE_ENDIAN__
155       {__m1, __m2};
156 #else
157       {__m2, __m1};
158 #endif
159   vresult = vec_packs(vm1, vm1);
160   return (__m64)((__vector long long)vresult)[0];
161 }
162 
163 extern __inline __m64
164     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
165     _m_packsswb(__m64 __m1, __m64 __m2) {
166   return _mm_packs_pi16(__m1, __m2);
167 }
168 
/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
172 extern __inline __m64
173     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
174     _mm_packs_pi32(__m64 __m1, __m64 __m2) {
175   __vector signed int vm1;
176   __vector signed short vresult;
177 
178   vm1 = (__vector signed int)(__vector unsigned long long)
179 #ifdef __LITTLE_ENDIAN__
180       {__m1, __m2};
181 #else
182       {__m2, __m1};
183 #endif
184   vresult = vec_packs(vm1, vm1);
185   return (__m64)((__vector long long)vresult)[0];
186 }
187 
188 extern __inline __m64
189     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190     _m_packssdw(__m64 __m1, __m64 __m2) {
191   return _mm_packs_pi32(__m1, __m2);
192 }
193 
194 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
195    the result, and the four 16-bit values from M2 into the upper four 8-bit
196    values of the result, all with unsigned saturation.  */
197 extern __inline __m64
198     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
199     _mm_packs_pu16(__m64 __m1, __m64 __m2) {
200   __vector unsigned char r;
201   __vector signed short vm1 = (__vector signed short)(__vector long long)
202 #ifdef __LITTLE_ENDIAN__
203       {__m1, __m2};
204 #else
205       {__m2, __m1};
206 #endif
207   const __vector signed short __zero = {0};
208   __vector __bool short __select = vec_cmplt(vm1, __zero);
209   r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
210   __vector __bool char packsel = vec_pack(__select, __select);
211   r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
212   return (__m64)((__vector long long)r)[0];
213 }
214 
215 extern __inline __m64
216     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
217     _m_packuswb(__m64 __m1, __m64 __m2) {
218   return _mm_packs_pu16(__m1, __m2);
219 }
#endif /* _ARCH_PWR8 */
221 
222 /* Interleave the four 8-bit values from the high half of M1 with the four
223    8-bit values from the high half of M2.  */
224 extern __inline __m64
225     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
226     _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
227 #if _ARCH_PWR8
228   __vector unsigned char a, b, c;
229 
230   a = (__vector unsigned char)vec_splats(__m1);
231   b = (__vector unsigned char)vec_splats(__m2);
232   c = vec_mergel(a, b);
233   return (__m64)((__vector long long)c)[1];
234 #else
235   __m64_union m1, m2, res;
236 
237   m1.as_m64 = __m1;
238   m2.as_m64 = __m2;
239 
240   res.as_char[0] = m1.as_char[4];
241   res.as_char[1] = m2.as_char[4];
242   res.as_char[2] = m1.as_char[5];
243   res.as_char[3] = m2.as_char[5];
244   res.as_char[4] = m1.as_char[6];
245   res.as_char[5] = m2.as_char[6];
246   res.as_char[6] = m1.as_char[7];
247   res.as_char[7] = m2.as_char[7];
248 
249   return (__m64)res.as_m64;
250 #endif
251 }
252 
253 extern __inline __m64
254     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
255     _m_punpckhbw(__m64 __m1, __m64 __m2) {
256   return _mm_unpackhi_pi8(__m1, __m2);
257 }
258 
259 /* Interleave the two 16-bit values from the high half of M1 with the two
260    16-bit values from the high half of M2.  */
261 extern __inline __m64
262     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263     _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
264   __m64_union m1, m2, res;
265 
266   m1.as_m64 = __m1;
267   m2.as_m64 = __m2;
268 
269   res.as_short[0] = m1.as_short[2];
270   res.as_short[1] = m2.as_short[2];
271   res.as_short[2] = m1.as_short[3];
272   res.as_short[3] = m2.as_short[3];
273 
274   return (__m64)res.as_m64;
275 }
276 
277 extern __inline __m64
278     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
279     _m_punpckhwd(__m64 __m1, __m64 __m2) {
280   return _mm_unpackhi_pi16(__m1, __m2);
281 }
282 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
283    value from the high half of M2.  */
284 extern __inline __m64
285     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
286     _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
287   __m64_union m1, m2, res;
288 
289   m1.as_m64 = __m1;
290   m2.as_m64 = __m2;
291 
292   res.as_int[0] = m1.as_int[1];
293   res.as_int[1] = m2.as_int[1];
294 
295   return (__m64)res.as_m64;
296 }
297 
298 extern __inline __m64
299     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300     _m_punpckhdq(__m64 __m1, __m64 __m2) {
301   return _mm_unpackhi_pi32(__m1, __m2);
302 }
303 /* Interleave the four 8-bit values from the low half of M1 with the four
304    8-bit values from the low half of M2.  */
305 extern __inline __m64
306     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
307     _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
308 #if _ARCH_PWR8
309   __vector unsigned char a, b, c;
310 
311   a = (__vector unsigned char)vec_splats(__m1);
312   b = (__vector unsigned char)vec_splats(__m2);
313   c = vec_mergel(a, b);
314   return (__m64)((__vector long long)c)[0];
315 #else
316   __m64_union m1, m2, res;
317 
318   m1.as_m64 = __m1;
319   m2.as_m64 = __m2;
320 
321   res.as_char[0] = m1.as_char[0];
322   res.as_char[1] = m2.as_char[0];
323   res.as_char[2] = m1.as_char[1];
324   res.as_char[3] = m2.as_char[1];
325   res.as_char[4] = m1.as_char[2];
326   res.as_char[5] = m2.as_char[2];
327   res.as_char[6] = m1.as_char[3];
328   res.as_char[7] = m2.as_char[3];
329 
330   return (__m64)res.as_m64;
331 #endif
332 }
333 
334 extern __inline __m64
335     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
336     _m_punpcklbw(__m64 __m1, __m64 __m2) {
337   return _mm_unpacklo_pi8(__m1, __m2);
338 }
339 /* Interleave the two 16-bit values from the low half of M1 with the two
340    16-bit values from the low half of M2.  */
341 extern __inline __m64
342     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343     _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
344   __m64_union m1, m2, res;
345 
346   m1.as_m64 = __m1;
347   m2.as_m64 = __m2;
348 
349   res.as_short[0] = m1.as_short[0];
350   res.as_short[1] = m2.as_short[0];
351   res.as_short[2] = m1.as_short[1];
352   res.as_short[3] = m2.as_short[1];
353 
354   return (__m64)res.as_m64;
355 }
356 
357 extern __inline __m64
358     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
359     _m_punpcklwd(__m64 __m1, __m64 __m2) {
360   return _mm_unpacklo_pi16(__m1, __m2);
361 }
362 
363 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
364    value from the low half of M2.  */
365 extern __inline __m64
366     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367     _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
368   __m64_union m1, m2, res;
369 
370   m1.as_m64 = __m1;
371   m2.as_m64 = __m2;
372 
373   res.as_int[0] = m1.as_int[0];
374   res.as_int[1] = m2.as_int[0];
375 
376   return (__m64)res.as_m64;
377 }
378 
379 extern __inline __m64
380     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381     _m_punpckldq(__m64 __m1, __m64 __m2) {
382   return _mm_unpacklo_pi32(__m1, __m2);
383 }
384 
385 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
386 extern __inline __m64
387     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388     _mm_add_pi8(__m64 __m1, __m64 __m2) {
389 #if _ARCH_PWR8
390   __vector signed char a, b, c;
391 
392   a = (__vector signed char)vec_splats(__m1);
393   b = (__vector signed char)vec_splats(__m2);
394   c = vec_add(a, b);
395   return (__m64)((__vector long long)c)[0];
396 #else
397   __m64_union m1, m2, res;
398 
399   m1.as_m64 = __m1;
400   m2.as_m64 = __m2;
401 
402   res.as_char[0] = m1.as_char[0] + m2.as_char[0];
403   res.as_char[1] = m1.as_char[1] + m2.as_char[1];
404   res.as_char[2] = m1.as_char[2] + m2.as_char[2];
405   res.as_char[3] = m1.as_char[3] + m2.as_char[3];
406   res.as_char[4] = m1.as_char[4] + m2.as_char[4];
407   res.as_char[5] = m1.as_char[5] + m2.as_char[5];
408   res.as_char[6] = m1.as_char[6] + m2.as_char[6];
409   res.as_char[7] = m1.as_char[7] + m2.as_char[7];
410 
411   return (__m64)res.as_m64;
412 #endif
413 }
414 
415 extern __inline __m64
416     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
417     _m_paddb(__m64 __m1, __m64 __m2) {
418   return _mm_add_pi8(__m1, __m2);
419 }
420 
421 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
422 extern __inline __m64
423     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
424     _mm_add_pi16(__m64 __m1, __m64 __m2) {
425 #if _ARCH_PWR8
426   __vector signed short a, b, c;
427 
428   a = (__vector signed short)vec_splats(__m1);
429   b = (__vector signed short)vec_splats(__m2);
430   c = vec_add(a, b);
431   return (__m64)((__vector long long)c)[0];
432 #else
433   __m64_union m1, m2, res;
434 
435   m1.as_m64 = __m1;
436   m2.as_m64 = __m2;
437 
438   res.as_short[0] = m1.as_short[0] + m2.as_short[0];
439   res.as_short[1] = m1.as_short[1] + m2.as_short[1];
440   res.as_short[2] = m1.as_short[2] + m2.as_short[2];
441   res.as_short[3] = m1.as_short[3] + m2.as_short[3];
442 
443   return (__m64)res.as_m64;
444 #endif
445 }
446 
447 extern __inline __m64
448     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449     _m_paddw(__m64 __m1, __m64 __m2) {
450   return _mm_add_pi16(__m1, __m2);
451 }
452 
453 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
454 extern __inline __m64
455     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
456     _mm_add_pi32(__m64 __m1, __m64 __m2) {
457 #if _ARCH_PWR9
458   __vector signed int a, b, c;
459 
460   a = (__vector signed int)vec_splats(__m1);
461   b = (__vector signed int)vec_splats(__m2);
462   c = vec_add(a, b);
463   return (__m64)((__vector long long)c)[0];
464 #else
465   __m64_union m1, m2, res;
466 
467   m1.as_m64 = __m1;
468   m2.as_m64 = __m2;
469 
470   res.as_int[0] = m1.as_int[0] + m2.as_int[0];
471   res.as_int[1] = m1.as_int[1] + m2.as_int[1];
472 
473   return (__m64)res.as_m64;
474 #endif
475 }
476 
477 extern __inline __m64
478     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
479     _m_paddd(__m64 __m1, __m64 __m2) {
480   return _mm_add_pi32(__m1, __m2);
481 }
482 
483 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
484 extern __inline __m64
485     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486     _mm_sub_pi8(__m64 __m1, __m64 __m2) {
487 #if _ARCH_PWR8
488   __vector signed char a, b, c;
489 
490   a = (__vector signed char)vec_splats(__m1);
491   b = (__vector signed char)vec_splats(__m2);
492   c = vec_sub(a, b);
493   return (__m64)((__vector long long)c)[0];
494 #else
495   __m64_union m1, m2, res;
496 
497   m1.as_m64 = __m1;
498   m2.as_m64 = __m2;
499 
500   res.as_char[0] = m1.as_char[0] - m2.as_char[0];
501   res.as_char[1] = m1.as_char[1] - m2.as_char[1];
502   res.as_char[2] = m1.as_char[2] - m2.as_char[2];
503   res.as_char[3] = m1.as_char[3] - m2.as_char[3];
504   res.as_char[4] = m1.as_char[4] - m2.as_char[4];
505   res.as_char[5] = m1.as_char[5] - m2.as_char[5];
506   res.as_char[6] = m1.as_char[6] - m2.as_char[6];
507   res.as_char[7] = m1.as_char[7] - m2.as_char[7];
508 
509   return (__m64)res.as_m64;
510 #endif
511 }
512 
513 extern __inline __m64
514     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515     _m_psubb(__m64 __m1, __m64 __m2) {
516   return _mm_sub_pi8(__m1, __m2);
517 }
518 
519 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
520 extern __inline __m64
521     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
522     _mm_sub_pi16(__m64 __m1, __m64 __m2) {
523 #if _ARCH_PWR8
524   __vector signed short a, b, c;
525 
526   a = (__vector signed short)vec_splats(__m1);
527   b = (__vector signed short)vec_splats(__m2);
528   c = vec_sub(a, b);
529   return (__m64)((__vector long long)c)[0];
530 #else
531   __m64_union m1, m2, res;
532 
533   m1.as_m64 = __m1;
534   m2.as_m64 = __m2;
535 
536   res.as_short[0] = m1.as_short[0] - m2.as_short[0];
537   res.as_short[1] = m1.as_short[1] - m2.as_short[1];
538   res.as_short[2] = m1.as_short[2] - m2.as_short[2];
539   res.as_short[3] = m1.as_short[3] - m2.as_short[3];
540 
541   return (__m64)res.as_m64;
542 #endif
543 }
544 
545 extern __inline __m64
546     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
547     _m_psubw(__m64 __m1, __m64 __m2) {
548   return _mm_sub_pi16(__m1, __m2);
549 }
550 
551 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
552 extern __inline __m64
553     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554     _mm_sub_pi32(__m64 __m1, __m64 __m2) {
555 #if _ARCH_PWR9
556   __vector signed int a, b, c;
557 
558   a = (__vector signed int)vec_splats(__m1);
559   b = (__vector signed int)vec_splats(__m2);
560   c = vec_sub(a, b);
561   return (__m64)((__vector long long)c)[0];
562 #else
563   __m64_union m1, m2, res;
564 
565   m1.as_m64 = __m1;
566   m2.as_m64 = __m2;
567 
568   res.as_int[0] = m1.as_int[0] - m2.as_int[0];
569   res.as_int[1] = m1.as_int[1] - m2.as_int[1];
570 
571   return (__m64)res.as_m64;
572 #endif
573 }
574 
575 extern __inline __m64
576     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
577     _m_psubd(__m64 __m1, __m64 __m2) {
578   return _mm_sub_pi32(__m1, __m2);
579 }
580 
581 extern __inline __m64
582     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583     _mm_add_si64(__m64 __m1, __m64 __m2) {
584   return (__m1 + __m2);
585 }
586 
587 extern __inline __m64
588     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589     _mm_sub_si64(__m64 __m1, __m64 __m2) {
590   return (__m1 - __m2);
591 }
592 
593 /* Shift the 64-bit value in M left by COUNT.  */
594 extern __inline __m64
595     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
596     _mm_sll_si64(__m64 __m, __m64 __count) {
597   return (__m << __count);
598 }
599 
600 extern __inline __m64
601     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
602     _m_psllq(__m64 __m, __m64 __count) {
603   return _mm_sll_si64(__m, __count);
604 }
605 
606 extern __inline __m64
607     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608     _mm_slli_si64(__m64 __m, const int __count) {
609   return (__m << __count);
610 }
611 
612 extern __inline __m64
613     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
614     _m_psllqi(__m64 __m, const int __count) {
615   return _mm_slli_si64(__m, __count);
616 }
617 
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
619 extern __inline __m64
620     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621     _mm_srl_si64(__m64 __m, __m64 __count) {
622   return (__m >> __count);
623 }
624 
625 extern __inline __m64
626     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627     _m_psrlq(__m64 __m, __m64 __count) {
628   return _mm_srl_si64(__m, __count);
629 }
630 
631 extern __inline __m64
632     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
633     _mm_srli_si64(__m64 __m, const int __count) {
634   return (__m >> __count);
635 }
636 
637 extern __inline __m64
638     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639     _m_psrlqi(__m64 __m, const int __count) {
640   return _mm_srli_si64(__m, __count);
641 }
642 
643 /* Bit-wise AND the 64-bit values in M1 and M2.  */
644 extern __inline __m64
645     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646     _mm_and_si64(__m64 __m1, __m64 __m2) {
647   return (__m1 & __m2);
648 }
649 
650 extern __inline __m64
651     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
652     _m_pand(__m64 __m1, __m64 __m2) {
653   return _mm_and_si64(__m1, __m2);
654 }
655 
656 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
657    64-bit value in M2.  */
658 extern __inline __m64
659     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660     _mm_andnot_si64(__m64 __m1, __m64 __m2) {
661   return (~__m1 & __m2);
662 }
663 
664 extern __inline __m64
665     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666     _m_pandn(__m64 __m1, __m64 __m2) {
667   return _mm_andnot_si64(__m1, __m2);
668 }
669 
670 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
671 extern __inline __m64
672     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
673     _mm_or_si64(__m64 __m1, __m64 __m2) {
674   return (__m1 | __m2);
675 }
676 
677 extern __inline __m64
678     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
679     _m_por(__m64 __m1, __m64 __m2) {
680   return _mm_or_si64(__m1, __m2);
681 }
682 
683 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
684 extern __inline __m64
685     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
686     _mm_xor_si64(__m64 __m1, __m64 __m2) {
687   return (__m1 ^ __m2);
688 }
689 
690 extern __inline __m64
691     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692     _m_pxor(__m64 __m1, __m64 __m2) {
693   return _mm_xor_si64(__m1, __m2);
694 }
695 
696 /* Creates a 64-bit zero.  */
697 extern __inline __m64
698     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
699     _mm_setzero_si64(void) {
700   return (__m64)0;
701 }
702 
703 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
704    test is true and zero if false.  */
705 extern __inline __m64
706     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
707     _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
708 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
709   __m64 res;
710   __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
711   return (res);
712 #else
713   __m64_union m1, m2, res;
714 
715   m1.as_m64 = __m1;
716   m2.as_m64 = __m2;
717 
718   res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
719   res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
720   res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
721   res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
722   res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
723   res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
724   res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
725   res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
726 
727   return (__m64)res.as_m64;
728 #endif
729 }
730 
731 extern __inline __m64
732     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
733     _m_pcmpeqb(__m64 __m1, __m64 __m2) {
734   return _mm_cmpeq_pi8(__m1, __m2);
735 }
736 
737 extern __inline __m64
738     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
739     _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
740 #if _ARCH_PWR8
741   __vector signed char a, b, c;
742 
743   a = (__vector signed char)vec_splats(__m1);
744   b = (__vector signed char)vec_splats(__m2);
745   c = (__vector signed char)vec_cmpgt(a, b);
746   return (__m64)((__vector long long)c)[0];
747 #else
748   __m64_union m1, m2, res;
749 
750   m1.as_m64 = __m1;
751   m2.as_m64 = __m2;
752 
753   res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
754   res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
755   res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
756   res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
757   res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
758   res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
759   res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
760   res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
761 
762   return (__m64)res.as_m64;
763 #endif
764 }
765 
766 extern __inline __m64
767     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
768     _m_pcmpgtb(__m64 __m1, __m64 __m2) {
769   return _mm_cmpgt_pi8(__m1, __m2);
770 }
771 
772 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
773    the test is true and zero if false.  */
774 extern __inline __m64
775     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776     _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
777 #if _ARCH_PWR8
778   __vector signed short a, b, c;
779 
780   a = (__vector signed short)vec_splats(__m1);
781   b = (__vector signed short)vec_splats(__m2);
782   c = (__vector signed short)vec_cmpeq(a, b);
783   return (__m64)((__vector long long)c)[0];
784 #else
785   __m64_union m1, m2, res;
786 
787   m1.as_m64 = __m1;
788   m2.as_m64 = __m2;
789 
790   res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
791   res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
792   res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
793   res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
794 
795   return (__m64)res.as_m64;
796 #endif
797 }
798 
799 extern __inline __m64
800     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
801     _m_pcmpeqw(__m64 __m1, __m64 __m2) {
802   return _mm_cmpeq_pi16(__m1, __m2);
803 }
804 
805 extern __inline __m64
806     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807     _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
808 #if _ARCH_PWR8
809   __vector signed short a, b, c;
810 
811   a = (__vector signed short)vec_splats(__m1);
812   b = (__vector signed short)vec_splats(__m2);
813   c = (__vector signed short)vec_cmpgt(a, b);
814   return (__m64)((__vector long long)c)[0];
815 #else
816   __m64_union m1, m2, res;
817 
818   m1.as_m64 = __m1;
819   m2.as_m64 = __m2;
820 
821   res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
822   res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
823   res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
824   res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
825 
826   return (__m64)res.as_m64;
827 #endif
828 }
829 
830 extern __inline __m64
831     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
832     _m_pcmpgtw(__m64 __m1, __m64 __m2) {
833   return _mm_cmpgt_pi16(__m1, __m2);
834 }
835 
836 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
837    the test is true and zero if false.  */
838 extern __inline __m64
839     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
840     _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
841 #if _ARCH_PWR9
842   __vector signed int a, b, c;
843 
844   a = (__vector signed int)vec_splats(__m1);
845   b = (__vector signed int)vec_splats(__m2);
846   c = (__vector signed int)vec_cmpeq(a, b);
847   return (__m64)((__vector long long)c)[0];
848 #else
849   __m64_union m1, m2, res;
850 
851   m1.as_m64 = __m1;
852   m2.as_m64 = __m2;
853 
854   res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
855   res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
856 
857   return (__m64)res.as_m64;
858 #endif
859 }
860 
861 extern __inline __m64
862     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
863     _m_pcmpeqd(__m64 __m1, __m64 __m2) {
864   return _mm_cmpeq_pi32(__m1, __m2);
865 }
866 
867 extern __inline __m64
868     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
869     _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
870 #if _ARCH_PWR9
871   __vector signed int a, b, c;
872 
873   a = (__vector signed int)vec_splats(__m1);
874   b = (__vector signed int)vec_splats(__m2);
875   c = (__vector signed int)vec_cmpgt(a, b);
876   return (__m64)((__vector long long)c)[0];
877 #else
878   __m64_union m1, m2, res;
879 
880   m1.as_m64 = __m1;
881   m2.as_m64 = __m2;
882 
883   res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
884   res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
885 
886   return (__m64)res.as_m64;
887 #endif
888 }
889 
890 extern __inline __m64
891     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
892     _m_pcmpgtd(__m64 __m1, __m64 __m2) {
893   return _mm_cmpgt_pi32(__m1, __m2);
894 }
895 
896 #if _ARCH_PWR8
897 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
898    saturated arithmetic.  */
899 extern __inline __m64
900     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
901     _mm_adds_pi8(__m64 __m1, __m64 __m2) {
902   __vector signed char a, b, c;
903 
904   a = (__vector signed char)vec_splats(__m1);
905   b = (__vector signed char)vec_splats(__m2);
906   c = vec_adds(a, b);
907   return (__m64)((__vector long long)c)[0];
908 }
909 
910 extern __inline __m64
911     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
912     _m_paddsb(__m64 __m1, __m64 __m2) {
913   return _mm_adds_pi8(__m1, __m2);
914 }
915 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
916    saturated arithmetic.  */
917 extern __inline __m64
918     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
919     _mm_adds_pi16(__m64 __m1, __m64 __m2) {
920   __vector signed short a, b, c;
921 
922   a = (__vector signed short)vec_splats(__m1);
923   b = (__vector signed short)vec_splats(__m2);
924   c = vec_adds(a, b);
925   return (__m64)((__vector long long)c)[0];
926 }
927 
928 extern __inline __m64
929     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
930     _m_paddsw(__m64 __m1, __m64 __m2) {
931   return _mm_adds_pi16(__m1, __m2);
932 }
933 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
934    saturated arithmetic.  */
935 extern __inline __m64
936     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937     _mm_adds_pu8(__m64 __m1, __m64 __m2) {
938   __vector unsigned char a, b, c;
939 
940   a = (__vector unsigned char)vec_splats(__m1);
941   b = (__vector unsigned char)vec_splats(__m2);
942   c = vec_adds(a, b);
943   return (__m64)((__vector long long)c)[0];
944 }
945 
946 extern __inline __m64
947     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
948     _m_paddusb(__m64 __m1, __m64 __m2) {
949   return _mm_adds_pu8(__m1, __m2);
950 }
951 
952 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
953    saturated arithmetic.  */
954 extern __inline __m64
955     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
956     _mm_adds_pu16(__m64 __m1, __m64 __m2) {
957   __vector unsigned short a, b, c;
958 
959   a = (__vector unsigned short)vec_splats(__m1);
960   b = (__vector unsigned short)vec_splats(__m2);
961   c = vec_adds(a, b);
962   return (__m64)((__vector long long)c)[0];
963 }
964 
965 extern __inline __m64
966     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
967     _m_paddusw(__m64 __m1, __m64 __m2) {
968   return _mm_adds_pu16(__m1, __m2);
969 }
970 
971 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
972    saturating arithmetic.  */
973 extern __inline __m64
974     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975     _mm_subs_pi8(__m64 __m1, __m64 __m2) {
976   __vector signed char a, b, c;
977 
978   a = (__vector signed char)vec_splats(__m1);
979   b = (__vector signed char)vec_splats(__m2);
980   c = vec_subs(a, b);
981   return (__m64)((__vector long long)c)[0];
982 }
983 
984 extern __inline __m64
985     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986     _m_psubsb(__m64 __m1, __m64 __m2) {
987   return _mm_subs_pi8(__m1, __m2);
988 }
989 
990 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
991    signed saturating arithmetic.  */
992 extern __inline __m64
993     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
994     _mm_subs_pi16(__m64 __m1, __m64 __m2) {
995   __vector signed short a, b, c;
996 
997   a = (__vector signed short)vec_splats(__m1);
998   b = (__vector signed short)vec_splats(__m2);
999   c = vec_subs(a, b);
1000   return (__m64)((__vector long long)c)[0];
1001 }
1002 
1003 extern __inline __m64
1004     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005     _m_psubsw(__m64 __m1, __m64 __m2) {
1006   return _mm_subs_pi16(__m1, __m2);
1007 }
1008 
1009 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1010    unsigned saturating arithmetic.  */
1011 extern __inline __m64
1012     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013     _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1014   __vector unsigned char a, b, c;
1015 
1016   a = (__vector unsigned char)vec_splats(__m1);
1017   b = (__vector unsigned char)vec_splats(__m2);
1018   c = vec_subs(a, b);
1019   return (__m64)((__vector long long)c)[0];
1020 }
1021 
1022 extern __inline __m64
1023     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1024     _m_psubusb(__m64 __m1, __m64 __m2) {
1025   return _mm_subs_pu8(__m1, __m2);
1026 }
1027 
1028 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1029    unsigned saturating arithmetic.  */
1030 extern __inline __m64
1031     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032     _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1033   __vector unsigned short a, b, c;
1034 
1035   a = (__vector unsigned short)vec_splats(__m1);
1036   b = (__vector unsigned short)vec_splats(__m2);
1037   c = vec_subs(a, b);
1038   return (__m64)((__vector long long)c)[0];
1039 }
1040 
1041 extern __inline __m64
1042     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1043     _m_psubusw(__m64 __m1, __m64 __m2) {
1044   return _mm_subs_pu16(__m1, __m2);
1045 }
1046 
1047 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1048    four 32-bit intermediate results, which are then summed by pairs to
1049    produce two 32-bit results.  */
1050 extern __inline __m64
1051     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1052     _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1053   __vector signed short a, b;
1054   __vector signed int c;
1055   __vector signed int zero = {0, 0, 0, 0};
1056 
1057   a = (__vector signed short)vec_splats(__m1);
1058   b = (__vector signed short)vec_splats(__m2);
1059   c = vec_vmsumshm(a, b, zero);
1060   return (__m64)((__vector long long)c)[0];
1061 }
1062 
1063 extern __inline __m64
1064     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065     _m_pmaddwd(__m64 __m1, __m64 __m2) {
1066   return _mm_madd_pi16(__m1, __m2);
1067 }
1068 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1069    M2 and produce the high 16 bits of the 32-bit results.  */
1070 extern __inline __m64
1071     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072     _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1073   __vector signed short a, b;
1074   __vector signed short c;
1075   __vector signed int w0, w1;
1076   __vector unsigned char xform1 = {
1077 #ifdef __LITTLE_ENDIAN__
1078       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1079       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1080 #else
1081       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1082       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1083 #endif
1084   };
1085 
1086   a = (__vector signed short)vec_splats(__m1);
1087   b = (__vector signed short)vec_splats(__m2);
1088 
1089   w0 = vec_vmulesh(a, b);
1090   w1 = vec_vmulosh(a, b);
1091   c = (__vector signed short)vec_perm(w0, w1, xform1);
1092 
1093   return (__m64)((__vector long long)c)[0];
1094 }
1095 
1096 extern __inline __m64
1097     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098     _m_pmulhw(__m64 __m1, __m64 __m2) {
1099   return _mm_mulhi_pi16(__m1, __m2);
1100 }
1101 
1102 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1103    the low 16 bits of the results.  */
1104 extern __inline __m64
1105     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1106     _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1107   __vector signed short a, b, c;
1108 
1109   a = (__vector signed short)vec_splats(__m1);
1110   b = (__vector signed short)vec_splats(__m2);
1111   c = a * b;
1112   return (__m64)((__vector long long)c)[0];
1113 }
1114 
1115 extern __inline __m64
1116     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1117     _m_pmullw(__m64 __m1, __m64 __m2) {
1118   return _mm_mullo_pi16(__m1, __m2);
1119 }
1120 
1121 /* Shift four 16-bit values in M left by COUNT.  */
1122 extern __inline __m64
1123     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1124     _mm_sll_pi16(__m64 __m, __m64 __count) {
1125   __vector signed short m, r;
1126   __vector unsigned short c;
1127 
1128   if (__count <= 15) {
1129     m = (__vector signed short)vec_splats(__m);
1130     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1131     r = vec_sl(m, (__vector unsigned short)c);
1132     return (__m64)((__vector long long)r)[0];
1133   } else
1134     return (0);
1135 }
1136 
1137 extern __inline __m64
1138     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1139     _m_psllw(__m64 __m, __m64 __count) {
1140   return _mm_sll_pi16(__m, __count);
1141 }
1142 
1143 extern __inline __m64
1144     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1145     _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sll_pi16.  */
1147   return _mm_sll_pi16(__m, __count);
1148 }
1149 
1150 extern __inline __m64
1151     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1152     _m_psllwi(__m64 __m, int __count) {
1153   return _mm_slli_pi16(__m, __count);
1154 }
1155 
1156 /* Shift two 32-bit values in M left by COUNT.  */
1157 extern __inline __m64
1158     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159     _mm_sll_pi32(__m64 __m, __m64 __count) {
1160   __m64_union m, res;
1161 
1162   m.as_m64 = __m;
1163 
1164   res.as_int[0] = m.as_int[0] << __count;
1165   res.as_int[1] = m.as_int[1] << __count;
1166   return (res.as_m64);
1167 }
1168 
1169 extern __inline __m64
1170     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1171     _m_pslld(__m64 __m, __m64 __count) {
1172   return _mm_sll_pi32(__m, __count);
1173 }
1174 
1175 extern __inline __m64
1176     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1177     _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sll_pi32.  */
1179   return _mm_sll_pi32(__m, __count);
1180 }
1181 
1182 extern __inline __m64
1183     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184     _m_pslldi(__m64 __m, int __count) {
1185   return _mm_slli_pi32(__m, __count);
1186 }
1187 
1188 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1189 extern __inline __m64
1190     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1191     _mm_sra_pi16(__m64 __m, __m64 __count) {
1192   __vector signed short m, r;
1193   __vector unsigned short c;
1194 
1195   if (__count <= 15) {
1196     m = (__vector signed short)vec_splats(__m);
1197     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1198     r = vec_sra(m, (__vector unsigned short)c);
1199     return (__m64)((__vector long long)r)[0];
1200   } else
1201     return (0);
1202 }
1203 
1204 extern __inline __m64
1205     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206     _m_psraw(__m64 __m, __m64 __count) {
1207   return _mm_sra_pi16(__m, __count);
1208 }
1209 
1210 extern __inline __m64
1211     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212     _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sra_pi16.  */
1214   return _mm_sra_pi16(__m, __count);
1215 }
1216 
1217 extern __inline __m64
1218     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1219     _m_psrawi(__m64 __m, int __count) {
1220   return _mm_srai_pi16(__m, __count);
1221 }
1222 
1223 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1224 extern __inline __m64
1225     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226     _mm_sra_pi32(__m64 __m, __m64 __count) {
1227   __m64_union m, res;
1228 
1229   m.as_m64 = __m;
1230 
1231   res.as_int[0] = m.as_int[0] >> __count;
1232   res.as_int[1] = m.as_int[1] >> __count;
1233   return (res.as_m64);
1234 }
1235 
1236 extern __inline __m64
1237     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1238     _m_psrad(__m64 __m, __m64 __count) {
1239   return _mm_sra_pi32(__m, __count);
1240 }
1241 
1242 extern __inline __m64
1243     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244     _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sra_pi32.  */
1246   return _mm_sra_pi32(__m, __count);
1247 }
1248 
1249 extern __inline __m64
1250     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251     _m_psradi(__m64 __m, int __count) {
1252   return _mm_srai_pi32(__m, __count);
1253 }
1254 
1255 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1256 extern __inline __m64
1257     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1258     _mm_srl_pi16(__m64 __m, __m64 __count) {
1259   __vector unsigned short m, r;
1260   __vector unsigned short c;
1261 
1262   if (__count <= 15) {
1263     m = (__vector unsigned short)vec_splats(__m);
1264     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1265     r = vec_sr(m, (__vector unsigned short)c);
1266     return (__m64)((__vector long long)r)[0];
1267   } else
1268     return (0);
1269 }
1270 
1271 extern __inline __m64
1272     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1273     _m_psrlw(__m64 __m, __m64 __count) {
1274   return _mm_srl_pi16(__m, __count);
1275 }
1276 
1277 extern __inline __m64
1278     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1279     _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_srl_pi16.  */
1281   return _mm_srl_pi16(__m, __count);
1282 }
1283 
1284 extern __inline __m64
1285     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286     _m_psrlwi(__m64 __m, int __count) {
1287   return _mm_srli_pi16(__m, __count);
1288 }
1289 
1290 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1291 extern __inline __m64
1292     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293     _mm_srl_pi32(__m64 __m, __m64 __count) {
1294   __m64_union m, res;
1295 
1296   m.as_m64 = __m;
1297 
1298   res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1299   res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1300   return (res.as_m64);
1301 }
1302 
1303 extern __inline __m64
1304     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305     _m_psrld(__m64 __m, __m64 __count) {
1306   return _mm_srl_pi32(__m, __count);
1307 }
1308 
1309 extern __inline __m64
1310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311     _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_srl_pi32.  */
1313   return _mm_srl_pi32(__m, __count);
1314 }
1315 
1316 extern __inline __m64
1317     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318     _m_psrldi(__m64 __m, int __count) {
1319   return _mm_srli_pi32(__m, __count);
1320 }
1321 #endif /* _ARCH_PWR8 */
1322 
1323 /* Creates a vector of two 32-bit values; I0 is least significant.  */
1324 extern __inline __m64
1325     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326     _mm_set_pi32(int __i1, int __i0) {
1327   __m64_union res;
1328 
1329   res.as_int[0] = __i0;
1330   res.as_int[1] = __i1;
1331   return (res.as_m64);
1332 }
1333 
1334 /* Creates a vector of four 16-bit values; W0 is least significant.  */
1335 extern __inline __m64
1336     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337     _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1338   __m64_union res;
1339 
1340   res.as_short[0] = __w0;
1341   res.as_short[1] = __w1;
1342   res.as_short[2] = __w2;
1343   res.as_short[3] = __w3;
1344   return (res.as_m64);
1345 }
1346 
1347 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
1348 extern __inline __m64
1349     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1350     _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1351                 char __b2, char __b1, char __b0) {
1352   __m64_union res;
1353 
1354   res.as_char[0] = __b0;
1355   res.as_char[1] = __b1;
1356   res.as_char[2] = __b2;
1357   res.as_char[3] = __b3;
1358   res.as_char[4] = __b4;
1359   res.as_char[5] = __b5;
1360   res.as_char[6] = __b6;
1361   res.as_char[7] = __b7;
1362   return (res.as_m64);
1363 }
1364 
1365 /* Similar, but with the arguments in reverse order.  */
1366 extern __inline __m64
1367     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1368     _mm_setr_pi32(int __i0, int __i1) {
1369   __m64_union res;
1370 
1371   res.as_int[0] = __i0;
1372   res.as_int[1] = __i1;
1373   return (res.as_m64);
1374 }
1375 
1376 extern __inline __m64
1377     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1378     _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1379   return _mm_set_pi16(__w3, __w2, __w1, __w0);
1380 }
1381 
1382 extern __inline __m64
1383     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384     _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1385                  char __b5, char __b6, char __b7) {
1386   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1387 }
1388 
1389 /* Creates a vector of two 32-bit values, both elements containing I.  */
1390 extern __inline __m64
1391     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1392     _mm_set1_pi32(int __i) {
1393   __m64_union res;
1394 
1395   res.as_int[0] = __i;
1396   res.as_int[1] = __i;
1397   return (res.as_m64);
1398 }
1399 
1400 /* Creates a vector of four 16-bit values, all elements containing W.  */
1401 extern __inline __m64
1402     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403     _mm_set1_pi16(short __w) {
1404 #if _ARCH_PWR9
1405   __vector signed short w;
1406 
1407   w = (__vector signed short)vec_splats(__w);
1408   return (__m64)((__vector long long)w)[0];
1409 #else
1410   __m64_union res;
1411 
1412   res.as_short[0] = __w;
1413   res.as_short[1] = __w;
1414   res.as_short[2] = __w;
1415   res.as_short[3] = __w;
1416   return (res.as_m64);
1417 #endif
1418 }
1419 
1420 /* Creates a vector of eight 8-bit values, all elements containing B.  */
1421 extern __inline __m64
1422     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1423     _mm_set1_pi8(signed char __b) {
1424 #if _ARCH_PWR8
1425   __vector signed char b;
1426 
1427   b = (__vector signed char)vec_splats(__b);
1428   return (__m64)((__vector long long)b)[0];
1429 #else
1430   __m64_union res;
1431 
1432   res.as_char[0] = __b;
1433   res.as_char[1] = __b;
1434   res.as_char[2] = __b;
1435   res.as_char[3] = __b;
1436   res.as_char[4] = __b;
1437   res.as_char[5] = __b;
1438   res.as_char[6] = __b;
1439   res.as_char[7] = __b;
1440   return (res.as_m64);
1441 #endif
1442 }
1443 #endif /* _MMINTRIN_H_INCLUDED */
1444