1 /*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15    explicitly from x86_64 to powerpc64/powerpc64le.
16 
17    Since PowerPC target doesn't support native 64-bit vector type, we
18    typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
19    works well for _si64 and some _pi32 operations.
20 
21    For _pi16 and _pi8 operations, it's better to transfer __m64 into
22    128-bit PowerPC vector first. Power8 introduced direct register
23    move instructions which helps for more efficient implementation.
24 
25    It's user's responsibility to determine if the results of such port
26    are acceptable or further changes are needed. Please note that much
27    code using Intel intrinsics CAN BE REWRITTEN in more portable and
28    efficient standard C or GNU C extensions with 64-bit scalar
29    operations, or 128-bit SSE/Altivec operations, which are more
30    recommended. */
31 #error                                                                         \
32     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
34 
35 #ifndef _MMINTRIN_H_INCLUDED
36 #define _MMINTRIN_H_INCLUDED
37 
38 #if defined(__ppc64__) &&                                                      \
39     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40 
41 #include <altivec.h>
42 /* The Intel API is flexible enough that we must allow aliasing with other
43    vector types, and their scalar components.  */
44 typedef __attribute__((__aligned__(8))) unsigned long long __m64;
45 
46 typedef __attribute__((__aligned__(8))) union {
47   __m64 as_m64;
48   char as_char[8];
49   signed char as_signed_char[8];
50   short as_short[4];
51   int as_int[2];
52   long long as_long_long;
53   float as_float[2];
54   double as_double;
55 } __m64_union;
56 
57 /* Empty the multimedia state.  */
58 extern __inline void
59     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
60     _mm_empty(void) {
61   /* nothing to do on PowerPC.  */
62 }
63 
64 extern __inline void
65     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
66     _m_empty(void) {
67   /* nothing to do on PowerPC.  */
68 }
69 
70 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
71 extern __inline __m64
72     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
73     _mm_cvtsi32_si64(int __i) {
74   return (__m64)(unsigned int)__i;
75 }
76 
77 extern __inline __m64
78     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79     _m_from_int(int __i) {
80   return _mm_cvtsi32_si64(__i);
81 }
82 
83 /* Convert the lower 32 bits of the __m64 object into an integer.  */
84 extern __inline int
85     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86     _mm_cvtsi64_si32(__m64 __i) {
87   return ((int)__i);
88 }
89 
90 extern __inline int
91     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
92     _m_to_int(__m64 __i) {
93   return _mm_cvtsi64_si32(__i);
94 }
95 
96 /* Convert I to a __m64 object.  */
97 
98 /* Intel intrinsic.  */
99 extern __inline __m64
100     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101     _m_from_int64(long long __i) {
102   return (__m64)__i;
103 }
104 
105 extern __inline __m64
106     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107     _mm_cvtsi64_m64(long long __i) {
108   return (__m64)__i;
109 }
110 
111 /* Microsoft intrinsic.  */
112 extern __inline __m64
113     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
114     _mm_cvtsi64x_si64(long long __i) {
115   return (__m64)__i;
116 }
117 
118 extern __inline __m64
119     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120     _mm_set_pi64x(long long __i) {
121   return (__m64)__i;
122 }
123 
124 /* Convert the __m64 object to a 64bit integer.  */
125 
126 /* Intel intrinsic.  */
127 extern __inline long long
128     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
129     _m_to_int64(__m64 __i) {
130   return (long long)__i;
131 }
132 
133 extern __inline long long
134     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135     _mm_cvtm64_si64(__m64 __i) {
136   return (long long)__i;
137 }
138 
139 /* Microsoft intrinsic.  */
140 extern __inline long long
141     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142     _mm_cvtsi64_si64x(__m64 __i) {
143   return (long long)__i;
144 }
145 
146 #ifdef _ARCH_PWR8
147 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
148    the result, and the four 16-bit values from M2 into the upper four 8-bit
149    values of the result, all with signed saturation.  */
150 extern __inline __m64
151     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
152     _mm_packs_pi16(__m64 __m1, __m64 __m2) {
153   __vector signed short __vm1;
154   __vector signed char __vresult;
155 
156   __vm1 = (__vector signed short)(__vector unsigned long long)
157 #ifdef __LITTLE_ENDIAN__
158       {__m1, __m2};
159 #else
160       {__m2, __m1};
161 #endif
162   __vresult = vec_packs(__vm1, __vm1);
163   return (__m64)((__vector long long)__vresult)[0];
164 }
165 
166 extern __inline __m64
167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168     _m_packsswb(__m64 __m1, __m64 __m2) {
169   return _mm_packs_pi16(__m1, __m2);
170 }
171 
172 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
173    the result, and the two 32-bit values from M2 into the upper two 16-bit
174    values of the result, all with signed saturation.  */
175 extern __inline __m64
176     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177     _mm_packs_pi32(__m64 __m1, __m64 __m2) {
178   __vector signed int __vm1;
179   __vector signed short __vresult;
180 
181   __vm1 = (__vector signed int)(__vector unsigned long long)
182 #ifdef __LITTLE_ENDIAN__
183       {__m1, __m2};
184 #else
185       {__m2, __m1};
186 #endif
187   __vresult = vec_packs(__vm1, __vm1);
188   return (__m64)((__vector long long)__vresult)[0];
189 }
190 
191 extern __inline __m64
192     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
193     _m_packssdw(__m64 __m1, __m64 __m2) {
194   return _mm_packs_pi32(__m1, __m2);
195 }
196 
197 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
198    the result, and the four 16-bit values from M2 into the upper four 8-bit
199    values of the result, all with unsigned saturation.  */
200 extern __inline __m64
201     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202     _mm_packs_pu16(__m64 __m1, __m64 __m2) {
203   __vector unsigned char __r;
204   __vector signed short __vm1 = (__vector signed short)(__vector long long)
205 #ifdef __LITTLE_ENDIAN__
206       {__m1, __m2};
207 #else
208       {__m2, __m1};
209 #endif
210   const __vector signed short __zero = {0};
211   __vector __bool short __select = vec_cmplt(__vm1, __zero);
212   __r =
213       vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
214   __vector __bool char __packsel = vec_pack(__select, __select);
215   __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
216   return (__m64)((__vector long long)__r)[0];
217 }
218 
219 extern __inline __m64
220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221     _m_packuswb(__m64 __m1, __m64 __m2) {
222   return _mm_packs_pu16(__m1, __m2);
223 }
224 #endif /* end ARCH_PWR8 */
225 
226 /* Interleave the four 8-bit values from the high half of M1 with the four
227    8-bit values from the high half of M2.  */
228 extern __inline __m64
229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230     _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
231 #if _ARCH_PWR8
232   __vector unsigned char __a, __b, __c;
233 
234   __a = (__vector unsigned char)vec_splats(__m1);
235   __b = (__vector unsigned char)vec_splats(__m2);
236   __c = vec_mergel(__a, __b);
237   return (__m64)((__vector long long)__c)[1];
238 #else
239   __m64_union __mu1, __mu2, __res;
240 
241   __mu1.as_m64 = __m1;
242   __mu2.as_m64 = __m2;
243 
244   __res.as_char[0] = __mu1.as_char[4];
245   __res.as_char[1] = __mu2.as_char[4];
246   __res.as_char[2] = __mu1.as_char[5];
247   __res.as_char[3] = __mu2.as_char[5];
248   __res.as_char[4] = __mu1.as_char[6];
249   __res.as_char[5] = __mu2.as_char[6];
250   __res.as_char[6] = __mu1.as_char[7];
251   __res.as_char[7] = __mu2.as_char[7];
252 
253   return (__m64)__res.as_m64;
254 #endif
255 }
256 
257 extern __inline __m64
258     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259     _m_punpckhbw(__m64 __m1, __m64 __m2) {
260   return _mm_unpackhi_pi8(__m1, __m2);
261 }
262 
263 /* Interleave the two 16-bit values from the high half of M1 with the two
264    16-bit values from the high half of M2.  */
265 extern __inline __m64
266     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267     _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
268   __m64_union __mu1, __mu2, __res;
269 
270   __mu1.as_m64 = __m1;
271   __mu2.as_m64 = __m2;
272 
273   __res.as_short[0] = __mu1.as_short[2];
274   __res.as_short[1] = __mu2.as_short[2];
275   __res.as_short[2] = __mu1.as_short[3];
276   __res.as_short[3] = __mu2.as_short[3];
277 
278   return (__m64)__res.as_m64;
279 }
280 
281 extern __inline __m64
282     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283     _m_punpckhwd(__m64 __m1, __m64 __m2) {
284   return _mm_unpackhi_pi16(__m1, __m2);
285 }
286 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
287    value from the high half of M2.  */
288 extern __inline __m64
289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290     _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
291   __m64_union __mu1, __mu2, __res;
292 
293   __mu1.as_m64 = __m1;
294   __mu2.as_m64 = __m2;
295 
296   __res.as_int[0] = __mu1.as_int[1];
297   __res.as_int[1] = __mu2.as_int[1];
298 
299   return (__m64)__res.as_m64;
300 }
301 
302 extern __inline __m64
303     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304     _m_punpckhdq(__m64 __m1, __m64 __m2) {
305   return _mm_unpackhi_pi32(__m1, __m2);
306 }
307 /* Interleave the four 8-bit values from the low half of M1 with the four
308    8-bit values from the low half of M2.  */
309 extern __inline __m64
310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311     _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
312 #if _ARCH_PWR8
313   __vector unsigned char __a, __b, __c;
314 
315   __a = (__vector unsigned char)vec_splats(__m1);
316   __b = (__vector unsigned char)vec_splats(__m2);
317   __c = vec_mergel(__a, __b);
318   return (__m64)((__vector long long)__c)[0];
319 #else
320   __m64_union __mu1, __mu2, __res;
321 
322   __mu1.as_m64 = __m1;
323   __mu2.as_m64 = __m2;
324 
325   __res.as_char[0] = __mu1.as_char[0];
326   __res.as_char[1] = __mu2.as_char[0];
327   __res.as_char[2] = __mu1.as_char[1];
328   __res.as_char[3] = __mu2.as_char[1];
329   __res.as_char[4] = __mu1.as_char[2];
330   __res.as_char[5] = __mu2.as_char[2];
331   __res.as_char[6] = __mu1.as_char[3];
332   __res.as_char[7] = __mu2.as_char[3];
333 
334   return (__m64)__res.as_m64;
335 #endif
336 }
337 
338 extern __inline __m64
339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
340     _m_punpcklbw(__m64 __m1, __m64 __m2) {
341   return _mm_unpacklo_pi8(__m1, __m2);
342 }
343 /* Interleave the two 16-bit values from the low half of M1 with the two
344    16-bit values from the low half of M2.  */
345 extern __inline __m64
346     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347     _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
348   __m64_union __mu1, __mu2, __res;
349 
350   __mu1.as_m64 = __m1;
351   __mu2.as_m64 = __m2;
352 
353   __res.as_short[0] = __mu1.as_short[0];
354   __res.as_short[1] = __mu2.as_short[0];
355   __res.as_short[2] = __mu1.as_short[1];
356   __res.as_short[3] = __mu2.as_short[1];
357 
358   return (__m64)__res.as_m64;
359 }
360 
361 extern __inline __m64
362     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363     _m_punpcklwd(__m64 __m1, __m64 __m2) {
364   return _mm_unpacklo_pi16(__m1, __m2);
365 }
366 
367 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
368    value from the low half of M2.  */
369 extern __inline __m64
370     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371     _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
372   __m64_union __mu1, __mu2, __res;
373 
374   __mu1.as_m64 = __m1;
375   __mu2.as_m64 = __m2;
376 
377   __res.as_int[0] = __mu1.as_int[0];
378   __res.as_int[1] = __mu2.as_int[0];
379 
380   return (__m64)__res.as_m64;
381 }
382 
383 extern __inline __m64
384     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385     _m_punpckldq(__m64 __m1, __m64 __m2) {
386   return _mm_unpacklo_pi32(__m1, __m2);
387 }
388 
389 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
390 extern __inline __m64
391     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392     _mm_add_pi8(__m64 __m1, __m64 __m2) {
393 #if _ARCH_PWR8
394   __vector signed char __a, __b, __c;
395 
396   __a = (__vector signed char)vec_splats(__m1);
397   __b = (__vector signed char)vec_splats(__m2);
398   __c = vec_add(__a, __b);
399   return (__m64)((__vector long long)__c)[0];
400 #else
401   __m64_union __mu1, __mu2, __res;
402 
403   __mu1.as_m64 = __m1;
404   __mu2.as_m64 = __m2;
405 
406   __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
407   __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
408   __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
409   __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
410   __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
411   __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
412   __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
413   __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
414 
415   return (__m64)__res.as_m64;
416 #endif
417 }
418 
419 extern __inline __m64
420     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
421     _m_paddb(__m64 __m1, __m64 __m2) {
422   return _mm_add_pi8(__m1, __m2);
423 }
424 
425 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
426 extern __inline __m64
427     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
428     _mm_add_pi16(__m64 __m1, __m64 __m2) {
429 #if _ARCH_PWR8
430   __vector signed short __a, __b, __c;
431 
432   __a = (__vector signed short)vec_splats(__m1);
433   __b = (__vector signed short)vec_splats(__m2);
434   __c = vec_add(__a, __b);
435   return (__m64)((__vector long long)__c)[0];
436 #else
437   __m64_union __mu1, __mu2, __res;
438 
439   __mu1.as_m64 = __m1;
440   __mu2.as_m64 = __m2;
441 
442   __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
443   __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
444   __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
445   __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
446 
447   return (__m64)__res.as_m64;
448 #endif
449 }
450 
451 extern __inline __m64
452     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
453     _m_paddw(__m64 __m1, __m64 __m2) {
454   return _mm_add_pi16(__m1, __m2);
455 }
456 
457 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
458 extern __inline __m64
459     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
460     _mm_add_pi32(__m64 __m1, __m64 __m2) {
461 #if _ARCH_PWR9
462   __vector signed int __a, __b, __c;
463 
464   __a = (__vector signed int)vec_splats(__m1);
465   __b = (__vector signed int)vec_splats(__m2);
466   __c = vec_add(__a, __b);
467   return (__m64)((__vector long long)__c)[0];
468 #else
469   __m64_union __mu1, __mu2, __res;
470 
471   __mu1.as_m64 = __m1;
472   __mu2.as_m64 = __m2;
473 
474   __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
475   __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
476 
477   return (__m64)__res.as_m64;
478 #endif
479 }
480 
481 extern __inline __m64
482     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483     _m_paddd(__m64 __m1, __m64 __m2) {
484   return _mm_add_pi32(__m1, __m2);
485 }
486 
487 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
488 extern __inline __m64
489     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490     _mm_sub_pi8(__m64 __m1, __m64 __m2) {
491 #if _ARCH_PWR8
492   __vector signed char __a, __b, __c;
493 
494   __a = (__vector signed char)vec_splats(__m1);
495   __b = (__vector signed char)vec_splats(__m2);
496   __c = vec_sub(__a, __b);
497   return (__m64)((__vector long long)__c)[0];
498 #else
499   __m64_union __mu1, __mu2, __res;
500 
501   __mu1.as_m64 = __m1;
502   __mu2.as_m64 = __m2;
503 
504   __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
505   __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
506   __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
507   __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
508   __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
509   __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
510   __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
511   __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
512 
513   return (__m64)__res.as_m64;
514 #endif
515 }
516 
517 extern __inline __m64
518     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519     _m_psubb(__m64 __m1, __m64 __m2) {
520   return _mm_sub_pi8(__m1, __m2);
521 }
522 
523 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
524 extern __inline __m64
525     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526     _mm_sub_pi16(__m64 __m1, __m64 __m2) {
527 #if _ARCH_PWR8
528   __vector signed short __a, __b, __c;
529 
530   __a = (__vector signed short)vec_splats(__m1);
531   __b = (__vector signed short)vec_splats(__m2);
532   __c = vec_sub(__a, __b);
533   return (__m64)((__vector long long)__c)[0];
534 #else
535   __m64_union __mu1, __mu2, __res;
536 
537   __mu1.as_m64 = __m1;
538   __mu2.as_m64 = __m2;
539 
540   __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
541   __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
542   __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
543   __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
544 
545   return (__m64)__res.as_m64;
546 #endif
547 }
548 
549 extern __inline __m64
550     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551     _m_psubw(__m64 __m1, __m64 __m2) {
552   return _mm_sub_pi16(__m1, __m2);
553 }
554 
555 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
556 extern __inline __m64
557     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558     _mm_sub_pi32(__m64 __m1, __m64 __m2) {
559 #if _ARCH_PWR9
560   __vector signed int __a, __b, __c;
561 
562   __a = (__vector signed int)vec_splats(__m1);
563   __b = (__vector signed int)vec_splats(__m2);
564   __c = vec_sub(__a, __b);
565   return (__m64)((__vector long long)__c)[0];
566 #else
567   __m64_union __mu1, __mu2, __res;
568 
569   __mu1.as_m64 = __m1;
570   __mu2.as_m64 = __m2;
571 
572   __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
573   __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
574 
575   return (__m64)__res.as_m64;
576 #endif
577 }
578 
579 extern __inline __m64
580     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581     _m_psubd(__m64 __m1, __m64 __m2) {
582   return _mm_sub_pi32(__m1, __m2);
583 }
584 
585 extern __inline __m64
586     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587     _mm_add_si64(__m64 __m1, __m64 __m2) {
588   return (__m1 + __m2);
589 }
590 
591 extern __inline __m64
592     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593     _mm_sub_si64(__m64 __m1, __m64 __m2) {
594   return (__m1 - __m2);
595 }
596 
597 /* Shift the 64-bit value in M left by COUNT.  */
598 extern __inline __m64
599     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
600     _mm_sll_si64(__m64 __m, __m64 __count) {
601   return (__m << __count);
602 }
603 
604 extern __inline __m64
605     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606     _m_psllq(__m64 __m, __m64 __count) {
607   return _mm_sll_si64(__m, __count);
608 }
609 
610 extern __inline __m64
611     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
612     _mm_slli_si64(__m64 __m, const int __count) {
613   return (__m << __count);
614 }
615 
616 extern __inline __m64
617     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
618     _m_psllqi(__m64 __m, const int __count) {
619   return _mm_slli_si64(__m, __count);
620 }
621 
622 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
623 extern __inline __m64
624     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625     _mm_srl_si64(__m64 __m, __m64 __count) {
626   return (__m >> __count);
627 }
628 
629 extern __inline __m64
630     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631     _m_psrlq(__m64 __m, __m64 __count) {
632   return _mm_srl_si64(__m, __count);
633 }
634 
635 extern __inline __m64
636     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637     _mm_srli_si64(__m64 __m, const int __count) {
638   return (__m >> __count);
639 }
640 
641 extern __inline __m64
642     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643     _m_psrlqi(__m64 __m, const int __count) {
644   return _mm_srli_si64(__m, __count);
645 }
646 
647 /* Bit-wise AND the 64-bit values in M1 and M2.  */
648 extern __inline __m64
649     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
650     _mm_and_si64(__m64 __m1, __m64 __m2) {
651   return (__m1 & __m2);
652 }
653 
654 extern __inline __m64
655     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656     _m_pand(__m64 __m1, __m64 __m2) {
657   return _mm_and_si64(__m1, __m2);
658 }
659 
660 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
661    64-bit value in M2.  */
662 extern __inline __m64
663     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664     _mm_andnot_si64(__m64 __m1, __m64 __m2) {
665   return (~__m1 & __m2);
666 }
667 
668 extern __inline __m64
669     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670     _m_pandn(__m64 __m1, __m64 __m2) {
671   return _mm_andnot_si64(__m1, __m2);
672 }
673 
674 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
675 extern __inline __m64
676     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677     _mm_or_si64(__m64 __m1, __m64 __m2) {
678   return (__m1 | __m2);
679 }
680 
681 extern __inline __m64
682     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
683     _m_por(__m64 __m1, __m64 __m2) {
684   return _mm_or_si64(__m1, __m2);
685 }
686 
687 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
688 extern __inline __m64
689     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
690     _mm_xor_si64(__m64 __m1, __m64 __m2) {
691   return (__m1 ^ __m2);
692 }
693 
694 extern __inline __m64
695     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
696     _m_pxor(__m64 __m1, __m64 __m2) {
697   return _mm_xor_si64(__m1, __m2);
698 }
699 
700 /* Creates a 64-bit zero.  */
701 extern __inline __m64
702     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703     _mm_setzero_si64(void) {
704   return (__m64)0;
705 }
706 
707 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
708    test is true and zero if false.  */
709 extern __inline __m64
710     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711     _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
712 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
713   __m64 __res;
714   __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
715   return (__res);
716 #else
717   __m64_union __mu1, __mu2, __res;
718 
719   __mu1.as_m64 = __m1;
720   __mu2.as_m64 = __m2;
721 
722   __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
723   __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
724   __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
725   __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
726   __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
727   __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
728   __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
729   __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;
730 
731   return (__m64)__res.as_m64;
732 #endif
733 }
734 
735 extern __inline __m64
736     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737     _m_pcmpeqb(__m64 __m1, __m64 __m2) {
738   return _mm_cmpeq_pi8(__m1, __m2);
739 }
740 
741 extern __inline __m64
742     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743     _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
744 #if _ARCH_PWR8
745   __vector signed char __a, __b, __c;
746 
747   __a = (__vector signed char)vec_splats(__m1);
748   __b = (__vector signed char)vec_splats(__m2);
749   __c = (__vector signed char)vec_cmpgt(__a, __b);
750   return (__m64)((__vector long long)__c)[0];
751 #else
752   __m64_union __mu1, __mu2, __res;
753 
754   __mu1.as_m64 = __m1;
755   __mu2.as_m64 = __m2;
756 
757   __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
758   __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
759   __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
760   __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
761   __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
762   __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
763   __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
764   __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;
765 
766   return (__m64)__res.as_m64;
767 #endif
768 }
769 
770 extern __inline __m64
771     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772     _m_pcmpgtb(__m64 __m1, __m64 __m2) {
773   return _mm_cmpgt_pi8(__m1, __m2);
774 }
775 
776 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
777    the test is true and zero if false.  */
778 extern __inline __m64
779     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
780     _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
781 #if _ARCH_PWR8
782   __vector signed short __a, __b, __c;
783 
784   __a = (__vector signed short)vec_splats(__m1);
785   __b = (__vector signed short)vec_splats(__m2);
786   __c = (__vector signed short)vec_cmpeq(__a, __b);
787   return (__m64)((__vector long long)__c)[0];
788 #else
789   __m64_union __mu1, __mu2, __res;
790 
791   __mu1.as_m64 = __m1;
792   __mu2.as_m64 = __m2;
793 
794   __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
795   __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
796   __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
797   __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;
798 
799   return (__m64)__res.as_m64;
800 #endif
801 }
802 
803 extern __inline __m64
804     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
805     _m_pcmpeqw(__m64 __m1, __m64 __m2) {
806   return _mm_cmpeq_pi16(__m1, __m2);
807 }
808 
809 extern __inline __m64
810     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811     _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
812 #if _ARCH_PWR8
813   __vector signed short __a, __b, __c;
814 
815   __a = (__vector signed short)vec_splats(__m1);
816   __b = (__vector signed short)vec_splats(__m2);
817   __c = (__vector signed short)vec_cmpgt(__a, __b);
818   return (__m64)((__vector long long)__c)[0];
819 #else
820   __m64_union __mu1, __mu2, __res;
821 
822   __mu1.as_m64 = __m1;
823   __mu2.as_m64 = __m2;
824 
825   __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
826   __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
827   __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
828   __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;
829 
830   return (__m64)__res.as_m64;
831 #endif
832 }
833 
834 extern __inline __m64
835     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836     _m_pcmpgtw(__m64 __m1, __m64 __m2) {
837   return _mm_cmpgt_pi16(__m1, __m2);
838 }
839 
840 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
841    the test is true and zero if false.  */
842 extern __inline __m64
843     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844     _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
845 #if _ARCH_PWR9
846   __vector signed int __a, __b, __c;
847 
848   __a = (__vector signed int)vec_splats(__m1);
849   __b = (__vector signed int)vec_splats(__m2);
850   __c = (__vector signed int)vec_cmpeq(__a, __b);
851   return (__m64)((__vector long long)__c)[0];
852 #else
853   __m64_union __mu1, __mu2, __res;
854 
855   __mu1.as_m64 = __m1;
856   __mu2.as_m64 = __m2;
857 
858   __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
859   __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;
860 
861   return (__m64)__res.as_m64;
862 #endif
863 }
864 
865 extern __inline __m64
866     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867     _m_pcmpeqd(__m64 __m1, __m64 __m2) {
868   return _mm_cmpeq_pi32(__m1, __m2);
869 }
870 
871 extern __inline __m64
872     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873     _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
874 #if _ARCH_PWR9
875   __vector signed int __a, __b, __c;
876 
877   __a = (__vector signed int)vec_splats(__m1);
878   __b = (__vector signed int)vec_splats(__m2);
879   __c = (__vector signed int)vec_cmpgt(__a, __b);
880   return (__m64)((__vector long long)__c)[0];
881 #else
882   __m64_union __mu1, __mu2, __res;
883 
884   __mu1.as_m64 = __m1;
885   __mu2.as_m64 = __m2;
886 
887   __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
888   __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;
889 
890   return (__m64)__res.as_m64;
891 #endif
892 }
893 
894 extern __inline __m64
895     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
896     _m_pcmpgtd(__m64 __m1, __m64 __m2) {
897   return _mm_cmpgt_pi32(__m1, __m2);
898 }
899 
900 #if _ARCH_PWR8
901 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
902    saturated arithmetic.  */
903 extern __inline __m64
904     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
905     _mm_adds_pi8(__m64 __m1, __m64 __m2) {
906   __vector signed char __a, __b, __c;
907 
908   __a = (__vector signed char)vec_splats(__m1);
909   __b = (__vector signed char)vec_splats(__m2);
910   __c = vec_adds(__a, __b);
911   return (__m64)((__vector long long)__c)[0];
912 }
913 
914 extern __inline __m64
915     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916     _m_paddsb(__m64 __m1, __m64 __m2) {
917   return _mm_adds_pi8(__m1, __m2);
918 }
919 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
920    saturated arithmetic.  */
921 extern __inline __m64
922     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
923     _mm_adds_pi16(__m64 __m1, __m64 __m2) {
924   __vector signed short __a, __b, __c;
925 
926   __a = (__vector signed short)vec_splats(__m1);
927   __b = (__vector signed short)vec_splats(__m2);
928   __c = vec_adds(__a, __b);
929   return (__m64)((__vector long long)__c)[0];
930 }
931 
932 extern __inline __m64
933     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
934     _m_paddsw(__m64 __m1, __m64 __m2) {
935   return _mm_adds_pi16(__m1, __m2);
936 }
937 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
938    saturated arithmetic.  */
939 extern __inline __m64
940     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941     _mm_adds_pu8(__m64 __m1, __m64 __m2) {
942   __vector unsigned char __a, __b, __c;
943 
944   __a = (__vector unsigned char)vec_splats(__m1);
945   __b = (__vector unsigned char)vec_splats(__m2);
946   __c = vec_adds(__a, __b);
947   return (__m64)((__vector long long)__c)[0];
948 }
949 
950 extern __inline __m64
951     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952     _m_paddusb(__m64 __m1, __m64 __m2) {
953   return _mm_adds_pu8(__m1, __m2);
954 }
955 
956 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
957    saturated arithmetic.  */
958 extern __inline __m64
959     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960     _mm_adds_pu16(__m64 __m1, __m64 __m2) {
961   __vector unsigned short __a, __b, __c;
962 
963   __a = (__vector unsigned short)vec_splats(__m1);
964   __b = (__vector unsigned short)vec_splats(__m2);
965   __c = vec_adds(__a, __b);
966   return (__m64)((__vector long long)__c)[0];
967 }
968 
969 extern __inline __m64
970     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971     _m_paddusw(__m64 __m1, __m64 __m2) {
972   return _mm_adds_pu16(__m1, __m2);
973 }
974 
975 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
976    saturating arithmetic.  */
977 extern __inline __m64
978     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
979     _mm_subs_pi8(__m64 __m1, __m64 __m2) {
980   __vector signed char __a, __b, __c;
981 
982   __a = (__vector signed char)vec_splats(__m1);
983   __b = (__vector signed char)vec_splats(__m2);
984   __c = vec_subs(__a, __b);
985   return (__m64)((__vector long long)__c)[0];
986 }
987 
988 extern __inline __m64
989     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990     _m_psubsb(__m64 __m1, __m64 __m2) {
991   return _mm_subs_pi8(__m1, __m2);
992 }
993 
994 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
995    signed saturating arithmetic.  */
996 extern __inline __m64
997     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998     _mm_subs_pi16(__m64 __m1, __m64 __m2) {
999   __vector signed short __a, __b, __c;
1000 
1001   __a = (__vector signed short)vec_splats(__m1);
1002   __b = (__vector signed short)vec_splats(__m2);
1003   __c = vec_subs(__a, __b);
1004   return (__m64)((__vector long long)__c)[0];
1005 }
1006 
1007 extern __inline __m64
1008     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1009     _m_psubsw(__m64 __m1, __m64 __m2) {
1010   return _mm_subs_pi16(__m1, __m2);
1011 }
1012 
1013 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1014    unsigned saturating arithmetic.  */
1015 extern __inline __m64
1016     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017     _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1018   __vector unsigned char __a, __b, __c;
1019 
1020   __a = (__vector unsigned char)vec_splats(__m1);
1021   __b = (__vector unsigned char)vec_splats(__m2);
1022   __c = vec_subs(__a, __b);
1023   return (__m64)((__vector long long)__c)[0];
1024 }
1025 
1026 extern __inline __m64
1027     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028     _m_psubusb(__m64 __m1, __m64 __m2) {
1029   return _mm_subs_pu8(__m1, __m2);
1030 }
1031 
1032 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1033    unsigned saturating arithmetic.  */
1034 extern __inline __m64
1035     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036     _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1037   __vector unsigned short __a, __b, __c;
1038 
1039   __a = (__vector unsigned short)vec_splats(__m1);
1040   __b = (__vector unsigned short)vec_splats(__m2);
1041   __c = vec_subs(__a, __b);
1042   return (__m64)((__vector long long)__c)[0];
1043 }
1044 
1045 extern __inline __m64
1046     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047     _m_psubusw(__m64 __m1, __m64 __m2) {
1048   return _mm_subs_pu16(__m1, __m2);
1049 }
1050 
1051 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1052    four 32-bit intermediate results, which are then summed by pairs to
1053    produce two 32-bit results.  */
1054 extern __inline __m64
1055     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056     _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1057   __vector signed short __a, __b;
1058   __vector signed int __c;
1059   __vector signed int __zero = {0, 0, 0, 0};
1060 
1061   __a = (__vector signed short)vec_splats(__m1);
1062   __b = (__vector signed short)vec_splats(__m2);
1063   __c = vec_vmsumshm(__a, __b, __zero);
1064   return (__m64)((__vector long long)__c)[0];
1065 }
1066 
1067 extern __inline __m64
1068     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1069     _m_pmaddwd(__m64 __m1, __m64 __m2) {
1070   return _mm_madd_pi16(__m1, __m2);
1071 }
1072 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1073    M2 and produce the high 16 bits of the 32-bit results.  */
1074 extern __inline __m64
1075     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076     _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1077   __vector signed short __a, __b;
1078   __vector signed short __c;
1079   __vector signed int __w0, __w1;
1080   __vector unsigned char __xform1 = {
1081 #ifdef __LITTLE_ENDIAN__
1082       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1083       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1084 #else
1085       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1086       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1087 #endif
1088   };
1089 
1090   __a = (__vector signed short)vec_splats(__m1);
1091   __b = (__vector signed short)vec_splats(__m2);
1092 
1093   __w0 = vec_vmulesh(__a, __b);
1094   __w1 = vec_vmulosh(__a, __b);
1095   __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);
1096 
1097   return (__m64)((__vector long long)__c)[0];
1098 }
1099 
1100 extern __inline __m64
1101     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102     _m_pmulhw(__m64 __m1, __m64 __m2) {
1103   return _mm_mulhi_pi16(__m1, __m2);
1104 }
1105 
1106 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1107    the low 16 bits of the results.  */
1108 extern __inline __m64
1109     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110     _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1111   __vector signed short __a, __b, __c;
1112 
1113   __a = (__vector signed short)vec_splats(__m1);
1114   __b = (__vector signed short)vec_splats(__m2);
1115   __c = __a * __b;
1116   return (__m64)((__vector long long)__c)[0];
1117 }
1118 
1119 extern __inline __m64
1120     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1121     _m_pmullw(__m64 __m1, __m64 __m2) {
1122   return _mm_mullo_pi16(__m1, __m2);
1123 }
1124 
1125 /* Shift four 16-bit values in M left by COUNT.  */
1126 extern __inline __m64
1127     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128     _mm_sll_pi16(__m64 __m, __m64 __count) {
1129   __vector signed short __r;
1130   __vector unsigned short __c;
1131 
1132   if (__count <= 15) {
1133     __r = (__vector signed short)vec_splats(__m);
1134     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1135     __r = vec_sl(__r, (__vector unsigned short)__c);
1136     return (__m64)((__vector long long)__r)[0];
1137   } else
1138     return (0);
1139 }
1140 
1141 extern __inline __m64
1142     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1143     _m_psllw(__m64 __m, __m64 __count) {
1144   return _mm_sll_pi16(__m, __count);
1145 }
1146 
1147 extern __inline __m64
1148     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1149     _mm_slli_pi16(__m64 __m, int __count) {
1150   /* Promote int to long then invoke mm_sll_pi16.  */
1151   return _mm_sll_pi16(__m, __count);
1152 }
1153 
1154 extern __inline __m64
1155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156     _m_psllwi(__m64 __m, int __count) {
1157   return _mm_slli_pi16(__m, __count);
1158 }
1159 
1160 /* Shift two 32-bit values in M left by COUNT.  */
1161 extern __inline __m64
1162     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1163     _mm_sll_pi32(__m64 __m, __m64 __count) {
1164   __m64_union __res;
1165 
1166   __res.as_m64 = __m;
1167 
1168   __res.as_int[0] = __res.as_int[0] << __count;
1169   __res.as_int[1] = __res.as_int[1] << __count;
1170   return (__res.as_m64);
1171 }
1172 
1173 extern __inline __m64
1174     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175     _m_pslld(__m64 __m, __m64 __count) {
1176   return _mm_sll_pi32(__m, __count);
1177 }
1178 
1179 extern __inline __m64
1180     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181     _mm_slli_pi32(__m64 __m, int __count) {
1182   /* Promote int to long then invoke mm_sll_pi32.  */
1183   return _mm_sll_pi32(__m, __count);
1184 }
1185 
1186 extern __inline __m64
1187     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188     _m_pslldi(__m64 __m, int __count) {
1189   return _mm_slli_pi32(__m, __count);
1190 }
1191 
1192 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1193 extern __inline __m64
1194     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195     _mm_sra_pi16(__m64 __m, __m64 __count) {
1196   __vector signed short __r;
1197   __vector unsigned short __c;
1198 
1199   if (__count <= 15) {
1200     __r = (__vector signed short)vec_splats(__m);
1201     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1202     __r = vec_sra(__r, (__vector unsigned short)__c);
1203     return (__m64)((__vector long long)__r)[0];
1204   } else
1205     return (0);
1206 }
1207 
1208 extern __inline __m64
1209     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1210     _m_psraw(__m64 __m, __m64 __count) {
1211   return _mm_sra_pi16(__m, __count);
1212 }
1213 
1214 extern __inline __m64
1215     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1216     _mm_srai_pi16(__m64 __m, int __count) {
1217   /* Promote int to long then invoke mm_sra_pi32.  */
1218   return _mm_sra_pi16(__m, __count);
1219 }
1220 
1221 extern __inline __m64
1222     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1223     _m_psrawi(__m64 __m, int __count) {
1224   return _mm_srai_pi16(__m, __count);
1225 }
1226 
1227 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1228 extern __inline __m64
1229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1230     _mm_sra_pi32(__m64 __m, __m64 __count) {
1231   __m64_union __res;
1232 
1233   __res.as_m64 = __m;
1234 
1235   __res.as_int[0] = __res.as_int[0] >> __count;
1236   __res.as_int[1] = __res.as_int[1] >> __count;
1237   return (__res.as_m64);
1238 }
1239 
1240 extern __inline __m64
1241     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242     _m_psrad(__m64 __m, __m64 __count) {
1243   return _mm_sra_pi32(__m, __count);
1244 }
1245 
1246 extern __inline __m64
1247     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1248     _mm_srai_pi32(__m64 __m, int __count) {
1249   /* Promote int to long then invoke mm_sra_pi32.  */
1250   return _mm_sra_pi32(__m, __count);
1251 }
1252 
1253 extern __inline __m64
1254     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255     _m_psradi(__m64 __m, int __count) {
1256   return _mm_srai_pi32(__m, __count);
1257 }
1258 
1259 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1260 extern __inline __m64
1261     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262     _mm_srl_pi16(__m64 __m, __m64 __count) {
1263   __vector unsigned short __r;
1264   __vector unsigned short __c;
1265 
1266   if (__count <= 15) {
1267     __r = (__vector unsigned short)vec_splats(__m);
1268     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1269     __r = vec_sr(__r, (__vector unsigned short)__c);
1270     return (__m64)((__vector long long)__r)[0];
1271   } else
1272     return (0);
1273 }
1274 
1275 extern __inline __m64
1276     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1277     _m_psrlw(__m64 __m, __m64 __count) {
1278   return _mm_srl_pi16(__m, __count);
1279 }
1280 
1281 extern __inline __m64
1282     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1283     _mm_srli_pi16(__m64 __m, int __count) {
1284   /* Promote int to long then invoke mm_sra_pi32.  */
1285   return _mm_srl_pi16(__m, __count);
1286 }
1287 
1288 extern __inline __m64
1289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290     _m_psrlwi(__m64 __m, int __count) {
1291   return _mm_srli_pi16(__m, __count);
1292 }
1293 
1294 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1295 extern __inline __m64
1296     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297     _mm_srl_pi32(__m64 __m, __m64 __count) {
1298   __m64_union __res;
1299 
1300   __res.as_m64 = __m;
1301 
1302   __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
1303   __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
1304   return (__res.as_m64);
1305 }
1306 
1307 extern __inline __m64
1308     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309     _m_psrld(__m64 __m, __m64 __count) {
1310   return _mm_srl_pi32(__m, __count);
1311 }
1312 
1313 extern __inline __m64
1314     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315     _mm_srli_pi32(__m64 __m, int __count) {
1316   /* Promote int to long then invoke mm_srl_pi32.  */
1317   return _mm_srl_pi32(__m, __count);
1318 }
1319 
1320 extern __inline __m64
1321     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322     _m_psrldi(__m64 __m, int __count) {
1323   return _mm_srli_pi32(__m, __count);
1324 }
1325 #endif /* _ARCH_PWR8 */
1326 
1327 /* Creates a vector of two 32-bit values; I0 is least significant.  */
1328 extern __inline __m64
1329     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330     _mm_set_pi32(int __i1, int __i0) {
1331   __m64_union __res;
1332 
1333   __res.as_int[0] = __i0;
1334   __res.as_int[1] = __i1;
1335   return (__res.as_m64);
1336 }
1337 
1338 /* Creates a vector of four 16-bit values; W0 is least significant.  */
1339 extern __inline __m64
1340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341     _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1342   __m64_union __res;
1343 
1344   __res.as_short[0] = __w0;
1345   __res.as_short[1] = __w1;
1346   __res.as_short[2] = __w2;
1347   __res.as_short[3] = __w3;
1348   return (__res.as_m64);
1349 }
1350 
1351 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
1352 extern __inline __m64
1353     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1354     _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1355                 char __b2, char __b1, char __b0) {
1356   __m64_union __res;
1357 
1358   __res.as_char[0] = __b0;
1359   __res.as_char[1] = __b1;
1360   __res.as_char[2] = __b2;
1361   __res.as_char[3] = __b3;
1362   __res.as_char[4] = __b4;
1363   __res.as_char[5] = __b5;
1364   __res.as_char[6] = __b6;
1365   __res.as_char[7] = __b7;
1366   return (__res.as_m64);
1367 }
1368 
1369 /* Similar, but with the arguments in reverse order.  */
1370 extern __inline __m64
1371     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372     _mm_setr_pi32(int __i0, int __i1) {
1373   __m64_union __res;
1374 
1375   __res.as_int[0] = __i0;
1376   __res.as_int[1] = __i1;
1377   return (__res.as_m64);
1378 }
1379 
1380 extern __inline __m64
1381     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382     _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1383   return _mm_set_pi16(__w3, __w2, __w1, __w0);
1384 }
1385 
1386 extern __inline __m64
1387     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388     _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1389                  char __b5, char __b6, char __b7) {
1390   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1391 }
1392 
1393 /* Creates a vector of two 32-bit values, both elements containing I.  */
1394 extern __inline __m64
1395     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1396     _mm_set1_pi32(int __i) {
1397   __m64_union __res;
1398 
1399   __res.as_int[0] = __i;
1400   __res.as_int[1] = __i;
1401   return (__res.as_m64);
1402 }
1403 
1404 /* Creates a vector of four 16-bit values, all elements containing W.  */
1405 extern __inline __m64
1406     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407     _mm_set1_pi16(short __w) {
1408 #if _ARCH_PWR9
1409   __vector signed short w;
1410 
1411   w = (__vector signed short)vec_splats(__w);
1412   return (__m64)((__vector long long)w)[0];
1413 #else
1414   __m64_union __res;
1415 
1416   __res.as_short[0] = __w;
1417   __res.as_short[1] = __w;
1418   __res.as_short[2] = __w;
1419   __res.as_short[3] = __w;
1420   return (__res.as_m64);
1421 #endif
1422 }
1423 
1424 /* Creates a vector of eight 8-bit values, all elements containing B.  */
1425 extern __inline __m64
1426     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427     _mm_set1_pi8(signed char __b) {
1428 #if _ARCH_PWR8
1429   __vector signed char __res;
1430 
1431   __res = (__vector signed char)vec_splats(__b);
1432   return (__m64)((__vector long long)__res)[0];
1433 #else
1434   __m64_union __res;
1435 
1436   __res.as_char[0] = __b;
1437   __res.as_char[1] = __b;
1438   __res.as_char[2] = __b;
1439   __res.as_char[3] = __b;
1440   __res.as_char[4] = __b;
1441   __res.as_char[5] = __b;
1442   __res.as_char[6] = __b;
1443   __res.as_char[7] = __b;
1444   return (__res.as_m64);
1445 #endif
1446 }
1447 
1448 #else
1449 #include_next <mmintrin.h>
1450 #endif /* defined(__ppc64__) &&
1451         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
1452 
1453 #endif /* _MMINTRIN_H_INCLUDED */
1454