1 /*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
/* This header file is intended to help port code that uses Intel intrinsics
   from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target does not support a native 64-bit vector type, we
   typedef __m64 to a 64-bit unsigned long long in the MMX intrinsics, which
   works well for the _si64 and some of the _pi32 operations.

   For the _pi16 and _pi8 operations, it is better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register-move
   instructions, which help make such implementations more efficient.

   It is the user's responsibility to determine whether the results of such a
   port are acceptable or whether further changes are needed. Please note that
   much code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions, using 64-bit scalar operations or
   128-bit SSE/Altivec operations, which is the recommended approach (a sketch
   of such a rewrite follows this warning block). */
31 #error                                                                         \
32     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
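
/* An illustrative sketch of the portable rewrite recommended above (not part
   of the Intel API; the typedef and function names below are made up for
   illustration): a GNU C vector extension lets the compiler pick the best
   instructions for any target, including VSX/Altivec on PowerPC.

     // Hypothetical portable replacement for an _mm_add_pi8-style operation.
     typedef signed char __v8qi_example __attribute__((__vector_size__(8)));

     static inline __v8qi_example
     add_bytes_portable(__v8qi_example __a, __v8qi_example __b) {
       return __a + __b;   // element-wise add, no MMX intrinsics needed
     }
*/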
34 
35 #ifndef _MMINTRIN_H_INCLUDED
36 #define _MMINTRIN_H_INCLUDED
37 
38 #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
39 
40 #include <altivec.h>
41 /* The Intel API is flexible enough that we must allow aliasing with other
42    vector types, and their scalar components.  */
43 typedef __attribute__((__aligned__(8))) unsigned long long __m64;
44 
45 typedef __attribute__((__aligned__(8))) union {
46   __m64 as_m64;
47   char as_char[8];
48   signed char as_signed_char[8];
49   short as_short[4];
50   int as_int[2];
51   long long as_long_long;
52   float as_float[2];
53   double as_double;
54 } __m64_union;
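
/* Example (an illustrative sketch; the literal below is arbitrary): this
   union is the scalar fallback used throughout this header to view a single
   __m64 as byte, halfword or word lanes.  Lane numbering follows memory
   order, so the mapping is endian-dependent.

     __m64_union __u;
     __u.as_m64 = 0x0807060504030201ULL;
     // On powerpc64le, __u.as_char[0] == 0x01 and __u.as_short[3] == 0x0807.
*/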
55 
56 /* Empty the multimedia state.  */
57 extern __inline void
58     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
59     _mm_empty(void) {
60   /* nothing to do on PowerPC.  */
61 }
62 
63 extern __inline void
64     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
65     _m_empty(void) {
66   /* nothing to do on PowerPC.  */
67 }
68 
/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
70 extern __inline __m64
71     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72     _mm_cvtsi32_si64(int __i) {
73   return (__m64)(unsigned int)__i;
74 }
75 
76 extern __inline __m64
77     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78     _m_from_int(int __i) {
79   return _mm_cvtsi32_si64(__i);
80 }
81 
82 /* Convert the lower 32 bits of the __m64 object into an integer.  */
83 extern __inline int
84     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85     _mm_cvtsi64_si32(__m64 __i) {
86   return ((int)__i);
87 }
88 
89 extern __inline int
90     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91     _m_to_int(__m64 __i) {
92   return _mm_cvtsi64_si32(__i);
93 }
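
/* Example (illustrative values; variable names are arbitrary):
   _mm_cvtsi32_si64 zero-extends, so the upper 32 bits of the result are
   always clear, and _mm_cvtsi64_si32 recovers the original value.

     __m64 __v = _mm_cvtsi32_si64(-1);   // __v == 0x00000000FFFFFFFFULL
     int __r = _mm_cvtsi64_si32(__v);    // __r == -1
*/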
94 
95 /* Convert I to a __m64 object.  */
96 
97 /* Intel intrinsic.  */
98 extern __inline __m64
99     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100     _m_from_int64(long long __i) {
101   return (__m64)__i;
102 }
103 
104 extern __inline __m64
105     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106     _mm_cvtsi64_m64(long long __i) {
107   return (__m64)__i;
108 }
109 
110 /* Microsoft intrinsic.  */
111 extern __inline __m64
112     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113     _mm_cvtsi64x_si64(long long __i) {
114   return (__m64)__i;
115 }
116 
117 extern __inline __m64
118     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119     _mm_set_pi64x(long long __i) {
120   return (__m64)__i;
121 }
122 
/* Convert the __m64 object to a 64-bit integer.  */
124 
125 /* Intel intrinsic.  */
126 extern __inline long long
127     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128     _m_to_int64(__m64 __i) {
129   return (long long)__i;
130 }
131 
132 extern __inline long long
133     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134     _mm_cvtm64_si64(__m64 __i) {
135   return (long long)__i;
136 }
137 
138 /* Microsoft intrinsic.  */
139 extern __inline long long
140     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141     _mm_cvtsi64_si64x(__m64 __i) {
142   return (long long)__i;
143 }
144 
145 #ifdef _ARCH_PWR8
146 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
147    the result, and the four 16-bit values from M2 into the upper four 8-bit
148    values of the result, all with signed saturation.  */
149 extern __inline __m64
150     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151     _mm_packs_pi16(__m64 __m1, __m64 __m2) {
152   __vector signed short __vm1;
153   __vector signed char __vresult;
154 
155   __vm1 = (__vector signed short)(__vector unsigned long long)
156 #ifdef __LITTLE_ENDIAN__
157       {__m1, __m2};
158 #else
159       {__m2, __m1};
160 #endif
161   __vresult = vec_packs(__vm1, __vm1);
162   return (__m64)((__vector long long)__vresult)[0];
163 }
164 
165 extern __inline __m64
166     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167     _m_packsswb(__m64 __m1, __m64 __m2) {
168   return _mm_packs_pi16(__m1, __m2);
169 }
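
/* Example (illustrative values; variable names are arbitrary): out-of-range
   halfwords saturate to the signed 8-bit limits, in-range halfwords pass
   through, and __m2 supplies the upper four result bytes.

     __m64 __a = _mm_set_pi16(1000, -1000, 42, -42);
     __m64 __r = _mm_packs_pi16(__a, _mm_setzero_si64());
     // __r == _mm_set_pi8(0, 0, 0, 0, 127, -128, 42, -42)
*/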
170 
/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
174 extern __inline __m64
175     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176     _mm_packs_pi32(__m64 __m1, __m64 __m2) {
177   __vector signed int __vm1;
178   __vector signed short __vresult;
179 
180   __vm1 = (__vector signed int)(__vector unsigned long long)
181 #ifdef __LITTLE_ENDIAN__
182       {__m1, __m2};
183 #else
184       {__m2, __m1};
185 #endif
186   __vresult = vec_packs(__vm1, __vm1);
187   return (__m64)((__vector long long)__vresult)[0];
188 }
189 
190 extern __inline __m64
191     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
192     _m_packssdw(__m64 __m1, __m64 __m2) {
193   return _mm_packs_pi32(__m1, __m2);
194 }
195 
196 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
197    the result, and the four 16-bit values from M2 into the upper four 8-bit
198    values of the result, all with unsigned saturation.  */
199 extern __inline __m64
200     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201     _mm_packs_pu16(__m64 __m1, __m64 __m2) {
202   __vector unsigned char __r;
203   __vector signed short __vm1 = (__vector signed short)(__vector long long)
204 #ifdef __LITTLE_ENDIAN__
205       {__m1, __m2};
206 #else
207       {__m2, __m1};
208 #endif
209   const __vector signed short __zero = {0};
210   __vector __bool short __select = vec_cmplt(__vm1, __zero);
211   __r =
212       vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
213   __vector __bool char __packsel = vec_pack(__select, __select);
214   __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
215   return (__m64)((__vector long long)__r)[0];
216 }
217 
218 extern __inline __m64
219     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
220     _m_packuswb(__m64 __m1, __m64 __m2) {
221   return _mm_packs_pu16(__m1, __m2);
222 }
223 #endif /* end ARCH_PWR8 */
224 
225 /* Interleave the four 8-bit values from the high half of M1 with the four
226    8-bit values from the high half of M2.  */
227 extern __inline __m64
228     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229     _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
230 #if _ARCH_PWR8
231   __vector unsigned char __a, __b, __c;
232 
233   __a = (__vector unsigned char)vec_splats(__m1);
234   __b = (__vector unsigned char)vec_splats(__m2);
235   __c = vec_mergel(__a, __b);
236   return (__m64)((__vector long long)__c)[1];
237 #else
238   __m64_union __mu1, __mu2, __res;
239 
240   __mu1.as_m64 = __m1;
241   __mu2.as_m64 = __m2;
242 
243   __res.as_char[0] = __mu1.as_char[4];
244   __res.as_char[1] = __mu2.as_char[4];
245   __res.as_char[2] = __mu1.as_char[5];
246   __res.as_char[3] = __mu2.as_char[5];
247   __res.as_char[4] = __mu1.as_char[6];
248   __res.as_char[5] = __mu2.as_char[6];
249   __res.as_char[6] = __mu1.as_char[7];
250   __res.as_char[7] = __mu2.as_char[7];
251 
252   return (__m64)__res.as_m64;
253 #endif
254 }
255 
256 extern __inline __m64
257     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258     _m_punpckhbw(__m64 __m1, __m64 __m2) {
259   return _mm_unpackhi_pi8(__m1, __m2);
260 }
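
/* Example (illustrative values; variable names are arbitrary): only the
   upper four bytes of each operand contribute, and the interleave starts
   with __m1.

     __m64 __a = _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10);
     __m64 __b = _mm_set_pi8(27, 26, 25, 24, 23, 22, 21, 20);
     __m64 __r = _mm_unpackhi_pi8(__a, __b);
     // __r == _mm_set_pi8(27, 17, 26, 16, 25, 15, 24, 14)
*/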
261 
262 /* Interleave the two 16-bit values from the high half of M1 with the two
263    16-bit values from the high half of M2.  */
264 extern __inline __m64
265     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
266     _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
267   __m64_union __mu1, __mu2, __res;
268 
269   __mu1.as_m64 = __m1;
270   __mu2.as_m64 = __m2;
271 
272   __res.as_short[0] = __mu1.as_short[2];
273   __res.as_short[1] = __mu2.as_short[2];
274   __res.as_short[2] = __mu1.as_short[3];
275   __res.as_short[3] = __mu2.as_short[3];
276 
277   return (__m64)__res.as_m64;
278 }
279 
280 extern __inline __m64
281     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282     _m_punpckhwd(__m64 __m1, __m64 __m2) {
283   return _mm_unpackhi_pi16(__m1, __m2);
284 }
285 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
286    value from the high half of M2.  */
287 extern __inline __m64
288     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289     _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
290   __m64_union __mu1, __mu2, __res;
291 
292   __mu1.as_m64 = __m1;
293   __mu2.as_m64 = __m2;
294 
295   __res.as_int[0] = __mu1.as_int[1];
296   __res.as_int[1] = __mu2.as_int[1];
297 
298   return (__m64)__res.as_m64;
299 }
300 
301 extern __inline __m64
302     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303     _m_punpckhdq(__m64 __m1, __m64 __m2) {
304   return _mm_unpackhi_pi32(__m1, __m2);
305 }
306 /* Interleave the four 8-bit values from the low half of M1 with the four
307    8-bit values from the low half of M2.  */
308 extern __inline __m64
309     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310     _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
311 #if _ARCH_PWR8
312   __vector unsigned char __a, __b, __c;
313 
314   __a = (__vector unsigned char)vec_splats(__m1);
315   __b = (__vector unsigned char)vec_splats(__m2);
316   __c = vec_mergel(__a, __b);
317   return (__m64)((__vector long long)__c)[0];
318 #else
319   __m64_union __mu1, __mu2, __res;
320 
321   __mu1.as_m64 = __m1;
322   __mu2.as_m64 = __m2;
323 
324   __res.as_char[0] = __mu1.as_char[0];
325   __res.as_char[1] = __mu2.as_char[0];
326   __res.as_char[2] = __mu1.as_char[1];
327   __res.as_char[3] = __mu2.as_char[1];
328   __res.as_char[4] = __mu1.as_char[2];
329   __res.as_char[5] = __mu2.as_char[2];
330   __res.as_char[6] = __mu1.as_char[3];
331   __res.as_char[7] = __mu2.as_char[3];
332 
333   return (__m64)__res.as_m64;
334 #endif
335 }
336 
337 extern __inline __m64
338     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
339     _m_punpcklbw(__m64 __m1, __m64 __m2) {
340   return _mm_unpacklo_pi8(__m1, __m2);
341 }
342 /* Interleave the two 16-bit values from the low half of M1 with the two
343    16-bit values from the low half of M2.  */
344 extern __inline __m64
345     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
346     _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
347   __m64_union __mu1, __mu2, __res;
348 
349   __mu1.as_m64 = __m1;
350   __mu2.as_m64 = __m2;
351 
352   __res.as_short[0] = __mu1.as_short[0];
353   __res.as_short[1] = __mu2.as_short[0];
354   __res.as_short[2] = __mu1.as_short[1];
355   __res.as_short[3] = __mu2.as_short[1];
356 
357   return (__m64)__res.as_m64;
358 }
359 
360 extern __inline __m64
361     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362     _m_punpcklwd(__m64 __m1, __m64 __m2) {
363   return _mm_unpacklo_pi16(__m1, __m2);
364 }
365 
366 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
367    value from the low half of M2.  */
368 extern __inline __m64
369     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
370     _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
371   __m64_union __mu1, __mu2, __res;
372 
373   __mu1.as_m64 = __m1;
374   __mu2.as_m64 = __m2;
375 
376   __res.as_int[0] = __mu1.as_int[0];
377   __res.as_int[1] = __mu2.as_int[0];
378 
379   return (__m64)__res.as_m64;
380 }
381 
382 extern __inline __m64
383     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
384     _m_punpckldq(__m64 __m1, __m64 __m2) {
385   return _mm_unpacklo_pi32(__m1, __m2);
386 }
387 
388 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
389 extern __inline __m64
390     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
391     _mm_add_pi8(__m64 __m1, __m64 __m2) {
392 #if _ARCH_PWR8
393   __vector signed char __a, __b, __c;
394 
395   __a = (__vector signed char)vec_splats(__m1);
396   __b = (__vector signed char)vec_splats(__m2);
397   __c = vec_add(__a, __b);
398   return (__m64)((__vector long long)__c)[0];
399 #else
400   __m64_union __mu1, __mu2, __res;
401 
402   __mu1.as_m64 = __m1;
403   __mu2.as_m64 = __m2;
404 
405   __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
406   __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
407   __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
408   __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
409   __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
410   __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
411   __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
412   __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
413 
414   return (__m64)__res.as_m64;
415 #endif
416 }
417 
418 extern __inline __m64
419     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420     _m_paddb(__m64 __m1, __m64 __m2) {
421   return _mm_add_pi8(__m1, __m2);
422 }
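
/* Example (illustrative values; variable names are arbitrary): lanes wrap
   around modulo 256; compare with _mm_adds_pi8 further below, which
   saturates instead.

     __m64 __r = _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(1));
     // Every byte of __r is -128 (0x80): 127 + 1 wraps, it does not saturate.
*/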
423 
424 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
425 extern __inline __m64
426     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
427     _mm_add_pi16(__m64 __m1, __m64 __m2) {
428 #if _ARCH_PWR8
429   __vector signed short __a, __b, __c;
430 
431   __a = (__vector signed short)vec_splats(__m1);
432   __b = (__vector signed short)vec_splats(__m2);
433   __c = vec_add(__a, __b);
434   return (__m64)((__vector long long)__c)[0];
435 #else
436   __m64_union __mu1, __mu2, __res;
437 
438   __mu1.as_m64 = __m1;
439   __mu2.as_m64 = __m2;
440 
441   __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
442   __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
443   __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
444   __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
445 
446   return (__m64)__res.as_m64;
447 #endif
448 }
449 
450 extern __inline __m64
451     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452     _m_paddw(__m64 __m1, __m64 __m2) {
453   return _mm_add_pi16(__m1, __m2);
454 }
455 
456 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
457 extern __inline __m64
458     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459     _mm_add_pi32(__m64 __m1, __m64 __m2) {
460 #if _ARCH_PWR9
461   __vector signed int __a, __b, __c;
462 
463   __a = (__vector signed int)vec_splats(__m1);
464   __b = (__vector signed int)vec_splats(__m2);
465   __c = vec_add(__a, __b);
466   return (__m64)((__vector long long)__c)[0];
467 #else
468   __m64_union __mu1, __mu2, __res;
469 
470   __mu1.as_m64 = __m1;
471   __mu2.as_m64 = __m2;
472 
473   __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
474   __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
475 
476   return (__m64)__res.as_m64;
477 #endif
478 }
479 
480 extern __inline __m64
481     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482     _m_paddd(__m64 __m1, __m64 __m2) {
483   return _mm_add_pi32(__m1, __m2);
484 }
485 
486 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
487 extern __inline __m64
488     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489     _mm_sub_pi8(__m64 __m1, __m64 __m2) {
490 #if _ARCH_PWR8
491   __vector signed char __a, __b, __c;
492 
493   __a = (__vector signed char)vec_splats(__m1);
494   __b = (__vector signed char)vec_splats(__m2);
495   __c = vec_sub(__a, __b);
496   return (__m64)((__vector long long)__c)[0];
497 #else
498   __m64_union __mu1, __mu2, __res;
499 
500   __mu1.as_m64 = __m1;
501   __mu2.as_m64 = __m2;
502 
503   __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
504   __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
505   __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
506   __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
507   __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
508   __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
509   __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
510   __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
511 
512   return (__m64)__res.as_m64;
513 #endif
514 }
515 
516 extern __inline __m64
517     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
518     _m_psubb(__m64 __m1, __m64 __m2) {
519   return _mm_sub_pi8(__m1, __m2);
520 }
521 
522 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
523 extern __inline __m64
524     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525     _mm_sub_pi16(__m64 __m1, __m64 __m2) {
526 #if _ARCH_PWR8
527   __vector signed short __a, __b, __c;
528 
529   __a = (__vector signed short)vec_splats(__m1);
530   __b = (__vector signed short)vec_splats(__m2);
531   __c = vec_sub(__a, __b);
532   return (__m64)((__vector long long)__c)[0];
533 #else
534   __m64_union __mu1, __mu2, __res;
535 
536   __mu1.as_m64 = __m1;
537   __mu2.as_m64 = __m2;
538 
539   __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
540   __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
541   __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
542   __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
543 
544   return (__m64)__res.as_m64;
545 #endif
546 }
547 
548 extern __inline __m64
549     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
550     _m_psubw(__m64 __m1, __m64 __m2) {
551   return _mm_sub_pi16(__m1, __m2);
552 }
553 
554 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
555 extern __inline __m64
556     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
557     _mm_sub_pi32(__m64 __m1, __m64 __m2) {
558 #if _ARCH_PWR9
559   __vector signed int __a, __b, __c;
560 
561   __a = (__vector signed int)vec_splats(__m1);
562   __b = (__vector signed int)vec_splats(__m2);
563   __c = vec_sub(__a, __b);
564   return (__m64)((__vector long long)__c)[0];
565 #else
566   __m64_union __mu1, __mu2, __res;
567 
568   __mu1.as_m64 = __m1;
569   __mu2.as_m64 = __m2;
570 
571   __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
572   __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
573 
574   return (__m64)__res.as_m64;
575 #endif
576 }
577 
578 extern __inline __m64
579     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
580     _m_psubd(__m64 __m1, __m64 __m2) {
581   return _mm_sub_pi32(__m1, __m2);
582 }
583 
584 extern __inline __m64
585     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
586     _mm_add_si64(__m64 __m1, __m64 __m2) {
587   return (__m1 + __m2);
588 }
589 
590 extern __inline __m64
591     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592     _mm_sub_si64(__m64 __m1, __m64 __m2) {
593   return (__m1 - __m2);
594 }
595 
596 /* Shift the 64-bit value in M left by COUNT.  */
597 extern __inline __m64
598     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599     _mm_sll_si64(__m64 __m, __m64 __count) {
600   return (__m << __count);
601 }
602 
603 extern __inline __m64
604     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605     _m_psllq(__m64 __m, __m64 __count) {
606   return _mm_sll_si64(__m, __count);
607 }
608 
609 extern __inline __m64
610     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611     _mm_slli_si64(__m64 __m, const int __count) {
612   return (__m << __count);
613 }
614 
615 extern __inline __m64
616     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617     _m_psllqi(__m64 __m, const int __count) {
618   return _mm_slli_si64(__m, __count);
619 }
620 
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
622 extern __inline __m64
623     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624     _mm_srl_si64(__m64 __m, __m64 __count) {
625   return (__m >> __count);
626 }
627 
628 extern __inline __m64
629     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
630     _m_psrlq(__m64 __m, __m64 __count) {
631   return _mm_srl_si64(__m, __count);
632 }
633 
634 extern __inline __m64
635     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
636     _mm_srli_si64(__m64 __m, const int __count) {
637   return (__m >> __count);
638 }
639 
640 extern __inline __m64
641     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642     _m_psrlqi(__m64 __m, const int __count) {
643   return _mm_srli_si64(__m, __count);
644 }
645 
646 /* Bit-wise AND the 64-bit values in M1 and M2.  */
647 extern __inline __m64
648     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649     _mm_and_si64(__m64 __m1, __m64 __m2) {
650   return (__m1 & __m2);
651 }
652 
653 extern __inline __m64
654     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
655     _m_pand(__m64 __m1, __m64 __m2) {
656   return _mm_and_si64(__m1, __m2);
657 }
658 
659 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
660    64-bit value in M2.  */
661 extern __inline __m64
662     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
663     _mm_andnot_si64(__m64 __m1, __m64 __m2) {
664   return (~__m1 & __m2);
665 }
666 
667 extern __inline __m64
668     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
669     _m_pandn(__m64 __m1, __m64 __m2) {
670   return _mm_andnot_si64(__m1, __m2);
671 }
672 
673 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
674 extern __inline __m64
675     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
676     _mm_or_si64(__m64 __m1, __m64 __m2) {
677   return (__m1 | __m2);
678 }
679 
680 extern __inline __m64
681     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682     _m_por(__m64 __m1, __m64 __m2) {
683   return _mm_or_si64(__m1, __m2);
684 }
685 
686 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
687 extern __inline __m64
688     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
689     _mm_xor_si64(__m64 __m1, __m64 __m2) {
690   return (__m1 ^ __m2);
691 }
692 
693 extern __inline __m64
694     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695     _m_pxor(__m64 __m1, __m64 __m2) {
696   return _mm_xor_si64(__m1, __m2);
697 }
698 
699 /* Creates a 64-bit zero.  */
700 extern __inline __m64
701     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
702     _mm_setzero_si64(void) {
703   return (__m64)0;
704 }
705 
706 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
707    test is true and zero if false.  */
708 extern __inline __m64
709     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710     _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
711 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
712   __m64 __res;
713   __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
714   return (__res);
715 #else
716   __m64_union __mu1, __mu2, __res;
717 
718   __mu1.as_m64 = __m1;
719   __mu2.as_m64 = __m2;
720 
721   __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
722   __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
723   __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
724   __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
725   __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
726   __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
727   __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
728   __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;
729 
730   return (__m64)__res.as_m64;
731 #endif
732 }
733 
734 extern __inline __m64
735     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736     _m_pcmpeqb(__m64 __m1, __m64 __m2) {
737   return _mm_cmpeq_pi8(__m1, __m2);
738 }
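
/* Example (illustrative values; variable names are arbitrary): the
   all-ones / all-zeros lanes form a byte mask that combines naturally with
   the logical intrinsics such as _mm_and_si64.

     __m64 __a = _mm_set_pi8(1, 2, 3, 4, 5, 6, 7, 8);
     __m64 __b = _mm_set_pi8(1, 0, 3, 0, 5, 0, 7, 0);
     __m64 __mask = _mm_cmpeq_pi8(__a, __b);   // 0xFF where the lanes match
     __m64 __kept = _mm_and_si64(__a, __mask); // __a's bytes where they match,
                                               // zero elsewhere
*/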
739 
740 extern __inline __m64
741     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
742     _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
743 #if _ARCH_PWR8
744   __vector signed char __a, __b, __c;
745 
746   __a = (__vector signed char)vec_splats(__m1);
747   __b = (__vector signed char)vec_splats(__m2);
748   __c = (__vector signed char)vec_cmpgt(__a, __b);
749   return (__m64)((__vector long long)__c)[0];
750 #else
751   __m64_union __mu1, __mu2, __res;
752 
753   __mu1.as_m64 = __m1;
754   __mu2.as_m64 = __m2;
755 
756   __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
757   __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
758   __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
759   __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
760   __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
761   __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
762   __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
763   __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;
764 
765   return (__m64)__res.as_m64;
766 #endif
767 }
768 
769 extern __inline __m64
770     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
771     _m_pcmpgtb(__m64 __m1, __m64 __m2) {
772   return _mm_cmpgt_pi8(__m1, __m2);
773 }
774 
775 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
776    the test is true and zero if false.  */
777 extern __inline __m64
778     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779     _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
780 #if _ARCH_PWR8
781   __vector signed short __a, __b, __c;
782 
783   __a = (__vector signed short)vec_splats(__m1);
784   __b = (__vector signed short)vec_splats(__m2);
785   __c = (__vector signed short)vec_cmpeq(__a, __b);
786   return (__m64)((__vector long long)__c)[0];
787 #else
788   __m64_union __mu1, __mu2, __res;
789 
790   __mu1.as_m64 = __m1;
791   __mu2.as_m64 = __m2;
792 
793   __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
794   __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
795   __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
796   __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;
797 
798   return (__m64)__res.as_m64;
799 #endif
800 }
801 
802 extern __inline __m64
803     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
804     _m_pcmpeqw(__m64 __m1, __m64 __m2) {
805   return _mm_cmpeq_pi16(__m1, __m2);
806 }
807 
808 extern __inline __m64
809     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810     _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
811 #if _ARCH_PWR8
812   __vector signed short __a, __b, __c;
813 
814   __a = (__vector signed short)vec_splats(__m1);
815   __b = (__vector signed short)vec_splats(__m2);
816   __c = (__vector signed short)vec_cmpgt(__a, __b);
817   return (__m64)((__vector long long)__c)[0];
818 #else
819   __m64_union __mu1, __mu2, __res;
820 
821   __mu1.as_m64 = __m1;
822   __mu2.as_m64 = __m2;
823 
824   __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
825   __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
826   __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
827   __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;
828 
829   return (__m64)__res.as_m64;
830 #endif
831 }
832 
833 extern __inline __m64
834     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835     _m_pcmpgtw(__m64 __m1, __m64 __m2) {
836   return _mm_cmpgt_pi16(__m1, __m2);
837 }
838 
839 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
840    the test is true and zero if false.  */
841 extern __inline __m64
842     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843     _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
844 #if _ARCH_PWR9
845   __vector signed int __a, __b, __c;
846 
847   __a = (__vector signed int)vec_splats(__m1);
848   __b = (__vector signed int)vec_splats(__m2);
849   __c = (__vector signed int)vec_cmpeq(__a, __b);
850   return (__m64)((__vector long long)__c)[0];
851 #else
852   __m64_union __mu1, __mu2, __res;
853 
854   __mu1.as_m64 = __m1;
855   __mu2.as_m64 = __m2;
856 
857   __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
858   __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;
859 
860   return (__m64)__res.as_m64;
861 #endif
862 }
863 
864 extern __inline __m64
865     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
866     _m_pcmpeqd(__m64 __m1, __m64 __m2) {
867   return _mm_cmpeq_pi32(__m1, __m2);
868 }
869 
870 extern __inline __m64
871     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872     _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
873 #if _ARCH_PWR9
874   __vector signed int __a, __b, __c;
875 
876   __a = (__vector signed int)vec_splats(__m1);
877   __b = (__vector signed int)vec_splats(__m2);
878   __c = (__vector signed int)vec_cmpgt(__a, __b);
879   return (__m64)((__vector long long)__c)[0];
880 #else
881   __m64_union __mu1, __mu2, __res;
882 
883   __mu1.as_m64 = __m1;
884   __mu2.as_m64 = __m2;
885 
886   __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
887   __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;
888 
889   return (__m64)__res.as_m64;
890 #endif
891 }
892 
893 extern __inline __m64
894     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
895     _m_pcmpgtd(__m64 __m1, __m64 __m2) {
896   return _mm_cmpgt_pi32(__m1, __m2);
897 }
898 
899 #if _ARCH_PWR8
900 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
901    saturated arithmetic.  */
902 extern __inline __m64
903     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904     _mm_adds_pi8(__m64 __m1, __m64 __m2) {
905   __vector signed char __a, __b, __c;
906 
907   __a = (__vector signed char)vec_splats(__m1);
908   __b = (__vector signed char)vec_splats(__m2);
909   __c = vec_adds(__a, __b);
910   return (__m64)((__vector long long)__c)[0];
911 }
912 
913 extern __inline __m64
914     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
915     _m_paddsb(__m64 __m1, __m64 __m2) {
916   return _mm_adds_pi8(__m1, __m2);
917 }
918 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
919    saturated arithmetic.  */
920 extern __inline __m64
921     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
922     _mm_adds_pi16(__m64 __m1, __m64 __m2) {
923   __vector signed short __a, __b, __c;
924 
925   __a = (__vector signed short)vec_splats(__m1);
926   __b = (__vector signed short)vec_splats(__m2);
927   __c = vec_adds(__a, __b);
928   return (__m64)((__vector long long)__c)[0];
929 }
930 
931 extern __inline __m64
932     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
933     _m_paddsw(__m64 __m1, __m64 __m2) {
934   return _mm_adds_pi16(__m1, __m2);
935 }
936 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
937    saturated arithmetic.  */
938 extern __inline __m64
939     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
940     _mm_adds_pu8(__m64 __m1, __m64 __m2) {
941   __vector unsigned char __a, __b, __c;
942 
943   __a = (__vector unsigned char)vec_splats(__m1);
944   __b = (__vector unsigned char)vec_splats(__m2);
945   __c = vec_adds(__a, __b);
946   return (__m64)((__vector long long)__c)[0];
947 }
948 
949 extern __inline __m64
950     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951     _m_paddusb(__m64 __m1, __m64 __m2) {
952   return _mm_adds_pu8(__m1, __m2);
953 }
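
/* Example (illustrative values; variable names are arbitrary): unsigned
   lanes clamp at 255 instead of wrapping.

     // 0xC8 is 200 as an unsigned byte (passed here as the signed char -56).
     __m64 __r = _mm_adds_pu8(_mm_set1_pi8(-56), _mm_set1_pi8(100));
     // Every byte of __r is 0xFF: 200 + 100 saturates to 255 instead of
     // wrapping to 44.
*/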
954 
955 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
956    saturated arithmetic.  */
957 extern __inline __m64
958     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
959     _mm_adds_pu16(__m64 __m1, __m64 __m2) {
960   __vector unsigned short __a, __b, __c;
961 
962   __a = (__vector unsigned short)vec_splats(__m1);
963   __b = (__vector unsigned short)vec_splats(__m2);
964   __c = vec_adds(__a, __b);
965   return (__m64)((__vector long long)__c)[0];
966 }
967 
968 extern __inline __m64
969     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
970     _m_paddusw(__m64 __m1, __m64 __m2) {
971   return _mm_adds_pu16(__m1, __m2);
972 }
973 
974 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
975    saturating arithmetic.  */
976 extern __inline __m64
977     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
978     _mm_subs_pi8(__m64 __m1, __m64 __m2) {
979   __vector signed char __a, __b, __c;
980 
981   __a = (__vector signed char)vec_splats(__m1);
982   __b = (__vector signed char)vec_splats(__m2);
983   __c = vec_subs(__a, __b);
984   return (__m64)((__vector long long)__c)[0];
985 }
986 
987 extern __inline __m64
988     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
989     _m_psubsb(__m64 __m1, __m64 __m2) {
990   return _mm_subs_pi8(__m1, __m2);
991 }
992 
993 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
994    signed saturating arithmetic.  */
995 extern __inline __m64
996     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
997     _mm_subs_pi16(__m64 __m1, __m64 __m2) {
998   __vector signed short __a, __b, __c;
999 
1000   __a = (__vector signed short)vec_splats(__m1);
1001   __b = (__vector signed short)vec_splats(__m2);
1002   __c = vec_subs(__a, __b);
1003   return (__m64)((__vector long long)__c)[0];
1004 }
1005 
1006 extern __inline __m64
1007     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1008     _m_psubsw(__m64 __m1, __m64 __m2) {
1009   return _mm_subs_pi16(__m1, __m2);
1010 }
1011 
1012 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1013    unsigned saturating arithmetic.  */
1014 extern __inline __m64
1015     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016     _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1017   __vector unsigned char __a, __b, __c;
1018 
1019   __a = (__vector unsigned char)vec_splats(__m1);
1020   __b = (__vector unsigned char)vec_splats(__m2);
1021   __c = vec_subs(__a, __b);
1022   return (__m64)((__vector long long)__c)[0];
1023 }
1024 
1025 extern __inline __m64
1026     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1027     _m_psubusb(__m64 __m1, __m64 __m2) {
1028   return _mm_subs_pu8(__m1, __m2);
1029 }
1030 
1031 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1032    unsigned saturating arithmetic.  */
1033 extern __inline __m64
1034     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1035     _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1036   __vector unsigned short __a, __b, __c;
1037 
1038   __a = (__vector unsigned short)vec_splats(__m1);
1039   __b = (__vector unsigned short)vec_splats(__m2);
1040   __c = vec_subs(__a, __b);
1041   return (__m64)((__vector long long)__c)[0];
1042 }
1043 
1044 extern __inline __m64
1045     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046     _m_psubusw(__m64 __m1, __m64 __m2) {
1047   return _mm_subs_pu16(__m1, __m2);
1048 }
1049 
1050 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1051    four 32-bit intermediate results, which are then summed by pairs to
1052    produce two 32-bit results.  */
1053 extern __inline __m64
1054     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1055     _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1056   __vector signed short __a, __b;
1057   __vector signed int __c;
1058   __vector signed int __zero = {0, 0, 0, 0};
1059 
1060   __a = (__vector signed short)vec_splats(__m1);
1061   __b = (__vector signed short)vec_splats(__m2);
1062   __c = vec_vmsumshm(__a, __b, __zero);
1063   return (__m64)((__vector long long)__c)[0];
1064 }
1065 
1066 extern __inline __m64
1067     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068     _m_pmaddwd(__m64 __m1, __m64 __m2) {
1069   return _mm_madd_pi16(__m1, __m2);
1070 }
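
/* Example (illustrative values; variable names are arbitrary): products of
   corresponding halfwords are summed in adjacent pairs.

     __m64 __r = _mm_madd_pi16(_mm_set_pi16(4, 3, 2, 1),
                               _mm_set_pi16(40, 30, 20, 10));
     // 32-bit lane 0: 1*10 + 2*20 = 50
     // 32-bit lane 1: 3*30 + 4*40 = 250
*/
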
1071 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1072    M2 and produce the high 16 bits of the 32-bit results.  */
1073 extern __inline __m64
1074     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075     _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1076   __vector signed short __a, __b;
1077   __vector signed short __c;
1078   __vector signed int __w0, __w1;
1079   __vector unsigned char __xform1 = {
1080 #ifdef __LITTLE_ENDIAN__
1081       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1082       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1083 #else
1084       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1085       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1086 #endif
1087   };
1088 
1089   __a = (__vector signed short)vec_splats(__m1);
1090   __b = (__vector signed short)vec_splats(__m2);
1091 
1092   __w0 = vec_vmulesh(__a, __b);
1093   __w1 = vec_vmulosh(__a, __b);
1094   __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);
1095 
1096   return (__m64)((__vector long long)__c)[0];
1097 }
1098 
1099 extern __inline __m64
1100     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1101     _m_pmulhw(__m64 __m1, __m64 __m2) {
1102   return _mm_mulhi_pi16(__m1, __m2);
1103 }
1104 
1105 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1106    the low 16 bits of the results.  */
1107 extern __inline __m64
1108     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1109     _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1110   __vector signed short __a, __b, __c;
1111 
1112   __a = (__vector signed short)vec_splats(__m1);
1113   __b = (__vector signed short)vec_splats(__m2);
1114   __c = __a * __b;
1115   return (__m64)((__vector long long)__c)[0];
1116 }
1117 
1118 extern __inline __m64
1119     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120     _m_pmullw(__m64 __m1, __m64 __m2) {
1121   return _mm_mullo_pi16(__m1, __m2);
1122 }
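
/* Example (illustrative values; variable names are arbitrary): together,
   _mm_mulhi_pi16 and _mm_mullo_pi16 recover the full 32-bit product of each
   pair of halfwords.

     __m64 __hi = _mm_mulhi_pi16(_mm_set1_pi16(20000), _mm_set1_pi16(30000));
     __m64 __lo = _mm_mullo_pi16(_mm_set1_pi16(20000), _mm_set1_pi16(30000));
     // 20000 * 30000 = 600000000 = 0x23C34600, so every halfword of __hi is
     // 0x23C3 (9155) and every halfword of __lo is 0x4600 (17920).
*/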
1123 
1124 /* Shift four 16-bit values in M left by COUNT.  */
1125 extern __inline __m64
1126     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1127     _mm_sll_pi16(__m64 __m, __m64 __count) {
1128   __vector signed short __r;
1129   __vector unsigned short __c;
1130 
1131   if (__count <= 15) {
1132     __r = (__vector signed short)vec_splats(__m);
1133     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1134     __r = vec_sl(__r, (__vector unsigned short)__c);
1135     return (__m64)((__vector long long)__r)[0];
1136   } else
1137     return (0);
1138 }
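
/* Example (illustrative values; variable names are arbitrary): all four
   lanes shift by the same count, and a count greater than 15 yields zero,
   matching the MMX behavior.

     __m64 __r1 = _mm_sll_pi16(_mm_set1_pi16(1), _mm_cvtsi32_si64(3));
     // Every halfword of __r1 is 8.
     __m64 __r2 = _mm_sll_pi16(_mm_set1_pi16(1), _mm_cvtsi32_si64(16));
     // __r2 is 0.
*/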
1139 
1140 extern __inline __m64
1141     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142     _m_psllw(__m64 __m, __m64 __count) {
1143   return _mm_sll_pi16(__m, __count);
1144 }
1145 
1146 extern __inline __m64
1147     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1148     _mm_slli_pi16(__m64 __m, int __count) {
1149   /* Promote int to long then invoke mm_sll_pi16.  */
1150   return _mm_sll_pi16(__m, __count);
1151 }
1152 
1153 extern __inline __m64
1154     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1155     _m_psllwi(__m64 __m, int __count) {
1156   return _mm_slli_pi16(__m, __count);
1157 }
1158 
1159 /* Shift two 32-bit values in M left by COUNT.  */
1160 extern __inline __m64
1161     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162     _mm_sll_pi32(__m64 __m, __m64 __count) {
1163   __m64_union __res;
1164 
1165   __res.as_m64 = __m;
1166 
1167   __res.as_int[0] = __res.as_int[0] << __count;
1168   __res.as_int[1] = __res.as_int[1] << __count;
1169   return (__res.as_m64);
1170 }
1171 
1172 extern __inline __m64
1173     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1174     _m_pslld(__m64 __m, __m64 __count) {
1175   return _mm_sll_pi32(__m, __count);
1176 }
1177 
1178 extern __inline __m64
1179     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180     _mm_slli_pi32(__m64 __m, int __count) {
1181   /* Promote int to long then invoke mm_sll_pi32.  */
1182   return _mm_sll_pi32(__m, __count);
1183 }
1184 
1185 extern __inline __m64
1186     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1187     _m_pslldi(__m64 __m, int __count) {
1188   return _mm_slli_pi32(__m, __count);
1189 }
1190 
1191 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1192 extern __inline __m64
1193     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194     _mm_sra_pi16(__m64 __m, __m64 __count) {
1195   __vector signed short __r;
1196   __vector unsigned short __c;
1197 
1198   if (__count <= 15) {
1199     __r = (__vector signed short)vec_splats(__m);
1200     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1201     __r = vec_sra(__r, (__vector unsigned short)__c);
1202     return (__m64)((__vector long long)__r)[0];
1203   } else
1204     return (0);
1205 }
1206 
1207 extern __inline __m64
1208     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1209     _m_psraw(__m64 __m, __m64 __count) {
1210   return _mm_sra_pi16(__m, __count);
1211 }
1212 
1213 extern __inline __m64
1214     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215     _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16.  */
1217   return _mm_sra_pi16(__m, __count);
1218 }
1219 
1220 extern __inline __m64
1221     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1222     _m_psrawi(__m64 __m, int __count) {
1223   return _mm_srai_pi16(__m, __count);
1224 }
1225 
1226 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1227 extern __inline __m64
1228     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229     _mm_sra_pi32(__m64 __m, __m64 __count) {
1230   __m64_union __res;
1231 
1232   __res.as_m64 = __m;
1233 
1234   __res.as_int[0] = __res.as_int[0] >> __count;
1235   __res.as_int[1] = __res.as_int[1] >> __count;
1236   return (__res.as_m64);
1237 }
1238 
1239 extern __inline __m64
1240     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1241     _m_psrad(__m64 __m, __m64 __count) {
1242   return _mm_sra_pi32(__m, __count);
1243 }
1244 
1245 extern __inline __m64
1246     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1247     _mm_srai_pi32(__m64 __m, int __count) {
1248   /* Promote int to long then invoke mm_sra_pi32.  */
1249   return _mm_sra_pi32(__m, __count);
1250 }
1251 
1252 extern __inline __m64
1253     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1254     _m_psradi(__m64 __m, int __count) {
1255   return _mm_srai_pi32(__m, __count);
1256 }
1257 
1258 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1259 extern __inline __m64
1260     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261     _mm_srl_pi16(__m64 __m, __m64 __count) {
1262   __vector unsigned short __r;
1263   __vector unsigned short __c;
1264 
1265   if (__count <= 15) {
1266     __r = (__vector unsigned short)vec_splats(__m);
1267     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1268     __r = vec_sr(__r, (__vector unsigned short)__c);
1269     return (__m64)((__vector long long)__r)[0];
1270   } else
1271     return (0);
1272 }
1273 
1274 extern __inline __m64
1275     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1276     _m_psrlw(__m64 __m, __m64 __count) {
1277   return _mm_srl_pi16(__m, __count);
1278 }
1279 
1280 extern __inline __m64
1281     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1282     _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16.  */
1284   return _mm_srl_pi16(__m, __count);
1285 }
1286 
1287 extern __inline __m64
1288     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289     _m_psrlwi(__m64 __m, int __count) {
1290   return _mm_srli_pi16(__m, __count);
1291 }
1292 
1293 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1294 extern __inline __m64
1295     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1296     _mm_srl_pi32(__m64 __m, __m64 __count) {
1297   __m64_union __res;
1298 
1299   __res.as_m64 = __m;
1300 
1301   __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
1302   __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
1303   return (__res.as_m64);
1304 }
1305 
1306 extern __inline __m64
1307     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1308     _m_psrld(__m64 __m, __m64 __count) {
1309   return _mm_srl_pi32(__m, __count);
1310 }
1311 
1312 extern __inline __m64
1313     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1314     _mm_srli_pi32(__m64 __m, int __count) {
1315   /* Promote int to long then invoke mm_srl_pi32.  */
1316   return _mm_srl_pi32(__m, __count);
1317 }
1318 
1319 extern __inline __m64
1320     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321     _m_psrldi(__m64 __m, int __count) {
1322   return _mm_srli_pi32(__m, __count);
1323 }
1324 #endif /* _ARCH_PWR8 */
1325 
1326 /* Creates a vector of two 32-bit values; I0 is least significant.  */
1327 extern __inline __m64
1328     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329     _mm_set_pi32(int __i1, int __i0) {
1330   __m64_union __res;
1331 
1332   __res.as_int[0] = __i0;
1333   __res.as_int[1] = __i1;
1334   return (__res.as_m64);
1335 }
1336 
1337 /* Creates a vector of four 16-bit values; W0 is least significant.  */
1338 extern __inline __m64
1339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340     _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1341   __m64_union __res;
1342 
1343   __res.as_short[0] = __w0;
1344   __res.as_short[1] = __w1;
1345   __res.as_short[2] = __w2;
1346   __res.as_short[3] = __w3;
1347   return (__res.as_m64);
1348 }
1349 
1350 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
1351 extern __inline __m64
1352     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353     _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1354                 char __b2, char __b1, char __b0) {
1355   __m64_union __res;
1356 
1357   __res.as_char[0] = __b0;
1358   __res.as_char[1] = __b1;
1359   __res.as_char[2] = __b2;
1360   __res.as_char[3] = __b3;
1361   __res.as_char[4] = __b4;
1362   __res.as_char[5] = __b5;
1363   __res.as_char[6] = __b6;
1364   __res.as_char[7] = __b7;
1365   return (__res.as_m64);
1366 }
1367 
1368 /* Similar, but with the arguments in reverse order.  */
1369 extern __inline __m64
1370     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371     _mm_setr_pi32(int __i0, int __i1) {
1372   __m64_union __res;
1373 
1374   __res.as_int[0] = __i0;
1375   __res.as_int[1] = __i1;
1376   return (__res.as_m64);
1377 }
1378 
1379 extern __inline __m64
1380     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381     _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1382   return _mm_set_pi16(__w3, __w2, __w1, __w0);
1383 }
1384 
1385 extern __inline __m64
1386     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387     _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1388                  char __b5, char __b6, char __b7) {
1389   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1390 }
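
/* Example (illustrative values; variable names are arbitrary): the
   _mm_set_* forms list elements from most significant (leftmost argument)
   to least significant, while the _mm_setr_* forms take them in the reverse
   order, so the two calls below build the same value.

     __m64 __a = _mm_set_pi16(4, 3, 2, 1);
     __m64 __b = _mm_setr_pi16(1, 2, 3, 4);
     // __a and __b are identical; lane 0 holds 1 and lane 3 holds 4.
*/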
1391 
1392 /* Creates a vector of two 32-bit values, both elements containing I.  */
1393 extern __inline __m64
1394     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1395     _mm_set1_pi32(int __i) {
1396   __m64_union __res;
1397 
1398   __res.as_int[0] = __i;
1399   __res.as_int[1] = __i;
1400   return (__res.as_m64);
1401 }
1402 
1403 /* Creates a vector of four 16-bit values, all elements containing W.  */
1404 extern __inline __m64
1405     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406     _mm_set1_pi16(short __w) {
1407 #if _ARCH_PWR9
1408   __vector signed short w;
1409 
1410   w = (__vector signed short)vec_splats(__w);
1411   return (__m64)((__vector long long)w)[0];
1412 #else
1413   __m64_union __res;
1414 
1415   __res.as_short[0] = __w;
1416   __res.as_short[1] = __w;
1417   __res.as_short[2] = __w;
1418   __res.as_short[3] = __w;
1419   return (__res.as_m64);
1420 #endif
1421 }
1422 
1423 /* Creates a vector of eight 8-bit values, all elements containing B.  */
1424 extern __inline __m64
1425     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1426     _mm_set1_pi8(signed char __b) {
1427 #if _ARCH_PWR8
1428   __vector signed char __res;
1429 
1430   __res = (__vector signed char)vec_splats(__b);
1431   return (__m64)((__vector long long)__res)[0];
1432 #else
1433   __m64_union __res;
1434 
1435   __res.as_char[0] = __b;
1436   __res.as_char[1] = __b;
1437   __res.as_char[2] = __b;
1438   __res.as_char[3] = __b;
1439   __res.as_char[4] = __b;
1440   __res.as_char[5] = __b;
1441   __res.as_char[6] = __b;
1442   __res.as_char[7] = __b;
1443   return (__res.as_m64);
1444 #endif
1445 }
1446 
1447 #else
1448 #include_next <mmintrin.h>
1449 #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))   \
1450         */
1451 
1452 #endif /* _MMINTRIN_H_INCLUDED */
1453