/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
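
/* Illustrative sketch (an addition to this commentary, not part of the
   original header): the rewrite recommended above can often use GNU C
   vector extensions directly.  A 16-byte vector type compiles to native
   SIMD instructions on both targets with no intrinsics at all:

     typedef signed char v16qi __attribute__ ((__vector_size__ (16)));

     v16qi
     add_bytes (v16qi __a, v16qi __b)
     {
       return __a + __b;   / * vaddubm on POWER, paddb on x86_64.  * /
     }
 */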

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
                        __may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
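
/* Usage sketch (illustrative only): __m64_union provides lane-wise access
   to an __m64 value, e.g. to read the least-significant 16-bit lane:

     __m64_union __u;
     __u.as_m64 = __some_m64_value;   / * hypothetical input * /
     short __lane0 = __u.as_short[0];
 */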

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
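
/* Worked example (illustrative): because _mm_cvtsi32_si64 zero-extends,
   a negative input leaves bits 32-63 clear, while _mm_cvtsi64_si32
   simply truncates:

     _mm_cvtsi32_si64 (-1)                     == 0x00000000FFFFFFFFULL
     _mm_cvtsi64_si32 (_mm_cvtsi32_si64 (-1))  == -1
 */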

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}
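
/* Worked example (illustrative): lanes outside the signed 8-bit range
   saturate to -128 or 127:

     __m64 __a = _mm_set_pi16 (300, -300, 5, -5);   / * lane 0 is -5 * /
     __m64 __r = _mm_packs_pi16 (__a, __a);
     / * bytes of __r, low to high: -5, 5, -128, 127, -5, 5, -128, 127 * /
 */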

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (vm1, __zero);
  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
  __vector __bool char packsel = vec_pack (__select, __select);
  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
  return (__m64) ((__vector long long) r)[0];
}
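
/* Worked example (illustrative): with unsigned saturation, negative lanes
   clamp to 0 and lanes above 255 clamp to 255; the vec_sel with the
   sign-compare mask above implements the clamp to zero:

     __m64 __a = _mm_set_pi16 (300, -1, 200, 7);    / * lane 0 is 7 * /
     __m64 __r = _mm_packs_pu16 (__a, __a);
     / * bytes of __r, low to high: 7, 200, 0, 255, 7, 200, 0, 255 * /
 */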

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}
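
/* Worked example (illustrative): the upper four bytes of each operand
   are interleaved, starting with M1:

     __m64 __a = _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0);        / * byte 0 is 0 * /
     __m64 __b = _mm_set_pi8 (17, 16, 15, 14, 13, 12, 11, 10);
     __m64 __r = _mm_unpackhi_pi8 (__a, __b);
     / * bytes of __r, low to high: 4, 14, 5, 15, 6, 16, 7, 17 * /
 */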

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}
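
/* Worked example (illustrative): each equal byte lane yields 0xFF, each
   unequal lane yields 0x00:

     __m64 __a = _mm_set1_pi8 (3);
     __m64 __b = _mm_set_pi8 (3, 0, 3, 0, 3, 0, 3, 0);   / * byte 0 is 0 * /
     __m64 __r = _mm_cmpeq_pi8 (__a, __b);
     / * bytes of __r, low to high: 00, FF, 00, FF, 00, FF, 00, FF * /
 */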

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}
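
/* Worked example (illustrative): the four 32-bit products are summed in
   pairs into two 32-bit lanes:

     __m64 __a = _mm_set_pi16 (4, 3, 2, 1);      / * lane 0 is 1 * /
     __m64 __b = _mm_set_pi16 (40, 30, 20, 10);
     __m64 __r = _mm_madd_pi16 (__a, __b);
     / * low 32-bit lane:  1*10 + 2*20 =  50 * /
     / * high 32-bit lane: 3*30 + 4*40 = 250 * /
 */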

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}
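
/* Worked example (illustrative): only the high 16 bits of each 32-bit
   product survive:

     __m64 __a = _mm_set1_pi16 (0x4000);   / * 16384 * /
     __m64 __r = _mm_mulhi_pi16 (__a, __a);
     / * each lane: (16384 * 16384) >> 16 == 0x1000 * /
 */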

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}
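
/* Note (illustrative): as with the x86 PSLLW instruction, a shift count
   greater than 15 clears all lanes:

     _mm_sll_pi16 (_mm_set1_pi16 (1), _mm_cvtsi32_si64 (3));    / * each lane 8 * /
     _mm_sll_pi16 (_mm_set1_pi16 (1), _mm_cvtsi32_si64 (16));   / * all zero * /
 */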

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}
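
/* Note (illustrative): arguments are given most-significant first, so B0
   lands in the lowest byte; _mm_setr_pi8 below takes them in memory
   order instead:

     __m64 __v = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     / * bytes of __v, low to high: 1, 2, 3, 4, 5, 6, 7, 8 * /
 */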

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */