1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __TMMINTRIN_H
11 #define __TMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <pmmintrin.h>
18 
/* Attribute sets applied to every intrinsic defined in this header.
 *
 * __DEFAULT_FN_ATTRS     - used by the intrinsics that operate on 128-bit
 *                          vectors; enables the "ssse3" target feature.
 * __DEFAULT_FN_ATTRS_MMX - used by the intrinsics that operate on 64-bit
 *                          (__m64) vectors; enables "mmx" and "ssse3".
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"),          \
                 __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"),      \
                 __min_vector_width__(64)))
22 
23 /// Computes the absolute value of each of the packed 8-bit signed
24 ///    integers in the source operand and stores the 8-bit unsigned integer
25 ///    results in the destination.
26 ///
27 /// \headerfile <x86intrin.h>
28 ///
29 /// This intrinsic corresponds to the \c PABSB instruction.
30 ///
31 /// \param __a
32 ///    A 64-bit vector of [8 x i8].
33 /// \returns A 64-bit integer vector containing the absolute values of the
34 ///    elements in the operand.
35 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
36 _mm_abs_pi8(__m64 __a)
37 {
38     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
39 }
40 
41 /// Computes the absolute value of each of the packed 8-bit signed
42 ///    integers in the source operand and stores the 8-bit unsigned integer
43 ///    results in the destination.
44 ///
45 /// \headerfile <x86intrin.h>
46 ///
47 /// This intrinsic corresponds to the \c VPABSB instruction.
48 ///
49 /// \param __a
50 ///    A 128-bit vector of [16 x i8].
51 /// \returns A 128-bit integer vector containing the absolute values of the
52 ///    elements in the operand.
53 static __inline__ __m128i __DEFAULT_FN_ATTRS
54 _mm_abs_epi8(__m128i __a)
55 {
56     return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
57 }
58 
59 /// Computes the absolute value of each of the packed 16-bit signed
60 ///    integers in the source operand and stores the 16-bit unsigned integer
61 ///    results in the destination.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the \c PABSW instruction.
66 ///
67 /// \param __a
68 ///    A 64-bit vector of [4 x i16].
69 /// \returns A 64-bit integer vector containing the absolute values of the
70 ///    elements in the operand.
71 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
72 _mm_abs_pi16(__m64 __a)
73 {
74     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
75 }
76 
77 /// Computes the absolute value of each of the packed 16-bit signed
78 ///    integers in the source operand and stores the 16-bit unsigned integer
79 ///    results in the destination.
80 ///
81 /// \headerfile <x86intrin.h>
82 ///
83 /// This intrinsic corresponds to the \c VPABSW instruction.
84 ///
85 /// \param __a
86 ///    A 128-bit vector of [8 x i16].
87 /// \returns A 128-bit integer vector containing the absolute values of the
88 ///    elements in the operand.
89 static __inline__ __m128i __DEFAULT_FN_ATTRS
90 _mm_abs_epi16(__m128i __a)
91 {
92     return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
93 }
94 
95 /// Computes the absolute value of each of the packed 32-bit signed
96 ///    integers in the source operand and stores the 32-bit unsigned integer
97 ///    results in the destination.
98 ///
99 /// \headerfile <x86intrin.h>
100 ///
101 /// This intrinsic corresponds to the \c PABSD instruction.
102 ///
103 /// \param __a
104 ///    A 64-bit vector of [2 x i32].
105 /// \returns A 64-bit integer vector containing the absolute values of the
106 ///    elements in the operand.
107 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
108 _mm_abs_pi32(__m64 __a)
109 {
110     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
111 }
112 
113 /// Computes the absolute value of each of the packed 32-bit signed
114 ///    integers in the source operand and stores the 32-bit unsigned integer
115 ///    results in the destination.
116 ///
117 /// \headerfile <x86intrin.h>
118 ///
119 /// This intrinsic corresponds to the \c VPABSD instruction.
120 ///
121 /// \param __a
122 ///    A 128-bit vector of [4 x i32].
123 /// \returns A 128-bit integer vector containing the absolute values of the
124 ///    elements in the operand.
125 static __inline__ __m128i __DEFAULT_FN_ATTRS
126 _mm_abs_epi32(__m128i __a)
127 {
128     return (__m128i)__builtin_elementwise_abs((__v4si)__a);
129 }
130 
/// Joins the two 128-bit integer vector operands into one intermediate
///    value and shifts that value right by the byte count given in the
///    immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param b
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param n
///    An immediate operand giving the number of bytes by which the
///    concatenated value is shifted right.
/// \returns A 128-bit integer vector containing the concatenated, right-shifted
///    value.
#define _mm_alignr_epi8(a, b, n)                                               \
  ((__m128i)__builtin_ia32_palignr128(                                         \
      (__v16qi)(__m128i)(a), (__v16qi)(__m128i)(b), (n)))
154 
/// Joins the two 64-bit integer vector operands into one intermediate value
///    and shifts that value right by the byte count given in the immediate
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param b
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param n
///    An immediate operand giving the number of bytes by which the
///    concatenated value is shifted right.
/// \returns A 64-bit integer vector containing the concatenated, right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n)                                                \
  ((__m64)__builtin_ia32_palignr(                                              \
      (__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
176 
177 /// Horizontally adds the adjacent pairs of values contained in 2 packed
178 ///    128-bit vectors of [8 x i16].
179 ///
180 /// \headerfile <x86intrin.h>
181 ///
182 /// This intrinsic corresponds to the \c VPHADDW instruction.
183 ///
184 /// \param __a
185 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
186 ///    horizontal sums of the values are stored in the lower bits of the
187 ///    destination.
188 /// \param __b
189 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
190 ///    horizontal sums of the values are stored in the upper bits of the
191 ///    destination.
192 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
193 ///    both operands.
194 static __inline__ __m128i __DEFAULT_FN_ATTRS
195 _mm_hadd_epi16(__m128i __a, __m128i __b)
196 {
197     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
198 }
199 
200 /// Horizontally adds the adjacent pairs of values contained in 2 packed
201 ///    128-bit vectors of [4 x i32].
202 ///
203 /// \headerfile <x86intrin.h>
204 ///
205 /// This intrinsic corresponds to the \c VPHADDD instruction.
206 ///
207 /// \param __a
208 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
209 ///    horizontal sums of the values are stored in the lower bits of the
210 ///    destination.
211 /// \param __b
212 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
213 ///    horizontal sums of the values are stored in the upper bits of the
214 ///    destination.
215 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
216 ///    both operands.
217 static __inline__ __m128i __DEFAULT_FN_ATTRS
218 _mm_hadd_epi32(__m128i __a, __m128i __b)
219 {
220     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
221 }
222 
223 /// Horizontally adds the adjacent pairs of values contained in 2 packed
224 ///    64-bit vectors of [4 x i16].
225 ///
226 /// \headerfile <x86intrin.h>
227 ///
228 /// This intrinsic corresponds to the \c PHADDW instruction.
229 ///
230 /// \param __a
231 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
232 ///    horizontal sums of the values are stored in the lower bits of the
233 ///    destination.
234 /// \param __b
235 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
236 ///    horizontal sums of the values are stored in the upper bits of the
237 ///    destination.
238 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
239 ///    operands.
240 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
241 _mm_hadd_pi16(__m64 __a, __m64 __b)
242 {
243     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
244 }
245 
246 /// Horizontally adds the adjacent pairs of values contained in 2 packed
247 ///    64-bit vectors of [2 x i32].
248 ///
249 /// \headerfile <x86intrin.h>
250 ///
251 /// This intrinsic corresponds to the \c PHADDD instruction.
252 ///
253 /// \param __a
254 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
255 ///    horizontal sums of the values are stored in the lower bits of the
256 ///    destination.
257 /// \param __b
258 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
259 ///    horizontal sums of the values are stored in the upper bits of the
260 ///    destination.
261 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
262 ///    operands.
263 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
264 _mm_hadd_pi32(__m64 __a, __m64 __b)
265 {
266     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
267 }
268 
269 /// Horizontally adds the adjacent pairs of values contained in 2 packed
270 ///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
271 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
272 ///    0x8000.
273 ///
274 /// \headerfile <x86intrin.h>
275 ///
276 /// This intrinsic corresponds to the \c VPHADDSW instruction.
277 ///
278 /// \param __a
279 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
280 ///    horizontal sums of the values are stored in the lower bits of the
281 ///    destination.
282 /// \param __b
283 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
284 ///    horizontal sums of the values are stored in the upper bits of the
285 ///    destination.
286 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
287 ///    sums of both operands.
288 static __inline__ __m128i __DEFAULT_FN_ATTRS
289 _mm_hadds_epi16(__m128i __a, __m128i __b)
290 {
291     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
292 }
293 
294 /// Horizontally adds the adjacent pairs of values contained in 2 packed
295 ///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
296 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
297 ///    0x8000.
298 ///
299 /// \headerfile <x86intrin.h>
300 ///
301 /// This intrinsic corresponds to the \c PHADDSW instruction.
302 ///
303 /// \param __a
304 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
305 ///    horizontal sums of the values are stored in the lower bits of the
306 ///    destination.
307 /// \param __b
308 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
309 ///    horizontal sums of the values are stored in the upper bits of the
310 ///    destination.
311 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
312 ///    sums of both operands.
313 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
314 _mm_hadds_pi16(__m64 __a, __m64 __b)
315 {
316     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
317 }
318 
319 /// Horizontally subtracts the adjacent pairs of values contained in 2
320 ///    packed 128-bit vectors of [8 x i16].
321 ///
322 /// \headerfile <x86intrin.h>
323 ///
324 /// This intrinsic corresponds to the \c VPHSUBW instruction.
325 ///
326 /// \param __a
327 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
328 ///    horizontal differences between the values are stored in the lower bits of
329 ///    the destination.
330 /// \param __b
331 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
332 ///    horizontal differences between the values are stored in the upper bits of
333 ///    the destination.
334 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
335 ///    of both operands.
336 static __inline__ __m128i __DEFAULT_FN_ATTRS
337 _mm_hsub_epi16(__m128i __a, __m128i __b)
338 {
339     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
340 }
341 
342 /// Horizontally subtracts the adjacent pairs of values contained in 2
343 ///    packed 128-bit vectors of [4 x i32].
344 ///
345 /// \headerfile <x86intrin.h>
346 ///
347 /// This intrinsic corresponds to the \c VPHSUBD instruction.
348 ///
349 /// \param __a
350 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
351 ///    horizontal differences between the values are stored in the lower bits of
352 ///    the destination.
353 /// \param __b
354 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
355 ///    horizontal differences between the values are stored in the upper bits of
356 ///    the destination.
357 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
358 ///    of both operands.
359 static __inline__ __m128i __DEFAULT_FN_ATTRS
360 _mm_hsub_epi32(__m128i __a, __m128i __b)
361 {
362     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
363 }
364 
365 /// Horizontally subtracts the adjacent pairs of values contained in 2
366 ///    packed 64-bit vectors of [4 x i16].
367 ///
368 /// \headerfile <x86intrin.h>
369 ///
370 /// This intrinsic corresponds to the \c PHSUBW instruction.
371 ///
372 /// \param __a
373 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
374 ///    horizontal differences between the values are stored in the lower bits of
375 ///    the destination.
376 /// \param __b
377 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
378 ///    horizontal differences between the values are stored in the upper bits of
379 ///    the destination.
380 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
381 ///    of both operands.
382 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
383 _mm_hsub_pi16(__m64 __a, __m64 __b)
384 {
385     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
386 }
387 
388 /// Horizontally subtracts the adjacent pairs of values contained in 2
389 ///    packed 64-bit vectors of [2 x i32].
390 ///
391 /// \headerfile <x86intrin.h>
392 ///
393 /// This intrinsic corresponds to the \c PHSUBD instruction.
394 ///
395 /// \param __a
396 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
397 ///    horizontal differences between the values are stored in the lower bits of
398 ///    the destination.
399 /// \param __b
400 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
401 ///    horizontal differences between the values are stored in the upper bits of
402 ///    the destination.
403 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
404 ///    of both operands.
405 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
406 _mm_hsub_pi32(__m64 __a, __m64 __b)
407 {
408     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
409 }
410 
411 /// Horizontally subtracts the adjacent pairs of values contained in 2
412 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
413 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
414 ///    saturated to 0x8000.
415 ///
416 /// \headerfile <x86intrin.h>
417 ///
418 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
419 ///
420 /// \param __a
421 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
422 ///    horizontal differences between the values are stored in the lower bits of
423 ///    the destination.
424 /// \param __b
425 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
426 ///    horizontal differences between the values are stored in the upper bits of
427 ///    the destination.
428 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
429 ///    differences of both operands.
430 static __inline__ __m128i __DEFAULT_FN_ATTRS
431 _mm_hsubs_epi16(__m128i __a, __m128i __b)
432 {
433     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
434 }
435 
436 /// Horizontally subtracts the adjacent pairs of values contained in 2
437 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
438 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
439 ///    saturated to 0x8000.
440 ///
441 /// \headerfile <x86intrin.h>
442 ///
443 /// This intrinsic corresponds to the \c PHSUBSW instruction.
444 ///
445 /// \param __a
446 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
447 ///    horizontal differences between the values are stored in the lower bits of
448 ///    the destination.
449 /// \param __b
450 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
451 ///    horizontal differences between the values are stored in the upper bits of
452 ///    the destination.
453 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
454 ///    differences of both operands.
455 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
456 _mm_hsubs_pi16(__m64 __a, __m64 __b)
457 {
458     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
459 }
460 
461 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
462 ///    values contained in the first source operand and packed 8-bit signed
463 ///    integer values contained in the second source operand, adds pairs of
464 ///    contiguous products with signed saturation, and writes the 16-bit sums to
465 ///    the corresponding bits in the destination.
466 ///
467 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
468 ///    both operands are multiplied, and the sum of both results is written to
469 ///    bits [15:0] of the destination.
470 ///
471 /// \headerfile <x86intrin.h>
472 ///
473 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
474 ///
475 /// \param __a
476 ///    A 128-bit integer vector containing the first source operand.
477 /// \param __b
478 ///    A 128-bit integer vector containing the second source operand.
479 /// \returns A 128-bit integer vector containing the sums of products of both
480 ///    operands: \n
481 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
482 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
483 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
484 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
485 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
486 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
487 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
488 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
489 static __inline__ __m128i __DEFAULT_FN_ATTRS
490 _mm_maddubs_epi16(__m128i __a, __m128i __b)
491 {
492     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
493 }
494 
495 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
496 ///    values contained in the first source operand and packed 8-bit signed
497 ///    integer values contained in the second source operand, adds pairs of
498 ///    contiguous products with signed saturation, and writes the 16-bit sums to
499 ///    the corresponding bits in the destination.
500 ///
501 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
502 ///    both operands are multiplied, and the sum of both results is written to
503 ///    bits [15:0] of the destination.
504 ///
505 /// \headerfile <x86intrin.h>
506 ///
507 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
508 ///
509 /// \param __a
510 ///    A 64-bit integer vector containing the first source operand.
511 /// \param __b
512 ///    A 64-bit integer vector containing the second source operand.
513 /// \returns A 64-bit integer vector containing the sums of products of both
514 ///    operands: \n
515 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
516 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
517 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
518 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
519 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
520 _mm_maddubs_pi16(__m64 __a, __m64 __b)
521 {
522     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
523 }
524 
525 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
526 ///    products to the 18 most significant bits by right-shifting, rounds the
527 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
528 ///
529 /// \headerfile <x86intrin.h>
530 ///
531 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
532 ///
533 /// \param __a
534 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
535 /// \param __b
536 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
537 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
538 ///    products of both operands.
539 static __inline__ __m128i __DEFAULT_FN_ATTRS
540 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
541 {
542     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
543 }
544 
545 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
546 ///    products to the 18 most significant bits by right-shifting, rounds the
547 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
548 ///
549 /// \headerfile <x86intrin.h>
550 ///
551 /// This intrinsic corresponds to the \c PMULHRSW instruction.
552 ///
553 /// \param __a
554 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
555 /// \param __b
556 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
557 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
558 ///    products of both operands.
559 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
560 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
561 {
562     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
563 }
564 
565 /// Copies the 8-bit integers from a 128-bit integer vector to the
566 ///    destination or clears 8-bit values in the destination, as specified by
567 ///    the second source operand.
568 ///
569 /// \headerfile <x86intrin.h>
570 ///
571 /// This intrinsic corresponds to the \c VPSHUFB instruction.
572 ///
573 /// \param __a
574 ///    A 128-bit integer vector containing the values to be copied.
575 /// \param __b
576 ///    A 128-bit integer vector containing control bytes corresponding to
577 ///    positions in the destination:
578 ///    Bit 7: \n
579 ///    1: Clear the corresponding byte in the destination. \n
580 ///    0: Copy the selected source byte to the corresponding byte in the
581 ///    destination. \n
582 ///    Bits [6:4] Reserved.  \n
583 ///    Bits [3:0] select the source byte to be copied.
584 /// \returns A 128-bit integer vector containing the copied or cleared values.
585 static __inline__ __m128i __DEFAULT_FN_ATTRS
586 _mm_shuffle_epi8(__m128i __a, __m128i __b)
587 {
588     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
589 }
590 
591 /// Copies the 8-bit integers from a 64-bit integer vector to the
592 ///    destination or clears 8-bit values in the destination, as specified by
593 ///    the second source operand.
594 ///
595 /// \headerfile <x86intrin.h>
596 ///
597 /// This intrinsic corresponds to the \c PSHUFB instruction.
598 ///
599 /// \param __a
600 ///    A 64-bit integer vector containing the values to be copied.
601 /// \param __b
602 ///    A 64-bit integer vector containing control bytes corresponding to
603 ///    positions in the destination:
604 ///    Bit 7: \n
605 ///    1: Clear the corresponding byte in the destination. \n
606 ///    0: Copy the selected source byte to the corresponding byte in the
607 ///    destination. \n
608 ///    Bits [3:0] select the source byte to be copied.
609 /// \returns A 64-bit integer vector containing the copied or cleared values.
610 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
611 _mm_shuffle_pi8(__m64 __a, __m64 __b)
612 {
613     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
614 }
615 
616 /// For each 8-bit integer in the first source operand, perform one of
617 ///    the following actions as specified by the second source operand.
618 ///
619 ///    If the byte in the second source is negative, calculate the two's
620 ///    complement of the corresponding byte in the first source, and write that
621 ///    value to the destination. If the byte in the second source is positive,
622 ///    copy the corresponding byte from the first source to the destination. If
623 ///    the byte in the second source is zero, clear the corresponding byte in
624 ///    the destination.
625 ///
626 /// \headerfile <x86intrin.h>
627 ///
628 /// This intrinsic corresponds to the \c VPSIGNB instruction.
629 ///
630 /// \param __a
631 ///    A 128-bit integer vector containing the values to be copied.
632 /// \param __b
633 ///    A 128-bit integer vector containing control bytes corresponding to
634 ///    positions in the destination.
635 /// \returns A 128-bit integer vector containing the resultant values.
636 static __inline__ __m128i __DEFAULT_FN_ATTRS
637 _mm_sign_epi8(__m128i __a, __m128i __b)
638 {
639     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
640 }
641 
642 /// For each 16-bit integer in the first source operand, perform one of
643 ///    the following actions as specified by the second source operand.
644 ///
645 ///    If the word in the second source is negative, calculate the two's
646 ///    complement of the corresponding word in the first source, and write that
647 ///    value to the destination. If the word in the second source is positive,
648 ///    copy the corresponding word from the first source to the destination. If
649 ///    the word in the second source is zero, clear the corresponding word in
650 ///    the destination.
651 ///
652 /// \headerfile <x86intrin.h>
653 ///
654 /// This intrinsic corresponds to the \c VPSIGNW instruction.
655 ///
656 /// \param __a
657 ///    A 128-bit integer vector containing the values to be copied.
658 /// \param __b
659 ///    A 128-bit integer vector containing control words corresponding to
660 ///    positions in the destination.
661 /// \returns A 128-bit integer vector containing the resultant values.
662 static __inline__ __m128i __DEFAULT_FN_ATTRS
663 _mm_sign_epi16(__m128i __a, __m128i __b)
664 {
665     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
666 }
667 
668 /// For each 32-bit integer in the first source operand, perform one of
669 ///    the following actions as specified by the second source operand.
670 ///
671 ///    If the doubleword in the second source is negative, calculate the two's
672 ///    complement of the corresponding word in the first source, and write that
673 ///    value to the destination. If the doubleword in the second source is
674 ///    positive, copy the corresponding word from the first source to the
675 ///    destination. If the doubleword in the second source is zero, clear the
676 ///    corresponding word in the destination.
677 ///
678 /// \headerfile <x86intrin.h>
679 ///
680 /// This intrinsic corresponds to the \c VPSIGND instruction.
681 ///
682 /// \param __a
683 ///    A 128-bit integer vector containing the values to be copied.
684 /// \param __b
685 ///    A 128-bit integer vector containing control doublewords corresponding to
686 ///    positions in the destination.
687 /// \returns A 128-bit integer vector containing the resultant values.
688 static __inline__ __m128i __DEFAULT_FN_ATTRS
689 _mm_sign_epi32(__m128i __a, __m128i __b)
690 {
691     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
692 }
693 
694 /// For each 8-bit integer in the first source operand, perform one of
695 ///    the following actions as specified by the second source operand.
696 ///
697 ///    If the byte in the second source is negative, calculate the two's
698 ///    complement of the corresponding byte in the first source, and write that
699 ///    value to the destination. If the byte in the second source is positive,
700 ///    copy the corresponding byte from the first source to the destination. If
701 ///    the byte in the second source is zero, clear the corresponding byte in
702 ///    the destination.
703 ///
704 /// \headerfile <x86intrin.h>
705 ///
706 /// This intrinsic corresponds to the \c PSIGNB instruction.
707 ///
708 /// \param __a
709 ///    A 64-bit integer vector containing the values to be copied.
710 /// \param __b
711 ///    A 64-bit integer vector containing control bytes corresponding to
712 ///    positions in the destination.
713 /// \returns A 64-bit integer vector containing the resultant values.
714 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
715 _mm_sign_pi8(__m64 __a, __m64 __b)
716 {
717     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
718 }
719 
720 /// For each 16-bit integer in the first source operand, perform one of
721 ///    the following actions as specified by the second source operand.
722 ///
723 ///    If the word in the second source is negative, calculate the two's
724 ///    complement of the corresponding word in the first source, and write that
725 ///    value to the destination. If the word in the second source is positive,
726 ///    copy the corresponding word from the first source to the destination. If
727 ///    the word in the second source is zero, clear the corresponding word in
728 ///    the destination.
729 ///
730 /// \headerfile <x86intrin.h>
731 ///
732 /// This intrinsic corresponds to the \c PSIGNW instruction.
733 ///
734 /// \param __a
735 ///    A 64-bit integer vector containing the values to be copied.
736 /// \param __b
737 ///    A 64-bit integer vector containing control words corresponding to
738 ///    positions in the destination.
739 /// \returns A 64-bit integer vector containing the resultant values.
740 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
741 _mm_sign_pi16(__m64 __a, __m64 __b)
742 {
743     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
744 }
745 
746 /// For each 32-bit integer in the first source operand, perform one of
747 ///    the following actions as specified by the second source operand.
748 ///
749 ///    If the doubleword in the second source is negative, calculate the two's
750 ///    complement of the corresponding doubleword in the first source, and
751 ///    write that value to the destination. If the doubleword in the second
752 ///    source is positive, copy the corresponding doubleword from the first
753 ///    source to the destination. If the doubleword in the second source is
754 ///    zero, clear the corresponding doubleword in the destination.
755 ///
756 /// \headerfile <x86intrin.h>
757 ///
758 /// This intrinsic corresponds to the \c PSIGND instruction.
759 ///
760 /// \param __a
761 ///    A 64-bit integer vector containing the values to be copied.
762 /// \param __b
763 ///    A 64-bit integer vector containing two control doublewords corresponding
764 ///    to positions in the destination.
765 /// \returns A 64-bit integer vector containing the resultant values.
766 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
767 _mm_sign_pi32(__m64 __a, __m64 __b)
768 {
769     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
770 }
771 
772 #undef __DEFAULT_FN_ATTRS
773 #undef __DEFAULT_FN_ATTRS_MMX
774 
775 #endif /* __TMMINTRIN_H */
776