/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
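
/* The composite macros above expand at their point of use, so they pick up
   the exception-control bits defined here; e.g. _MM_FROUND_FLOOR evaluates
   to (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) == 0x03 and
   _MM_FROUND_NEARBYINT to (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
   == 0x0c, so _mm_round_pd(__A, _MM_FROUND_FLOOR) below is equivalent to
   _mm_floor_pd(__A).  */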

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
  case _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
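
/* Illustrative use of _mm_round_pd, assuming the _mm_set_pd helper from the
   companion emmintrin.h wrapper (pulled in through tmmintrin.h) is
   available:

     __m128d __v = _mm_set_pd(2.5, -1.5);
     __m128d __r = _mm_round_pd(__v, _MM_FROUND_TO_NEAREST_INT);

   Under round-to-nearest-even, element 0 becomes -2.0 and element 1
   becomes 2.0, since both halfway cases round to the nearest even
   integer.  */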

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
  case _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
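
/* As a quick example of the convenience macros above: applied to the four
   floats {1.9f, -1.1f, 2.0f, -2.5f}, _mm_floor_ps yields
   {1.0f, -2.0f, 2.0f, -3.0f} and _mm_ceil_ps yields
   {2.0f, -1.0f, 2.0f, -2.0f}.  */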

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
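
/* Sketch of an insert/extract round trip with the functions above, where
   __X is any __m128i value:

     __m128i __t = _mm_insert_epi32(__X, 42, 1);
     int __v = _mm_extract_epi32(__t, 1);

   leaves __v == 42, with the other three 32-bit elements of __t copied
   unchanged from __X.  Note that _mm_extract_ps returns the raw bit
   pattern of the selected float as an int.  */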

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qu __charmask = vec_splats((unsigned char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
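
/* Selection semantics of the two blends above, for reference: in
   _mm_blend_epi16(__A, __B, 0x81), bits 0 and 7 of the immediate are set,
   so 16-bit elements 0 and 7 of the result come from __B and the rest
   from __A.  In _mm_blendv_epi8, a result byte comes from __B exactly
   when the most significant bit of the corresponding mask byte is set.  */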

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
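
/* Usage sketch for the PTEST-style helpers above: _mm_testz_si128(__A, __B)
   returns nonzero when (__A & __B) is all zeros, _mm_testc_si128(__A, __B)
   returns nonzero when (~__A & __B) is all zeros, and _mm_testnzc_si128
   returns nonzero only when neither of those holds.  Consequently
   _mm_test_all_ones(__V) is nonzero exactly when every bit of __V is
   set.  */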

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
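
/* To contrast the two multiplies above: _mm_mullo_epi32 keeps the low
   32 bits of each of the four 32-bit products, while _mm_mul_epi32 forms
   full 64-bit signed products of 32-bit elements 0 and 2 only.  For
   example (assuming _mm_set_epi32 from the emmintrin.h wrapper), the call

     _mm_mul_epi32(_mm_set_epi32(0, 3, 0, -2), _mm_set_epi32(0, 4, 0, 5))

   yields the two 64-bit elements {-10, 12}.  */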

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}
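
/* A small example of the widening conversions above: for a vector whose
   bytes are all 0xff, _mm_cvtepi8_epi16 sign-extends the low eight bytes
   to eight 16-bit values of -1, whereas _mm_cvtepu8_epi16 zero-extends
   them to eight 16-bit values of 255.  The wider variants behave the same
   way on correspondingly fewer elements.  */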

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
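
/* For example, if the eight unsigned shorts in __A are
   {7, 3, 9, 3, 65535, 6, 4, 4}, the minimum value 3 first occurs at index
   1, so the result holds 3 in bits [15:0], 1 in bits [18:16], and zeros in
   the remaining bits.  */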

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}
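
/* Packing example: each signed 32-bit input is clamped to the unsigned
   16-bit range before narrowing, so source elements {-5, 70000, 12, 65535}
   from __X become the 16-bit result elements {0, 65535, 12, 65535}, and
   the elements of __Y fill the upper half of the result the same way.  */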

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */