1 /*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
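
/* Example (a sketch, not part of this header's API): x86 code such as

     #include <tmmintrin.h>
     __m128i byte_abs(__m128i __v) { return _mm_abs_epi8(__v); }

   builds unchanged for powerpc64le once NO_WARN_X86_INTRINSICS is
   defined (for instance via -DNO_WARN_X86_INTRINSICS).  byte_abs is a
   made-up illustration, not something this header provides.  */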
24 
25 #ifndef TMMINTRIN_H_
26 #define TMMINTRIN_H_
27 
28 #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
29 
30 #include <altivec.h>
31 
32 /* We need definitions from the SSE header files.  */
33 #include <pmmintrin.h>
34 
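/* Absolute value of each signed element (SSSE3 PABSW/PABSD/PABSB),
   implemented directly with vec_abs.  */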
35 extern __inline __m128i
36     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
37     _mm_abs_epi16(__m128i __A) {
38   return (__m128i)vec_abs((__v8hi)__A);
39 }
40 
41 extern __inline __m128i
42     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
43     _mm_abs_epi32(__m128i __A) {
44   return (__m128i)vec_abs((__v4si)__A);
45 }
46 
47 extern __inline __m128i
48     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
49     _mm_abs_epi8(__m128i __A) {
50   return (__m128i)vec_abs((__v16qi)__A);
51 }
52 
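/* __m64 variants of the absolute-value operations: the 64-bit operand is
   copied into both halves of a 128-bit vector, vec_abs is applied, and
   the low doubleword of the result is returned.  */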
53 extern __inline __m64
54     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
55     _mm_abs_pi16(__m64 __A) {
56   __v8hi __B = (__v8hi)(__v2du){__A, __A};
57   return (__m64)((__v2du)vec_abs(__B))[0];
58 }
59 
60 extern __inline __m64
61     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
62     _mm_abs_pi32(__m64 __A) {
63   __v4si __B = (__v4si)(__v2du){__A, __A};
64   return (__m64)((__v2du)vec_abs(__B))[0];
65 }
66 
67 extern __inline __m64
68     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
69     _mm_abs_pi8(__m64 __A) {
70   __v16qi __B = (__v16qi)(__v2du){__A, __A};
71   return (__m64)((__v2du)vec_abs(__B))[0];
72 }
73 
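/* _mm_alignr_epi8 (PALIGNR): treat __A:__B as a 32-byte value (__A in the
   upper 16 bytes), shift it right by __count bytes, and return the low 16
   bytes.  A compile-time __count below 16 maps to a single vec_sld; other
   counts fall back to octet shifts (vec_slo/vec_sro), and counts of 32 or
   more yield zero.  */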
74 extern __inline __m128i
75     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76     _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
77   if (__builtin_constant_p(__count) && __count < 16) {
78 #ifdef __LITTLE_ENDIAN__
79     __A = (__m128i)vec_reve((__v16qu)__A);
80     __B = (__m128i)vec_reve((__v16qu)__B);
81 #endif
82     __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
83 #ifdef __LITTLE_ENDIAN__
84     __A = (__m128i)vec_reve((__v16qu)__A);
85 #endif
86     return __A;
87   }
88 
89   if (__count == 0)
90     return __B;
91 
92   if (__count >= 16) {
93     if (__count >= 32) {
94       const __v16qu __zero = {0};
95       return (__m128i)__zero;
96     } else {
97       const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
98 #ifdef __LITTLE_ENDIAN__
99       return (__m128i)vec_sro((__v16qu)__A, __shift);
100 #else
101       return (__m128i)vec_slo((__v16qu)__A, __shift);
102 #endif
103     }
104   } else {
105     const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
106     const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
107 #ifdef __LITTLE_ENDIAN__
108     __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
109     __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
110 #else
111     __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
112     __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
113 #endif
114     return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
115   }
116 }
117 
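/* __m64 variant of PALIGNR: shift the 16-byte concatenation {__A,__B}
   (__A in the upper 8 bytes) right by __count bytes and return the low
   8 bytes; counts of 16 or more yield zero.  */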
118 extern __inline __m64
119     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120     _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
121   if (__count < 16) {
122     __v2du __C = {__B, __A};
123 #ifdef __LITTLE_ENDIAN__
124     const __v4su __shift = {__count << 3, 0, 0, 0};
125     __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
126 #else
127     const __v4su __shift = {0, 0, 0, __count << 3};
128     __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
129 #endif
130     return (__m64)__C[0];
131   } else {
132     const __m64 __zero = {0};
133     return __zero;
134   }
135 }
136 
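/* Horizontal add (PHADDW/PHADDD): each result element is the sum of one
   adjacent pair of input elements; the low half of the result holds the
   pairwise sums from __A and the high half those from __B.  The permute
   masks __P and __Q split the pairs into their first and second members
   so a single vec_add can form all of the sums.  */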
137 extern __inline __m128i
138     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139     _mm_hadd_epi16(__m128i __A, __m128i __B) {
140   const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
141                        16, 17, 20, 21, 24, 25, 28, 29};
142   const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
143                        18, 19, 22, 23, 26, 27, 30, 31};
144   __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
145   __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
146   return (__m128i)vec_add(__C, __D);
147 }
148 
149 extern __inline __m128i
150     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151     _mm_hadd_epi32(__m128i __A, __m128i __B) {
152   const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
153                        16, 17, 18, 19, 24, 25, 26, 27};
154   const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
155                        20, 21, 22, 23, 28, 29, 30, 31};
156   __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
157   __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
158   return (__m128i)vec_add(__C, __D);
159 }
160 
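/* __m64 horizontal adds: both operands are packed into one 128-bit
   vector and reduced with the same permute-and-add pattern; the
   doubleword containing the pairwise sums is returned.  */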
161 extern __inline __m64
162     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163     _mm_hadd_pi16(__m64 __A, __m64 __B) {
164   __v8hi __C = (__v8hi)(__v2du){__A, __B};
165   const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
166   const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
167   __v8hi __D = vec_perm(__C, __C, __Q);
168   __C = vec_perm(__C, __C, __P);
169   __C = vec_add(__C, __D);
170   return (__m64)((__v2du)__C)[1];
171 }
172 
173 extern __inline __m64
174     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175     _mm_hadd_pi32(__m64 __A, __m64 __B) {
176   __v4si __C = (__v4si)(__v2du){__A, __B};
177   const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
178   const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
179   __v4si __D = vec_perm(__C, __C, __Q);
180   __C = vec_perm(__C, __C, __P);
181   __C = vec_add(__C, __D);
182   return (__m64)((__v2du)__C)[1];
183 }
184 
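/* Saturating horizontal add (PHADDSW) and its __m64 counterpart:
   vec_sum4s accumulates adjacent halfword pairs into zeroed 32-bit
   words, and vec_packs narrows them back to 16 bits with signed
   saturation.  */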
185 extern __inline __m128i
186     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
187     _mm_hadds_epi16(__m128i __A, __m128i __B) {
188   __v4si __C = {0}, __D = {0};
189   __C = vec_sum4s((__v8hi)__A, __C);
190   __D = vec_sum4s((__v8hi)__B, __D);
191   __C = (__v4si)vec_packs(__C, __D);
192   return (__m128i)__C;
193 }
194 
195 extern __inline __m64
196     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
197     _mm_hadds_pi16(__m64 __A, __m64 __B) {
198   const __v4si __zero = {0};
199   __v8hi __C = (__v8hi)(__v2du){__A, __B};
200   __v4si __D = vec_sum4s(__C, __zero);
201   __C = vec_packs(__D, __D);
202   return (__m64)((__v2du)__C)[1];
203 }
204 
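/* Horizontal subtract (PHSUBW/PHSUBD) and the __m64 variants: for each
   adjacent pair, the second element is subtracted from the first, using
   the same permute masks as the horizontal adds above.  */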
205 extern __inline __m128i
206     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207     _mm_hsub_epi16(__m128i __A, __m128i __B) {
208   const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
209                        16, 17, 20, 21, 24, 25, 28, 29};
210   const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
211                        18, 19, 22, 23, 26, 27, 30, 31};
212   __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
213   __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
214   return (__m128i)vec_sub(__C, __D);
215 }
216 
217 extern __inline __m128i
218     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219     _mm_hsub_epi32(__m128i __A, __m128i __B) {
220   const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
221                        16, 17, 18, 19, 24, 25, 26, 27};
222   const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
223                        20, 21, 22, 23, 28, 29, 30, 31};
224   __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
225   __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
226   return (__m128i)vec_sub(__C, __D);
227 }
228 
229 extern __inline __m64
230     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
231     _mm_hsub_pi16(__m64 __A, __m64 __B) {
232   const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
233   const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
234   __v8hi __C = (__v8hi)(__v2du){__A, __B};
235   __v8hi __D = vec_perm(__C, __C, __Q);
236   __C = vec_perm(__C, __C, __P);
237   __C = vec_sub(__C, __D);
238   return (__m64)((__v2du)__C)[1];
239 }
240 
241 extern __inline __m64
242     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243     _mm_hsub_pi32(__m64 __A, __m64 __B) {
244   const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
245   const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
246   __v4si __C = (__v4si)(__v2du){__A, __B};
247   __v4si __D = vec_perm(__C, __C, __Q);
248   __C = vec_perm(__C, __C, __P);
249   __C = vec_sub(__C, __D);
250   return (__m64)((__v2du)__C)[1];
251 }
252 
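/* Saturating horizontal subtract (PHSUBSW) and its __m64 counterpart:
   identical to the above except that vec_subs makes each difference
   saturate to the signed 16-bit range.  */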
253 extern __inline __m128i
254     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
255     _mm_hsubs_epi16(__m128i __A, __m128i __B) {
256   const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
257                        16, 17, 20, 21, 24, 25, 28, 29};
258   const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
259                        18, 19, 22, 23, 26, 27, 30, 31};
260   __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
261   __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
262   return (__m128i)vec_subs(__C, __D);
263 }
264 
265 extern __inline __m64
266     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267     _mm_hsubs_pi16(__m64 __A, __m64 __B) {
268   const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
269   const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
270   __v8hi __C = (__v8hi)(__v2du){__A, __B};
271   __v8hi __D = vec_perm(__C, __C, __P);
272   __v8hi __E = vec_perm(__C, __C, __Q);
273   __C = vec_subs(__D, __E);
274   return (__m64)((__v2du)__C)[1];
275 }
276 
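/* _mm_shuffle_epi8 (PSHUFB) and its __m64 counterpart: each result byte
   is the byte of __A selected by the low bits of the corresponding byte
   of __B, or zero when that byte's sign bit is set.  The zeroing is done
   by selecting against a vec_cmplt mask after the vec_perm.  */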
277 extern __inline __m128i
278     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
279     _mm_shuffle_epi8(__m128i __A, __m128i __B) {
280   const __v16qi __zero = {0};
281   __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
282   __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
283   return (__m128i)vec_sel(__C, __zero, __select);
284 }
285 
286 extern __inline __m64
287     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288     _mm_shuffle_pi8(__m64 __A, __m64 __B) {
289   const __v16qi __zero = {0};
290   __v16qi __C = (__v16qi)(__v2du){__A, __A};
291   __v16qi __D = (__v16qi)(__v2du){__B, __B};
292   __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
293   __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
294   __C = vec_sel(__C, __zero, __select);
295   return (__m64)((__v2du)(__C))[0];
296 }
297 
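/* _mm_sign_{epi,pi}{8,16,32} (PSIGNB/PSIGNW/PSIGND): negate, keep, or
   zero each element of __A according to whether the corresponding
   element of __B is negative, positive, or zero.  The comparisons build
   a -1/0/+1 multiplier that is applied with vec_mul.  These are only
   available when compiling for POWER8 or later (_ARCH_PWR8).  */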
298 #ifdef _ARCH_PWR8
299 extern __inline __m128i
300     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
301     _mm_sign_epi8(__m128i __A, __m128i __B) {
302   const __v16qi __zero = {0};
303   __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
304   __v16qi __selectpos =
305       (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
306   __v16qi __conv = vec_add(__selectneg, __selectpos);
307   return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
308 }
309 #endif
310 
311 #ifdef _ARCH_PWR8
312 extern __inline __m128i
313     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
314     _mm_sign_epi16(__m128i __A, __m128i __B) {
315   const __v8hi __zero = {0};
316   __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
317   __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
318   __v8hi __conv = vec_add(__selectneg, __selectpos);
319   return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
320 }
321 #endif
322 
323 #ifdef _ARCH_PWR8
324 extern __inline __m128i
325     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326     _mm_sign_epi32(__m128i __A, __m128i __B) {
327   const __v4si __zero = {0};
328   __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
329   __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
330   __v4si __conv = vec_add(__selectneg, __selectpos);
331   return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
332 }
333 #endif
334 
335 #ifdef _ARCH_PWR8
336 extern __inline __m64
337     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338     _mm_sign_pi8(__m64 __A, __m64 __B) {
339   const __v16qi __zero = {0};
340   __v16qi __C = (__v16qi)(__v2du){__A, __A};
341   __v16qi __D = (__v16qi)(__v2du){__B, __B};
342   __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
343   return (__m64)((__v2du)(__C))[0];
344 }
345 #endif
346 
347 #ifdef _ARCH_PWR8
348 extern __inline __m64
349     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350     _mm_sign_pi16(__m64 __A, __m64 __B) {
351   const __v8hi __zero = {0};
352   __v8hi __C = (__v8hi)(__v2du){__A, __A};
353   __v8hi __D = (__v8hi)(__v2du){__B, __B};
354   __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
355   return (__m64)((__v2du)(__C))[0];
356 }
357 #endif
358 
359 #ifdef _ARCH_PWR8
360 extern __inline __m64
361     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362     _mm_sign_pi32(__m64 __A, __m64 __B) {
363   const __v4si __zero = {0};
364   __v4si __C = (__v4si)(__v2du){__A, __A};
365   __v4si __D = (__v4si)(__v2du){__B, __B};
366   __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
367   return (__m64)((__v2du)(__C))[0];
368 }
369 #endif
370 
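/* _mm_maddubs_epi16 (PMADDUBSW) and its __m64 counterpart: multiply the
   unsigned bytes of __A by the corresponding signed bytes of __B, then
   add adjacent 16-bit products with signed saturation.  The unpacks
   sign-extend, so __A's halves are masked with 0x00ff to recover the
   unsigned byte values.  */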
371 extern __inline __m128i
372     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373     _mm_maddubs_epi16(__m128i __A, __m128i __B) {
374   __v8hi __unsigned = vec_splats((signed short)0x00ff);
375   __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
376   __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
377   __v8hi __E = vec_unpackh((__v16qi)__B);
378   __v8hi __F = vec_unpackl((__v16qi)__B);
379   __C = vec_mul(__C, __E);
380   __D = vec_mul(__D, __F);
381   const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
382                           16, 17, 20, 21, 24, 25, 28, 29};
383   const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
384                            18, 19, 22, 23, 26, 27, 30, 31};
385   __E = vec_perm(__C, __D, __odds);
386   __F = vec_perm(__C, __D, __evens);
387   return (__m128i)vec_adds(__E, __F);
388 }
389 
390 extern __inline __m64
391     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392     _mm_maddubs_pi16(__m64 __A, __m64 __B) {
393   __v8hi __C = (__v8hi)(__v2du){__A, __A};
394   __C = vec_unpackl((__v16qi)__C);
395   const __v8hi __unsigned = vec_splats((signed short)0x00ff);
396   __C = vec_and(__C, __unsigned);
397   __v8hi __D = (__v8hi)(__v2du){__B, __B};
398   __D = vec_unpackl((__v16qi)__D);
399   __D = vec_mul(__C, __D);
400   const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
401                           16, 17, 20, 21, 24, 25, 28, 29};
402   const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
403                            18, 19, 22, 23, 26, 27, 30, 31};
404   __C = vec_perm(__D, __D, __odds);
405   __D = vec_perm(__D, __D, __evens);
406   __C = vec_adds(__C, __D);
407   return (__m64)((__v2du)(__C))[0];
408 }
409 
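/* _mm_mulhrs_epi16 (PMULHRSW) and its __m64 counterpart: form the 32-bit
   products of the 16-bit elements, shift right by 14, add one, shift
   right by one more, and keep the low 16 bits; each result is the signed
   product scaled down by 2^15 with rounding.  */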
410 extern __inline __m128i
411     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412     _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
413   __v4si __C = vec_unpackh((__v8hi)__A);
414   __v4si __D = vec_unpackh((__v8hi)__B);
415   __C = vec_mul(__C, __D);
416   __D = vec_unpackl((__v8hi)__A);
417   __v4si __E = vec_unpackl((__v8hi)__B);
418   __D = vec_mul(__D, __E);
419   const __v4su __shift = vec_splats((unsigned int)14);
420   __C = vec_sr(__C, __shift);
421   __D = vec_sr(__D, __shift);
422   const __v4si __ones = vec_splats((signed int)1);
423   __C = vec_add(__C, __ones);
424   __C = vec_sr(__C, (__v4su)__ones);
425   __D = vec_add(__D, __ones);
426   __D = vec_sr(__D, (__v4su)__ones);
427   return (__m128i)vec_pack(__C, __D);
428 }
429 
430 extern __inline __m64
431     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
432     _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
433   __v4si __C = (__v4si)(__v2du){__A, __A};
434   __C = vec_unpackh((__v8hi)__C);
435   __v4si __D = (__v4si)(__v2du){__B, __B};
436   __D = vec_unpackh((__v8hi)__D);
437   __C = vec_mul(__C, __D);
438   const __v4su __shift = vec_splats((unsigned int)14);
439   __C = vec_sr(__C, __shift);
440   const __v4si __ones = vec_splats((signed int)1);
441   __C = vec_add(__C, __ones);
442   __C = vec_sr(__C, (__v4su)__ones);
443   __v8hi __E = vec_pack(__C, __D);
444   return (__m64)((__v2du)(__E))[0];
445 }
446 
447 #else
448 #include_next <tmmintrin.h>
449 #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))   \
450         */
451 
452 #endif /* TMMINTRIN_H_ */
453