/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
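/* A minimal usage sketch (illustrative only, not part of this header):
   existing SSSE3 code such as the function below is expected to build
   unchanged for powerpc64le once -DNO_WARN_X86_INTRINSICS is defined.
   The helper name abs16 is hypothetical.

     #include <tmmintrin.h>

     __m128i abs16 (__m128i v)
     {
       return _mm_abs_epi16 (v);   // implemented via vec_abs below
     }

   Whether the emulated forms are fast enough is workload-dependent; as
   noted above, hot paths may be better served by GNU C vector
   extensions.  */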

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

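/* The __m64 variants below follow a common pattern: the 64-bit operand
   is widened into a 128-bit vector (usually by splatting it into both
   doublewords), the corresponding 128-bit operation is applied, and the
   relevant doubleword of the result is returned.  */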
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

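/* PALIGNR: concatenate __A:__B and extract 16 bytes starting at byte
   offset __count.  A compile-time-constant __count below 16 maps to a
   single vec_sld, with vec_reve compensating for little-endian element
   order.  Otherwise the result is built from whole-vector octet shifts
   (vec_slo/vec_sro): counts below 16 combine shifted pieces of __A and
   __B with vec_or, counts of 16..31 reduce to a shift of __A alone, and
   counts of 32 or more yield zero.  */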
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  const __v16qu __zero = { 0 };
	  return (__m128i) __zero;
	}
      else
	{
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

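/* Horizontal add: permute the even-numbered elements of the
   concatenated inputs into one vector and the odd-numbered elements
   into another, then add the two element-wise.  _mm_hadd_epi32 and the
   __m64 variants below use the same scheme.  */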
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

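/* Saturating horizontal add: vec_sum4s adds each pair of adjacent
   halfwords into a full word, so the intermediate sums cannot overflow,
   and vec_packs packs them back to halfwords with signed saturation.  */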
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

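/* Horizontal subtract: the same even/odd permute scheme as
   _mm_hadd_epi16, subtracting the odd-numbered elements from the
   even-numbered ones; the _mm_hsubs_* variants saturate via vec_subs.  */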
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

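/* PSHUFB: with both permute inputs equal, vec_perm effectively indexes
   with the low four bits of each selector byte, matching PSHUFB.
   PSHUFB additionally zeroes every result byte whose selector has its
   high bit set; the compare/select pair emulates that.  */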
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

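/* PSIGN: build a multiplier that is -1 where __B is negative, +1 where
   __B is positive (vec_neg of the all-ones compare result) and 0 where
   __B is zero, then multiply __A by it.  The wider and __m64 variants
   below follow the same scheme.  */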
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

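/* PMADDUBSW: unpack both inputs from bytes to halfwords.  __A's
   halfwords are masked with 0x00ff so its bytes are treated as
   unsigned, while __B stays sign-extended.  Multiply, then add adjacent
   products with signed saturation via the even/odd permutes.  */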
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

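/* PMULHRSW: widen to 32 bits, multiply, then apply the rounding
   high-half extraction ((product >> 14) + 1) >> 1 before packing back
   to halfwords.  Only the low 16 bits survive the final pack, so the
   logical shifts are sufficient.  */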
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* TMMINTRIN_H_ */