/* Copyright (C) 2003-2021 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif
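
/* Illustrative sketch only (not part of the original header): the note
   above suggests that intrinsic-based code can often be expressed
   directly with GNU C vector extensions instead.  For example, a
   hypothetical helper that adds two vectors of 16 signed bytes could be
   written as:

     typedef signed char v16qi __attribute__ ((vector_size (16)));

     static inline v16qi
     add_bytes (v16qi a, v16qi b)
     {
       return a + b;   // element-wise add, portable across targets
     }

   rather than calling _mm_add_epi8, letting the compiler choose the best
   vector instructions for the target.  */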

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

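/* _mm_alignr_epi8 (SSSE3 PALIGNR): concatenate __A:__B as a 32-byte value
   and return the 16 bytes starting __count bytes up from the least
   significant end.  A count of 0 returns __B, counts of 16..31 shift
   zero bytes in above __A, and counts of 32 or more return all zeros.  */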
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  const __v16qu zero = { 0 };
	  return (__m128i) zero;
	}
      else
	{
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

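/* _mm_alignr_pi8 (64-bit PALIGNR): concatenate __A:__B as a 16-byte value
   and return the 8 bytes starting __count bytes up from the least
   significant end; counts of 16 or more return zero.  */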
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

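/* _mm_hadd_epi16 (SSSE3 PHADDW): horizontally add adjacent pairs of
   16-bit elements; the pair sums from __A fill the low half of the
   result and the pair sums from __B fill the high half.  The epi32,
   pi16 and pi32 variants below follow the same pattern for their
   element sizes.  */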
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

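/* _mm_hadds_epi16 (SSSE3 PHADDSW): like _mm_hadd_epi16, but each pairwise
   sum saturates to the signed 16-bit range.  The hsub/hsubs variants
   further below compute pairwise differences (even-indexed element minus
   the following odd-indexed element) instead of sums.  */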
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

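/* _mm_shuffle_epi8 (SSSE3 PSHUFB): for each byte of __B, select the byte
   of __A indexed by its low four bits, or produce zero if that byte of
   __B has its most significant bit set.  */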
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

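/* _mm_sign_epi8 (SSSE3 PSIGNB): negate each element of __A where the
   corresponding element of __B is negative, pass it through where __B is
   positive, and zero it where __B is zero.  The epi16/epi32 and __m64
   variants below apply the same rule to their element sizes.  */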
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

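/* _mm_maddubs_epi16 (SSSE3 PMADDUBSW): multiply each unsigned byte of __A
   by the corresponding signed byte of __B, then add adjacent pairs of the
   16-bit products with signed saturation.  */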
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

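/* _mm_mulhrs_epi16 (SSSE3 PMULHRSW): multiply the 16-bit elements of __A
   and __B, shift each 32-bit product right by 14, round by adding 1 and
   shifting right once more, and return bits 15:0 of each result.  */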
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif