1 /* Copyright (C) 2011
2    Free Software Foundation, Inc.
3 
4    This file is part of GCC.
5 
6    GCC is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    GCC is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    Under Section 7 of GPL version 3, you are granted additional
17    permissions described in the GCC Runtime Library Exception, version
18    3.1, as published by the Free Software Foundation.
19 
20    You should have received a copy of the GNU General Public License and
21    a copy of the GCC Runtime Library Exception along with this program;
22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23    <http://www.gnu.org/licenses/>.  */
24 
25 #ifndef _IMMINTRIN_H_INCLUDED
26 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
27 #endif
28 
29 /* Sum absolute 8-bit integer difference of adjacent groups of 4
30    byte integers in the first 2 operands.  Starting offsets within
31    operands are determined by the 3rd mask operand.  */
32 #ifdef __OPTIMIZE__
33 extern __inline __m256i
34 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
35 _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
36 {
37   return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
38 					      (__v32qi)__Y, __M);
39 }
40 #else
41 #define _mm256_mpsadbw_epu8(X, Y, M)					\
42   ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
43 					(__v32qi)(__m256i)(Y), (int)(M)))
44 #endif
45 
46 extern __inline __m256i
47 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
48 _mm256_abs_epi8 (__m256i __A)
49 {
50   return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
51 }
52 
53 extern __inline __m256i
54 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
55 _mm256_abs_epi16 (__m256i __A)
56 {
57   return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
58 }
59 
60 extern __inline __m256i
61 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
62 _mm256_abs_epi32 (__m256i __A)
63 {
64   return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
65 }
66 
67 extern __inline __m256i
68 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
69 _mm256_packs_epi32 (__m256i __A, __m256i __B)
70 {
71   return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
72 }
73 
74 extern __inline __m256i
75 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
76 _mm256_packs_epi16 (__m256i __A, __m256i __B)
77 {
78   return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
79 }
80 
81 extern __inline __m256i
82 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
83 _mm256_packus_epi32 (__m256i __A, __m256i __B)
84 {
85   return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
86 }
87 
88 extern __inline __m256i
89 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
90 _mm256_packus_epi16 (__m256i __A, __m256i __B)
91 {
92   return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
93 }
94 
95 extern __inline __m256i
96 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
97 _mm256_add_epi8 (__m256i __A, __m256i __B)
98 {
99   return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
100 }
101 
102 extern __inline __m256i
103 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
104 _mm256_add_epi16 (__m256i __A, __m256i __B)
105 {
106   return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
107 }
108 
109 extern __inline __m256i
110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
111 _mm256_add_epi32 (__m256i __A, __m256i __B)
112 {
113   return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
114 }
115 
116 extern __inline __m256i
117 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
118 _mm256_add_epi64 (__m256i __A, __m256i __B)
119 {
120   return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
121 }
122 
123 extern __inline __m256i
124 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
125 _mm256_adds_epi8 (__m256i __A, __m256i __B)
126 {
127   return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
128 }
129 
130 extern __inline __m256i
131 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
132 _mm256_adds_epi16 (__m256i __A, __m256i __B)
133 {
134   return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
135 }
136 
137 extern __inline __m256i
138 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
139 _mm256_adds_epu8 (__m256i __A, __m256i __B)
140 {
141   return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
142 }
143 
144 extern __inline __m256i
145 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
146 _mm256_adds_epu16 (__m256i __A, __m256i __B)
147 {
148   return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
149 }
150 
151 #ifdef __OPTIMIZE__
152 extern __inline __m256i
153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
154 _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
155 {
156   return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
157 					      (__v4di)__B,
158 					      __N * 8);
159 }
160 #else
161 /* In that case (__N*8) will be in vreg, and insn will not be matched. */
162 /* Use define instead */
163 #define _mm256_alignr_epi8(A, B, N)				   \
164   ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
165 					(__v4di)(__m256i)(B),	   \
166 					(int)(N) * 8))
167 #endif
168 
169 extern __inline __m256i
170 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
171 _mm256_and_si256 (__m256i __A, __m256i __B)
172 {
173   return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
174 }
175 
176 extern __inline __m256i
177 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
178 _mm256_andnot_si256 (__m256i __A, __m256i __B)
179 {
180   return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
181 }
182 
183 extern __inline __m256i
184 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
185 _mm256_avg_epu8 (__m256i __A, __m256i __B)
186 {
187   return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
188 }
189 
190 extern __inline __m256i
191 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
192 _mm256_avg_epu16 (__m256i __A, __m256i __B)
193 {
194   return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
195 }
196 
197 extern __inline __m256i
198 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
199 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
200 {
201   return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
202 					       (__v32qi)__Y,
203 					       (__v32qi)__M);
204 }
205 
206 #ifdef __OPTIMIZE__
207 extern __inline __m256i
208 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
209 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
210 {
211   return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
212 					      (__v16hi)__Y,
213 					       __M);
214 }
215 #else
216 #define _mm256_blend_epi16(X, Y, M)					\
217   ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
218 					(__v16hi)(__m256i)(Y), (int)(M)))
219 #endif
220 
221 extern __inline __m256i
222 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
223 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
224 {
225   return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
226 }
227 
228 extern __inline __m256i
229 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
230 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
231 {
232   return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
233 }
234 
235 extern __inline __m256i
236 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
237 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
238 {
239   return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
240 }
241 
242 extern __inline __m256i
243 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
244 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
245 {
246   return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
247 }
248 
249 extern __inline __m256i
250 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
251 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
252 {
253   return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
254 					     (__v32qi)__B);
255 }
256 
257 extern __inline __m256i
258 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
259 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
260 {
261   return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
262 					     (__v16hi)__B);
263 }
264 
265 extern __inline __m256i
266 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
267 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
268 {
269   return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
270 					     (__v8si)__B);
271 }
272 
273 extern __inline __m256i
274 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
275 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
276 {
277   return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
278 }
279 
280 extern __inline __m256i
281 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
282 _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
283 {
284   return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
285 					     (__v16hi)__Y);
286 }
287 
288 extern __inline __m256i
289 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
290 _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
291 {
292   return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
293 }
294 
295 extern __inline __m256i
296 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
297 _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
298 {
299   return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
300 					      (__v16hi)__Y);
301 }
302 
303 extern __inline __m256i
304 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
305 _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
306 {
307   return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
308 					     (__v16hi)__Y);
309 }
310 
311 extern __inline __m256i
312 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
313 _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
314 {
315   return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
316 }
317 
318 extern __inline __m256i
319 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
320 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
321 {
322   return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
323 					      (__v16hi)__Y);
324 }
325 
326 extern __inline __m256i
327 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
328 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
329 {
330   return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
331 						(__v32qi)__Y);
332 }
333 
334 extern __inline __m256i
335 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
336 _mm256_madd_epi16 (__m256i __A, __m256i __B)
337 {
338   return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
339 					     (__v16hi)__B);
340 }
341 
342 extern __inline __m256i
343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
344 _mm256_max_epi8 (__m256i __A, __m256i __B)
345 {
346   return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
347 }
348 
349 extern __inline __m256i
350 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
351 _mm256_max_epi16 (__m256i __A, __m256i __B)
352 {
353   return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
354 }
355 
356 extern __inline __m256i
357 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
358 _mm256_max_epi32 (__m256i __A, __m256i __B)
359 {
360   return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
361 }
362 
363 extern __inline __m256i
364 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
365 _mm256_max_epu8 (__m256i __A, __m256i __B)
366 {
367   return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
368 }
369 
370 extern __inline __m256i
371 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
372 _mm256_max_epu16 (__m256i __A, __m256i __B)
373 {
374   return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
375 }
376 
377 extern __inline __m256i
378 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
379 _mm256_max_epu32 (__m256i __A, __m256i __B)
380 {
381   return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
382 }
383 
384 extern __inline __m256i
385 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
386 _mm256_min_epi8 (__m256i __A, __m256i __B)
387 {
388   return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
389 }
390 
391 extern __inline __m256i
392 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
393 _mm256_min_epi16 (__m256i __A, __m256i __B)
394 {
395   return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
396 }
397 
398 extern __inline __m256i
399 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
400 _mm256_min_epi32 (__m256i __A, __m256i __B)
401 {
402   return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
403 }
404 
405 extern __inline __m256i
406 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
407 _mm256_min_epu8 (__m256i __A, __m256i __B)
408 {
409   return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
410 }
411 
412 extern __inline __m256i
413 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
414 _mm256_min_epu16 (__m256i __A, __m256i __B)
415 {
416   return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
417 }
418 
419 extern __inline __m256i
420 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
421 _mm256_min_epu32 (__m256i __A, __m256i __B)
422 {
423   return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
424 }
425 
426 extern __inline int
427 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
428 _mm256_movemask_epi8 (__m256i __A)
429 {
430   return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
431 }
432 
433 extern __inline __m256i
434 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
435 _mm256_cvtepi8_epi16 (__m128i __X)
436 {
437   return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
438 }
439 
440 extern __inline __m256i
441 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
442 _mm256_cvtepi8_epi32 (__m128i __X)
443 {
444   return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
445 }
446 
447 extern __inline __m256i
448 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
449 _mm256_cvtepi8_epi64 (__m128i __X)
450 {
451   return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
452 }
453 
454 extern __inline __m256i
455 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
456 _mm256_cvtepi16_epi32 (__m128i __X)
457 {
458   return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
459 }
460 
461 extern __inline __m256i
462 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
463 _mm256_cvtepi16_epi64 (__m128i __X)
464 {
465   return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
466 }
467 
468 extern __inline __m256i
469 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
470 _mm256_cvtepi32_epi64 (__m128i __X)
471 {
472   return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
473 }
474 
475 extern __inline __m256i
476 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
477 _mm256_cvtepu8_epi16 (__m128i __X)
478 {
479   return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
480 }
481 
482 extern __inline __m256i
483 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
484 _mm256_cvtepu8_epi32 (__m128i __X)
485 {
486   return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
487 }
488 
489 extern __inline __m256i
490 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
491 _mm256_cvtepu8_epi64 (__m128i __X)
492 {
493   return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
494 }
495 
496 extern __inline __m256i
497 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
498 _mm256_cvtepu16_epi32 (__m128i __X)
499 {
500   return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
501 }
502 
503 extern __inline __m256i
504 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
505 _mm256_cvtepu16_epi64 (__m128i __X)
506 {
507   return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
508 }
509 
510 extern __inline __m256i
511 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
512 _mm256_cvtepu32_epi64 (__m128i __X)
513 {
514   return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
515 }
516 
517 extern __inline __m256i
518 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
519 _mm256_mul_epi32 (__m256i __X, __m256i __Y)
520 {
521   return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
522 }
523 
524 extern __inline __m256i
525 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
526 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
527 {
528   return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
529 					       (__v16hi)__Y);
530 }
531 
532 extern __inline __m256i
533 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
534 _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
535 {
536   return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
537 }
538 
539 extern __inline __m256i
540 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
541 _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
542 {
543   return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
544 }
545 
546 extern __inline __m256i
547 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
548 _mm256_mullo_epi16 (__m256i __A, __m256i __B)
549 {
550   return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
551 }
552 
553 extern __inline __m256i
554 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
555 _mm256_mullo_epi32 (__m256i __A, __m256i __B)
556 {
557   return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
558 }
559 
560 extern __inline __m256i
561 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
562 _mm256_mul_epu32 (__m256i __A, __m256i __B)
563 {
564   return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
565 }
566 
567 extern __inline __m256i
568 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
569 _mm256_or_si256 (__m256i __A, __m256i __B)
570 {
571   return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
572 }
573 
574 extern __inline __m256i
575 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
576 _mm256_sad_epu8 (__m256i __A, __m256i __B)
577 {
578   return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
579 }
580 
581 extern __inline __m256i
582 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
583 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
584 {
585   return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
586 					     (__v32qi)__Y);
587 }
588 
589 #ifdef __OPTIMIZE__
590 extern __inline __m256i
591 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
592 _mm256_shuffle_epi32 (__m256i __A, const int __mask)
593 {
594   return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
595 }
596 
597 extern __inline __m256i
598 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
599 _mm256_shufflehi_epi16 (__m256i __A, const int __mask)
600 {
601   return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
602 }
603 
604 extern __inline __m256i
605 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
606 _mm256_shufflelo_epi16 (__m256i __A, const int __mask)
607 {
608   return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
609 }
610 #else
611 #define _mm256_shuffle_epi32(A, N) \
612   ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
613 #define _mm256_shufflehi_epi16(A, N) \
614   ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
615 #define _mm256_shufflelo_epi16(A, N) \
616   ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
617 #endif
618 
619 extern __inline __m256i
620 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
621 _mm256_sign_epi8 (__m256i __X, __m256i __Y)
622 {
623   return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
624 }
625 
626 extern __inline __m256i
627 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
628 _mm256_sign_epi16 (__m256i __X, __m256i __Y)
629 {
630   return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
631 }
632 
633 extern __inline __m256i
634 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
635 _mm256_sign_epi32 (__m256i __X, __m256i __Y)
636 {
637   return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
638 }
639 
640 #ifdef __OPTIMIZE__
641 extern __inline __m256i
642 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
643 _mm256_slli_si256 (__m256i __A, const int __N)
644 {
645   return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
646 }
647 #else
648 #define _mm256_slli_si256(A, N) \
649   ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
650 #endif
651 
652 extern __inline __m256i
653 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
654 _mm256_slli_epi16 (__m256i __A, int __B)
655 {
656   return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
657 }
658 
659 extern __inline __m256i
660 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
661 _mm256_sll_epi16 (__m256i __A, __m128i __B)
662 {
663   return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
664 }
665 
666 extern __inline __m256i
667 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
668 _mm256_slli_epi32 (__m256i __A, int __B)
669 {
670   return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
671 }
672 
673 extern __inline __m256i
674 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
675 _mm256_sll_epi32 (__m256i __A, __m128i __B)
676 {
677   return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
678 }
679 
680 extern __inline __m256i
681 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
682 _mm256_slli_epi64 (__m256i __A, int __B)
683 {
684   return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
685 }
686 
687 extern __inline __m256i
688 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
689 _mm256_sll_epi64 (__m256i __A, __m128i __B)
690 {
691   return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
692 }
693 
694 extern __inline __m256i
695 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
696 _mm256_srai_epi16 (__m256i __A, int __B)
697 {
698   return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
699 }
700 
701 extern __inline __m256i
702 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
703 _mm256_sra_epi16 (__m256i __A, __m128i __B)
704 {
705   return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
706 }
707 
708 extern __inline __m256i
709 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
710 _mm256_srai_epi32 (__m256i __A, int __B)
711 {
712   return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
713 }
714 
715 extern __inline __m256i
716 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
717 _mm256_sra_epi32 (__m256i __A, __m128i __B)
718 {
719   return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
720 }
721 
722 #ifdef __OPTIMIZE__
723 extern __inline __m256i
724 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
725 _mm256_srli_si256 (__m256i __A, const int __N)
726 {
727   return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
728 }
729 #else
730 #define _mm256_srli_si256(A, N) \
731   ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
732 #endif
733 
734 extern __inline __m256i
735 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
736 _mm256_srli_epi16 (__m256i __A, int __B)
737 {
738   return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
739 }
740 
741 extern __inline __m256i
742 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
743 _mm256_srl_epi16 (__m256i __A, __m128i __B)
744 {
745   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
746 }
747 
748 extern __inline __m256i
749 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
750 _mm256_srli_epi32 (__m256i __A, int __B)
751 {
752   return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
753 }
754 
755 extern __inline __m256i
756 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
757 _mm256_srl_epi32 (__m256i __A, __m128i __B)
758 {
759   return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
760 }
761 
762 extern __inline __m256i
763 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
764 _mm256_srli_epi64 (__m256i __A, int __B)
765 {
766   return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
767 }
768 
769 extern __inline __m256i
770 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
771 _mm256_srl_epi64 (__m256i __A, __m128i __B)
772 {
773   return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
774 }
775 
776 extern __inline __m256i
777 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
778 _mm256_sub_epi8 (__m256i __A, __m256i __B)
779 {
780   return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
781 }
782 
783 extern __inline __m256i
784 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
785 _mm256_sub_epi16 (__m256i __A, __m256i __B)
786 {
787   return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
788 }
789 
790 extern __inline __m256i
791 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
792 _mm256_sub_epi32 (__m256i __A, __m256i __B)
793 {
794   return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
795 }
796 
797 extern __inline __m256i
798 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
799 _mm256_sub_epi64 (__m256i __A, __m256i __B)
800 {
801   return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
802 }
803 
804 extern __inline __m256i
805 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
806 _mm256_subs_epi8 (__m256i __A, __m256i __B)
807 {
808   return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
809 }
810 
811 extern __inline __m256i
812 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
813 _mm256_subs_epi16 (__m256i __A, __m256i __B)
814 {
815   return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
816 }
817 
818 extern __inline __m256i
819 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
820 _mm256_subs_epu8 (__m256i __A, __m256i __B)
821 {
822   return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
823 }
824 
825 extern __inline __m256i
826 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
827 _mm256_subs_epu16 (__m256i __A, __m256i __B)
828 {
829   return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
830 }
831 
832 extern __inline __m256i
833 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
834 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
835 {
836   return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
837 }
838 
839 extern __inline __m256i
840 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
841 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
842 {
843   return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
844 }
845 
846 extern __inline __m256i
847 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
848 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
849 {
850   return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
851 }
852 
853 extern __inline __m256i
854 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
855 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
856 {
857   return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
858 }
859 
860 extern __inline __m256i
861 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
862 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
863 {
864   return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
865 }
866 
867 extern __inline __m256i
868 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
869 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
870 {
871   return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
872 }
873 
874 extern __inline __m256i
875 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
876 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
877 {
878   return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
879 }
880 
881 extern __inline __m256i
882 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
883 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
884 {
885   return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
886 }
887 
888 extern __inline __m256i
889 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
890 _mm256_xor_si256 (__m256i __A, __m256i __B)
891 {
892   return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
893 }
894 
895 extern __inline __m256i
896 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
897 _mm256_stream_load_si256 (__m256i const *__X)
898 {
899   return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
900 }
901 
902 extern __inline __m128
903 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
904 _mm_broadcastss_ps (__m128 __X)
905 {
906   return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
907 }
908 
909 extern __inline __m256
910 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
911 _mm256_broadcastss_ps (__m128 __X)
912 {
913   return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
914 }
915 
916 extern __inline __m256d
917 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
918 _mm256_broadcastsd_pd (__m128d __X)
919 {
920   return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
921 }
922 
923 extern __inline __m256i
924 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
925 _mm_broadcastsi128_si256 (__m128i __X)
926 {
927   return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
928 }
929 
930 #ifdef __OPTIMIZE__
931 extern __inline __m128i
932 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
933 _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
934 {
935   return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
936 					      (__v4si)__Y,
937 					      __M);
938 }
939 #else
940 #define _mm_blend_epi32(X, Y, M)					\
941   ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
942 					(__v4si)(__m128i)(Y), (int)(M)))
943 #endif
944 
945 #ifdef __OPTIMIZE__
946 extern __inline __m256i
947 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
948 _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
949 {
950   return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
951 					      (__v8si)__Y,
952 					      __M);
953 }
954 #else
955 #define _mm256_blend_epi32(X, Y, M)					\
956   ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
957 					(__v8si)(__m256i)(Y), (int)(M)))
958 #endif
959 
960 extern __inline __m256i
961 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
962 _mm256_broadcastb_epi8 (__m128i __X)
963 {
964   return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
965 }
966 
967 extern __inline __m256i
968 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
969 _mm256_broadcastw_epi16 (__m128i __X)
970 {
971   return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
972 }
973 
974 extern __inline __m256i
975 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
976 _mm256_broadcastd_epi32 (__m128i __X)
977 {
978   return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
979 }
980 
981 extern __inline __m256i
982 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
983 _mm256_broadcastq_epi64 (__m128i __X)
984 {
985   return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
986 }
987 
988 extern __inline __m128i
989 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
990 _mm_broadcastb_epi8 (__m128i __X)
991 {
992   return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
993 }
994 
995 extern __inline __m128i
996 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
997 _mm_broadcastw_epi16 (__m128i __X)
998 {
999   return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
1000 }
1001 
1002 extern __inline __m128i
1003 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1004 _mm_broadcastd_epi32 (__m128i __X)
1005 {
1006   return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
1007 }
1008 
1009 extern __inline __m128i
1010 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1011 _mm_broadcastq_epi64 (__m128i __X)
1012 {
1013   return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
1014 }
1015 
1016 extern __inline __m256i
1017 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1018 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
1019 {
1020   return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
1021 }
1022 
1023 #ifdef __OPTIMIZE__
1024 extern __inline __m256d
1025 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm256_permute4x64_pd (__m256d __X, const int __M)
1027 {
1028   return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
1029 }
1030 #else
1031 #define _mm256_permute4x64_pd(X, M)			       \
1032   ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
1033 #endif
1034 
1035 extern __inline __m256
1036 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1037 _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
1038 {
1039   return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
1040 }
1041 
1042 #ifdef __OPTIMIZE__
1043 extern __inline __m256i
1044 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1045 _mm256_permute4x64_epi64 (__m256i __X, const int __M)
1046 {
1047   return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
1048 }
1049 #else
1050 #define _mm256_permute4x64_epi64(X, M)			       \
1051   ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
1052 #endif
1053 
1054 
1055 #ifdef __OPTIMIZE__
1056 extern __inline __m256i
1057 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1058 _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
1059 {
1060   return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
1061 }
1062 #else
1063 #define _mm256_permute2x128_si256(X, Y, M)				\
1064   ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
1065 #endif
1066 
1067 #ifdef __OPTIMIZE__
1068 extern __inline __m128i
1069 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1070 _mm256_extracti128_si256 (__m256i __X, const int __M)
1071 {
1072   return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
1073 }
1074 #else
1075 #define _mm256_extracti128_si256(X, M)				\
1076   ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
1077 #endif
1078 
1079 #ifdef __OPTIMIZE__
1080 extern __inline __m256i
1081 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1082 _mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
1083 {
1084   return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
1085 }
1086 #else
1087 #define _mm256_inserti128_si256(X, Y, M)			 \
1088   ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
1089 					   (__v2di)(__m128i)(Y), \
1090 					   (int)(M)))
1091 #endif
1092 
1093 extern __inline __m256i
1094 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1095 _mm256_maskload_epi32 (int const *__X, __m256i __M )
1096 {
1097   return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
1098 						(__v8si)__M);
1099 }
1100 
1101 extern __inline __m256i
1102 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1103 _mm256_maskload_epi64 (long long const *__X, __m256i __M )
1104 {
1105   return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
1106 						(__v4di)__M);
1107 }
1108 
1109 extern __inline __m128i
1110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1111 _mm_maskload_epi32 (int const *__X, __m128i __M )
1112 {
1113   return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
1114 					     (__v4si)__M);
1115 }
1116 
1117 extern __inline __m128i
1118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1119 _mm_maskload_epi64 (long long const *__X, __m128i __M )
1120 {
1121   return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
1122 					     (__v2di)__M);
1123 }
1124 
1125 extern __inline void
1126 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1127 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1128 {
1129   __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1130 }
1131 
1132 extern __inline void
1133 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1134 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1135 {
1136   __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1137 }
1138 
1139 extern __inline void
1140 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1141 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1142 {
1143   __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1144 }
1145 
1146 extern __inline void
1147 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1148 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1149 {
1150   __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1151 }
1152 
1153 extern __inline __m256i
1154 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1155 _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1156 {
1157   return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1158 }
1159 
1160 extern __inline __m128i
1161 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1162 _mm_sllv_epi32 (__m128i __X, __m128i __Y)
1163 {
1164   return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1165 }
1166 
1167 extern __inline __m256i
1168 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1169 _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1170 {
1171   return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1172 }
1173 
1174 extern __inline __m128i
1175 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1176 _mm_sllv_epi64 (__m128i __X, __m128i __Y)
1177 {
1178   return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1179 }
1180 
1181 extern __inline __m256i
1182 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1183 _mm256_srav_epi32 (__m256i __X, __m256i __Y)
1184 {
1185   return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1186 }
1187 
1188 extern __inline __m128i
1189 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1190 _mm_srav_epi32 (__m128i __X, __m128i __Y)
1191 {
1192   return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1193 }
1194 
1195 extern __inline __m256i
1196 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1198 {
1199   return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1200 }
1201 
1202 extern __inline __m128i
1203 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1204 _mm_srlv_epi32 (__m128i __X, __m128i __Y)
1205 {
1206   return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1207 }
1208 
1209 extern __inline __m256i
1210 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1211 _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1212 {
1213   return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1214 }
1215 
1216 extern __inline __m128i
1217 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1218 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
1219 {
1220   return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1221 }
1222 
1223 #ifdef __OPTIMIZE__
1224 extern __inline __m128d
1225 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1226 _mm_i32gather_pd (double const *base, __m128i index, const int scale)
1227 {
1228   __v2df src = _mm_setzero_pd ();
1229   __v2df mask = _mm_cmpeq_pd (src, src);
1230 
1231   return (__m128d) __builtin_ia32_gathersiv2df (src,
1232 						base,
1233 						(__v4si)index,
1234 						mask,
1235 						scale);
1236 }
1237 
1238 extern __inline __m128d
1239 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1240 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
1241 		       __m128d mask, const int scale)
1242 {
1243   return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
1244 						base,
1245 						(__v4si)index,
1246 						(__v2df)mask,
1247 						scale);
1248 }
1249 
1250 extern __inline __m256d
1251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1252 _mm256_i32gather_pd (double const *base, __m128i index, const int scale)
1253 {
1254   __v4df src = _mm256_setzero_pd ();
1255   __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
1256 
1257   return (__m256d) __builtin_ia32_gathersiv4df (src,
1258 						base,
1259 						(__v4si)index,
1260 						mask,
1261 						scale);
1262 }
1263 
1264 extern __inline __m256d
1265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1266 _mm256_mask_i32gather_pd (__m256d src, double const *base,
1267 			  __m128i index, __m256d mask, const int scale)
1268 {
1269   return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
1270 						base,
1271 						(__v4si)index,
1272 						(__v4df)mask,
1273 						scale);
1274 }
1275 
1276 extern __inline __m128d
1277 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1278 _mm_i64gather_pd (double const *base, __m128i index, const int scale)
1279 {
1280   __v2df src = _mm_setzero_pd ();
1281   __v2df mask = _mm_cmpeq_pd (src, src);
1282 
1283   return (__m128d) __builtin_ia32_gatherdiv2df (src,
1284 						base,
1285 						(__v2di)index,
1286 						mask,
1287 						scale);
1288 }
1289 
1290 extern __inline __m128d
1291 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1292 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
1293 		       __m128d mask, const int scale)
1294 {
1295   return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
1296 						base,
1297 						(__v2di)index,
1298 						(__v2df)mask,
1299 						scale);
1300 }
1301 
1302 extern __inline __m256d
1303 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm256_i64gather_pd (double const *base, __m256i index, const int scale)
1305 {
1306   __v4df src = _mm256_setzero_pd ();
1307   __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
1308 
1309   return (__m256d) __builtin_ia32_gatherdiv4df (src,
1310 						base,
1311 						(__v4di)index,
1312 						mask,
1313 						scale);
1314 }
1315 
1316 extern __inline __m256d
1317 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm256_mask_i64gather_pd (__m256d src, double const *base,
1319 			  __m256i index, __m256d mask, const int scale)
1320 {
1321   return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
1322 						base,
1323 						(__v4di)index,
1324 						(__v4df)mask,
1325 						scale);
1326 }
1327 
1328 extern __inline __m128
1329 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1330 _mm_i32gather_ps (float const *base, __m128i index, const int scale)
1331 {
1332   __v4sf src = _mm_setzero_ps ();
1333   __v4sf mask = _mm_cmpeq_ps (src, src);
1334 
1335   return (__m128) __builtin_ia32_gathersiv4sf (src,
1336 					       base,
1337 					       (__v4si)index,
1338 					       mask,
1339 					       scale);
1340 }
1341 
1342 extern __inline __m128
1343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1344 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
1345 		       __m128 mask, const int scale)
1346 {
1347   return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
1348 					       base,
1349 					       (__v4si)index,
1350 					       (__v4sf)mask,
1351 					       scale);
1352 }
1353 
1354 extern __inline __m256
1355 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1356 _mm256_i32gather_ps (float const *base, __m256i index, const int scale)
1357 {
1358   __v8sf src = _mm256_setzero_ps ();
1359   __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
1360 
1361   return (__m256) __builtin_ia32_gathersiv8sf (src,
1362 					       base,
1363 					       (__v8si)index,
1364 					       mask,
1365 					       scale);
1366 }
1367 
1368 extern __inline __m256
1369 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm256_mask_i32gather_ps (__m256 src, float const *base,
1371 			  __m256i index, __m256 mask, const int scale)
1372 {
1373   return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
1374 					       base,
1375 					       (__v8si)index,
1376 					       (__v8sf)mask,
1377 					       scale);
1378 }
1379 
1380 extern __inline __m128
1381 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1382 _mm_i64gather_ps (float const *base, __m128i index, const int scale)
1383 {
1384   __v4sf src = _mm_setzero_ps ();
1385   __v4sf mask = _mm_cmpeq_ps (src, src);
1386 
1387   return (__m128) __builtin_ia32_gatherdiv4sf (src,
1388 					       base,
1389 					       (__v2di)index,
1390 					       mask,
1391 					       scale);
1392 }
1393 
1394 extern __inline __m128
1395 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1396 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
1397 		       __m128 mask, const int scale)
1398 {
1399   return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
1400 						base,
1401 						(__v2di)index,
1402 						(__v4sf)mask,
1403 						scale);
1404 }
1405 
1406 extern __inline __m128
1407 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1408 _mm256_i64gather_ps (float const *base, __m256i index, const int scale)
1409 {
1410   __v4sf src = _mm_setzero_ps ();
1411   __v4sf mask = _mm_cmpeq_ps (src, src);
1412 
1413   return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
1414 						  base,
1415 						  (__v4di)index,
1416 						  mask,
1417 						  scale);
1418 }
1419 
1420 extern __inline __m128
1421 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1422 _mm256_mask_i64gather_ps (__m128 src, float const *base,
1423 			  __m256i index, __m128 mask, const int scale)
1424 {
1425   return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
1426 						  base,
1427 						  (__v4di)index,
1428 						  (__v4sf)mask,
1429 						  scale);
1430 }
1431 
1432 extern __inline __m128i
1433 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1434 _mm_i32gather_epi64 (long long int const *base,
1435 		     __m128i index, const int scale)
1436 {
1437   __v2di src = __extension__ (__v2di){ 0, 0 };
1438   __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1439 
1440   return (__m128i) __builtin_ia32_gathersiv2di (src,
1441 						base,
1442 						(__v4si)index,
1443 						mask,
1444 						scale);
1445 }
1446 
1447 extern __inline __m128i
1448 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1449 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
1450 			  __m128i index, __m128i mask, const int scale)
1451 {
1452   return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
1453 						base,
1454 						(__v4si)index,
1455 						(__v2di)mask,
1456 						scale);
1457 }
1458 
1459 extern __inline __m256i
1460 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1461 _mm256_i32gather_epi64 (long long int const *base,
1462 			__m128i index, const int scale)
1463 {
1464   __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1465   __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1466 
1467   return (__m256i) __builtin_ia32_gathersiv4di (src,
1468 						base,
1469 						(__v4si)index,
1470 						mask,
1471 						scale);
1472 }
1473 
1474 extern __inline __m256i
1475 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1476 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
1477 			     __m128i index, __m256i mask, const int scale)
1478 {
1479   return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
1480 						base,
1481 						(__v4si)index,
1482 						(__v4di)mask,
1483 						scale);
1484 }
1485 
1486 extern __inline __m128i
1487 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1488 _mm_i64gather_epi64 (long long int const *base,
1489 		     __m128i index, const int scale)
1490 {
1491   __v2di src = __extension__ (__v2di){ 0, 0 };
1492   __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1493 
1494   return (__m128i) __builtin_ia32_gatherdiv2di (src,
1495 						base,
1496 						(__v2di)index,
1497 						mask,
1498 						scale);
1499 }
1500 
1501 extern __inline __m128i
1502 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1503 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
1504 			  __m128i mask, const int scale)
1505 {
1506   return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
1507 						base,
1508 						(__v2di)index,
1509 						(__v2di)mask,
1510 						scale);
1511 }
1512 
1513 extern __inline __m256i
1514 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1515 _mm256_i64gather_epi64 (long long int const *base,
1516 			__m256i index, const int scale)
1517 {
1518   __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1519   __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1520 
1521   return (__m256i) __builtin_ia32_gatherdiv4di (src,
1522 						base,
1523 						(__v4di)index,
1524 						mask,
1525 						scale);
1526 }
1527 
1528 extern __inline __m256i
1529 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1530 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
1531 			     __m256i index, __m256i mask, const int scale)
1532 {
1533   return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
1534 						base,
1535 						(__v4di)index,
1536 						(__v4di)mask,
1537 						scale);
1538 }
1539 
1540 extern __inline __m128i
1541 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1542 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
1543 {
1544   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1545   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1546 
1547   return (__m128i) __builtin_ia32_gathersiv4si (src,
1548 					       base,
1549 					       (__v4si)index,
1550 					       mask,
1551 					       scale);
1552 }
1553 
1554 extern __inline __m128i
1555 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1556 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
1557 			  __m128i mask, const int scale)
1558 {
1559   return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
1560 						base,
1561 						(__v4si)index,
1562 						(__v4si)mask,
1563 						scale);
1564 }
1565 
1566 extern __inline __m256i
1567 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1568 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
1569 {
1570   __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
1571   __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
1572 
1573   return (__m256i) __builtin_ia32_gathersiv8si (src,
1574 						base,
1575 						(__v8si)index,
1576 						mask,
1577 						scale);
1578 }
1579 
1580 extern __inline __m256i
1581 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1582 _mm256_mask_i32gather_epi32 (__m256i src, int const *base,
1583 			     __m256i index, __m256i mask, const int scale)
1584 {
1585   return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
1586 						base,
1587 						(__v8si)index,
1588 						(__v8si)mask,
1589 						scale);
1590 }
1591 
1592 extern __inline __m128i
1593 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1594 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
1595 {
1596   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1597   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1598 
1599   return (__m128i) __builtin_ia32_gatherdiv4si (src,
1600 						base,
1601 						(__v2di)index,
1602 						mask,
1603 						scale);
1604 }
1605 
1606 extern __inline __m128i
1607 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1608 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
1609 			  __m128i mask, const int scale)
1610 {
1611   return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
1612 						base,
1613 						(__v2di)index,
1614 						(__v4si)mask,
1615 						scale);
1616 }
1617 
1618 extern __inline __m128i
1619 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1620 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
1621 {
1622   __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1623   __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1624 
1625   return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
1626 						  base,
1627 						  (__v4di)index,
1628 						  mask,
1629 						  scale);
1630 }
1631 
1632 extern __inline __m128i
1633 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1634 _mm256_mask_i64gather_epi32 (__m128i src, int const *base,
1635 			     __m256i index, __m128i mask, const int scale)
1636 {
1637   return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
1638 						   base,
1639 						   (__v4di)index,
1640 						   (__v4si)mask,
1641 						   scale);
1642 }
1643 #else /* __OPTIMIZE__ */
1644 #define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
1645   (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
1646 					 (double const *)BASE,		\
1647 					 (__v4si)(__m128i)INDEX,	\
1648 					 (__v2df)_mm_set1_pd(		\
1649 					   (double)(long long int) -1), \
1650 					 (int)SCALE)
1651 
1652 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
1653   (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
1654 					 (double const *)BASE,	 \
1655 					 (__v4si)(__m128i)INDEX, \
1656 					 (__v2df)(__m128d)MASK,	 \
1657 					 (int)SCALE)
1658 
1659 #define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
1660   (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
1661 					 (double const *)BASE,		\
1662 					 (__v4si)(__m128i)INDEX,	\
1663 					 (__v4df)_mm256_set1_pd(	\
1664 					   (double)(long long int) -1), \
1665 					 (int)SCALE)
1666 
1667 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
1668   (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
1669 					 (double const *)BASE,	 \
1670 					 (__v4si)(__m128i)INDEX, \
1671 					 (__v4df)(__m256d)MASK,	 \
1672 					 (int)SCALE)
1673 
1674 #define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
1675   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
1676 					 (double const *)BASE,		\
1677 					 (__v2di)(__m128i)INDEX,	\
1678 					 (__v2df)_mm_set1_pd(		\
1679 					   (double)(long long int) -1), \
1680 					 (int)SCALE)
1681 
1682 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
1683   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
1684 					 (double const *)BASE,	 \
1685 					 (__v2di)(__m128i)INDEX, \
1686 					 (__v2df)(__m128d)MASK,	 \
1687 					 (int)SCALE)
1688 
1689 #define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
1690   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
1691 					 (double const *)BASE,		\
1692 					 (__v4di)(__m256i)INDEX,	\
1693 					 (__v4df)_mm256_set1_pd(	\
1694 					   (double)(long long int) -1), \
1695 					 (int)SCALE)
1696 
1697 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
1698   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
1699 					 (double const *)BASE,	 \
1700 					 (__v4di)(__m256i)INDEX, \
1701 					 (__v4df)(__m256d)MASK,	 \
1702 					 (int)SCALE)
1703 
1704 #define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
1705   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
1706 					(float const *)BASE,		\
1707 					(__v4si)(__m128i)INDEX,		\
1708 					_mm_set1_ps ((float)(int) -1),	\
1709 					(int)SCALE)
1710 
1711 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
1712   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC,	 \
1713 					(float const *)BASE,	 \
1714 					(__v4si)(__m128i)INDEX,	 \
1715 					(__v4sf)(__m128d)MASK,	 \
1716 					(int)SCALE)
1717 
1718 #define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
1719   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
1720 					(float const *)BASE,	       \
1721 					(__v8si)(__m256i)INDEX,	       \
1722 					(__v8sf)_mm256_set1_ps (       \
1723 					  (float)(int) -1),	       \
1724 					(int)SCALE)
1725 
1726 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
1727   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
1728 					(float const *)BASE,	\
1729 					(__v8si)(__m256i)INDEX, \
1730 					(__v8sf)(__m256d)MASK,	\
1731 					(int)SCALE)
1732 
1733 #define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
1734   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (),	\
1735 					(float const *)BASE,		\
1736 					(__v2di)(__m128i)INDEX,		\
1737 					(__v4sf)_mm_set1_ps (		\
1738 					  (float)(int) -1),		\
1739 					(int)SCALE)
1740 
1741 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
1742   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
1743 					(float const *)BASE,	 \
1744 					(__v2di)(__m128i)INDEX,	 \
1745 					(__v4sf)(__m128d)MASK,	 \
1746 					(int)SCALE)
1747 
1748 #define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
1749   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
1750 					   (float const *)BASE,		\
1751 					   (__v4di)(__m256i)INDEX,	\
1752 					   (__v4sf)_mm_set1_ps(		\
1753 					     (float)(int) -1),		\
1754 					   (int)SCALE)
1755 
1756 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
1757   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
1758 					   (float const *)BASE,	   \
1759 					   (__v4di)(__m256i)INDEX, \
1760 					   (__v4sf)(__m128)MASK,   \
1761 					   (int)SCALE)
1762 
1763 #define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
1764   (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
1765 					 (long long const *)BASE,	\
1766 					 (__v4si)(__m128i)INDEX,	\
1767 					 (__v2di)_mm_set1_epi64x (-1),	\
1768 					 (int)SCALE)
1769 
1770 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
1771   (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
1772 					 (long long const *)BASE, \
1773 					 (__v4si)(__m128i)INDEX,  \
1774 					 (__v2di)(__m128i)MASK,	  \
1775 					 (int)SCALE)
1776 
1777 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
1778   (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
1779 					 (long long const *)BASE,	   \
1780 					 (__v4si)(__m128i)INDEX,	   \
1781 					 (__v4di)_mm256_set1_epi64x (-1),  \
1782 					 (int)SCALE)
1783 
1784 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1785   (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
1786 					 (long long const *)BASE,  \
1787 					 (__v4si)(__m128i)INDEX,   \
1788 					 (__v4di)(__m256i)MASK,	   \
1789 					 (int)SCALE)
1790 
1791 #define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
1792   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
1793 					 (long long const *)BASE,	\
1794 					 (__v2di)(__m128i)INDEX,	\
1795 					 (__v2di)_mm_set1_epi64x (-1),	\
1796 					 (int)SCALE)
1797 
1798 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
1799   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
1800 					 (long long const *)BASE, \
1801 					 (__v2di)(__m128i)INDEX,  \
1802 					 (__v2di)(__m128i)MASK,	  \
1803 					 (int)SCALE)
1804 
1805 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
1806   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
1807 					 (long long const *)BASE,	   \
1808 					 (__v4di)(__m256i)INDEX,	   \
1809 					 (__v4di)_mm256_set1_epi64x (-1),  \
1810 					 (int)SCALE)
1811 
1812 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1813   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
1814 					 (long long const *)BASE,  \
1815 					 (__v4di)(__m256i)INDEX,   \
1816 					 (__v4di)(__m256i)MASK,	   \
1817 					 (int)SCALE)
1818 
1819 #define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
1820   (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
1821 					 (int const *)BASE,		\
1822 					 (__v4si)(__m128i)INDEX,	\
1823 					 (__v4si)_mm_set1_epi32 (-1),	\
1824 					 (int)SCALE)
1825 
1826 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1827   (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,	\
1828 					(int const *)BASE,	\
1829 					(__v4si)(__m128i)INDEX, \
1830 					(__v4si)(__m128i)MASK,	\
1831 					(int)SCALE)
1832 
1833 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
1834   (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
1835 					 (int const *)BASE,		   \
1836 					 (__v8si)(__m256i)INDEX,	   \
1837 					 (__v8si)_mm256_set1_epi32 (-1),   \
1838 					 (int)SCALE)
1839 
1840 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1841   (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,	   \
1842 					(int const *)BASE,	   \
1843 					(__v8si)(__m256i)INDEX,	   \
1844 					(__v8si)(__m256i)MASK,	   \
1845 					(int)SCALE)
1846 
1847 #define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
1848   (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
1849 					 (int const *)BASE,		\
1850 					 (__v2di)(__m128i)INDEX,	\
1851 					 (__v4si)_mm_set1_epi32 (-1),	\
1852 					 (int)SCALE)
1853 
1854 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1855   (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,	\
1856 					(int const *)BASE,	\
1857 					(__v2di)(__m128i)INDEX, \
1858 					(__v4si)(__m128i)MASK,	\
1859 					(int)SCALE)
1860 
1861 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			   \
1862   (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
1863 					    (int const *)BASE,		   \
1864 					    (__v4di)(__m256i)INDEX,	   \
1865 					    (__v4si)_mm_set1_epi32(-1),	   \
1866 					    (int)SCALE)
1867 
1868 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1869   (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
1870 					   (int const *)BASE,	   \
1871 					   (__v4di)(__m256i)INDEX, \
1872 					   (__v4si)(__m128i)MASK,  \
1873 					   (int)SCALE)
1874 #endif  /* __OPTIMIZE__ */
1875