1 /* Copyright (C) 2011-2018 Free Software Foundation, Inc.
2 
3    This file is part of GCC.
4 
5    GCC is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    GCC is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    Under Section 7 of GPL version 3, you are granted additional
16    permissions described in the GCC Runtime Library Exception, version
17    3.1, as published by the Free Software Foundation.
18 
19    You should have received a copy of the GNU General Public License and
20    a copy of the GCC Runtime Library Exception along with this program;
21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22    <http://www.gnu.org/licenses/>.  */
23 
24 #ifndef _IMMINTRIN_H_INCLUDED
25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
26 #endif
27 
28 #ifndef _AVX2INTRIN_H_INCLUDED
29 #define _AVX2INTRIN_H_INCLUDED
30 
31 #ifndef __AVX2__
32 #pragma GCC push_options
33 #pragma GCC target("avx2")
34 #define __DISABLE_AVX2__
35 #endif /* __AVX2__ */
36 
/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* Without optimization __M is not guaranteed to fold to an immediate,
   so use a macro to keep it a constant expression.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif
53 
/* Absolute value of packed signed 8-, 16- and 32-bit integers
   (vpabsb/vpabsw/vpabsd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}
74 
/* Pack elements of the two operands into half-width elements using
   signed (packs) or unsigned (packus) saturation
   (vpackssdw/vpacksswb/vpackusdw/vpackuswb).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}
102 
/* Add packed 8-, 16-, 32- and 64-bit integers.  The operands are cast
   to unsigned element vectors so the addition wraps modulo 2^N without
   invoking signed-overflow undefined behaviour.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A + (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A + (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A + (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A + (__v4du)__B);
}
130 
/* Add packed integers with signed (epi) or unsigned (epu) saturation
   (vpaddsb/vpaddsw/vpaddusb/vpaddusw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}
158 
/* Concatenate pairs of 128-bit lanes of __A and __B and extract a
   byte-aligned result shifted right by __N bytes (vpalignr).  The
   builtin takes the shift count in bits, hence the multiplication
   by 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* Without optimization (__N*8) would end up in a register, so the insn
   pattern requiring an immediate would not be matched.  */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)				   \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
					(__v4di)(__m256i)(B),	   \
					(int)(N) * 8))
#endif
176 
/* Bitwise AND of the full 256-bit operands.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A & (__v4du)__B);
}

/* Bitwise AND-NOT: returns (~__A) & __B (vpandn).  Note the first
   operand is the one complemented.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}
190 
/* Rounded average of packed unsigned 8- and 16-bit integers
   (vpavgb/vpavgw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}
204 
/* Byte-wise variable blend: select each byte from __Y where the
   corresponding byte of mask __M has its top bit set, else from __X
   (vpblendvb).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
					       (__v32qi)__Y,
					       (__v32qi)__M);
}

/* Blend packed 16-bit integers under control of immediate __M
   (vpblendw); macro form keeps __M an immediate when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					       __M);
}
#else
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif
228 
/* Element-wise compares implemented with generic vector comparisons:
   each result element is all-ones where the predicate holds and zero
   otherwise.  cmpeq tests equality; cmpgt is a signed greater-than.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A == (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A == (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A == (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A > (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A > (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A > (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A > (__v4di)__B);
}
284 
/* Horizontal add/subtract of adjacent element pairs; the "s" variants
   saturate (vphaddw/vphaddd/vphaddsw/vphsubw/vphsubd/vphsubsw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* Multiply unsigned bytes of __X by signed bytes of __Y and add
   horizontal pairs with signed saturation to 16-bit results
   (vpmaddubsw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
						(__v32qi)__Y);
}

/* Multiply packed signed 16-bit integers and add adjacent 32-bit
   products (vpmaddwd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
					     (__v16hi)__B);
}
346 
/* Element-wise maximum/minimum of packed integers; epi variants are
   signed, epu variants unsigned (vpmaxs*/vpmaxu*/vpmins*/vpminu*).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}
430 
431 extern __inline int
432 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
433 _mm256_movemask_epi8 (__m256i __A)
434 {
435   return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
436 }
437 
/* Sign-extend (cvtepi*) packed elements from the low part of the
   128-bit source to wider 256-bit elements (vpmovsx*).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

/* Zero-extend (cvtepu*) packed elements from the low part of the
   128-bit source to wider 256-bit elements (vpmovzx*).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}
521 
/* Multiply the even-indexed signed 32-bit elements, producing 64-bit
   products (vpmuldq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

/* Multiply signed 16-bit elements, round and scale to 16 bits
   (vpmulhrsw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
					       (__v16hi)__Y);
}

/* High 16 bits of the unsigned 16-bit products (vpmulhuw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* High 16 bits of the signed 16-bit products (vpmulhw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Low halves of the element products; unsigned vector types give
   well-defined modular wraparound.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A * (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A * (__v8su)__B);
}

/* Multiply the even-indexed unsigned 32-bit elements, producing
   64-bit products (vpmuludq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}
571 
/* Bitwise OR of the full 256-bit operands.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A | (__v4du)__B);
}

/* Sum of absolute differences of unsigned bytes (vpsadbw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* Byte shuffle of __X under control of __Y (vpshufb).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
					     (__v32qi)__Y);
}
593 
/* Immediate-controlled shuffles of 32-bit elements and of the high/low
   16-bit halves (vpshufd/vpshufhw/vpshuflw).  Macro forms keep the
   control mask an immediate when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
623 
/* Conditionally negate/zero elements of __X according to the sign of
   the corresponding element of __Y (vpsignb/vpsignw/vpsignd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}
644 
/* Shift each 128-bit lane left by __N bytes (vpslldq); the builtin
   takes the count in bits, hence __N * 8.  _mm256_slli_si256 is the
   older alias for the same operation.  Macro forms keep the count an
   immediate when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
665 
/* Shift packed elements left by an immediate count (slli, vpsllw/d/q)
   or by the count held in the low 64 bits of an __m128i (sll).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

/* Arithmetic (sign-propagating) right shift of packed 16- and 32-bit
   elements, by an immediate (srai) or a vector count (sra).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}
735 
/* Shift each 128-bit lane right by __N bytes (vpsrldq); the builtin
   takes the count in bits, hence __N * 8.  _mm256_srli_si256 is the
   older alias.  Macro forms keep the count an immediate when not
   optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
756 
/* Logical (zero-filling) right shift of packed elements, by an
   immediate count (srli, vpsrlw/d/q) or by the count held in the low
   64 bits of an __m128i (srl).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}
798 
/* Subtract packed integers.  The plain forms wrap modulo 2^N (unsigned
   vector casts avoid signed-overflow UB); the "subs" forms saturate,
   signed (epi) or unsigned (epu).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A - (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A - (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}
854 
/* Interleave elements from the high (unpackhi) or low (unpacklo)
   halves of the two operands (vpunpckh*/vpunpckl*).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

/* Bitwise XOR of the full 256-bit operands.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
}
917 
/* Non-temporal (streaming) 256-bit load (vmovntdqa).  The cast drops
   const because the builtin's prototype takes a non-const pointer.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

/* Broadcast the lowest element of the source to every element of the
   destination (vbroadcastss/vbroadcastsd/vbroadcasti128).  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}
952 
/* Select 32-bit elements from __X or __Y according to the bits of the
   immediate __M (bit set selects from __Y).  The macro form exists so
   that __M reaches the builtin as a compile-time constant even when
   inlining is disabled.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif
967 
/* 256-bit variant of _mm_blend_epi32: one immediate mask bit per
   32-bit element (bit set selects from __Y).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif
982 
/* Broadcast the lowest byte of __X to all 32 byte elements of a
   256-bit result (vpbroadcastb).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

/* Broadcast the lowest 16-bit element of __X to all 16 elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

/* Broadcast the lowest 32-bit element of __X to all 8 elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

/* Broadcast the lowest 64-bit element of __X to all 4 elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

/* 128-bit counterparts of the broadcasts above.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}
1038 
/* Permute the eight 32-bit elements of __X across the full 256-bit
   register using the per-element indices in __Y (vpermd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

/* Permute the four 64-bit (double) elements of __X according to the
   2-bit fields of the immediate __M; macro form keeps __M a constant
   when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)			       \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

/* Permute the eight single-precision elements of __X across lanes
   using the indices in __Y (vpermps).  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}
1064 
/* Permute the four 64-bit elements of __X according to the 2-bit
   fields of the immediate __M (vpermq).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)			       \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


/* Select 128-bit halves from __X/__Y (or zero) into each half of the
   result, controlled by the immediate __M (vperm2i128).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif
1089 
/* Extract the low (__M == 0) or high (__M == 1) 128-bit half of __X
   (vextracti128).  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)				\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* Insert __Y into the low (__M == 0) or high (__M == 1) 128-bit half
   of __X (vinserti128).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)			 \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
					   (__v2di)(__m128i)(Y), \
					   (int)(M)))
#endif
1115 
/* Conditionally load 32-bit elements from __X: elements whose mask
   sign bit is set are loaded, the rest read as zero (vpmaskmovd).
   Masked-off addresses are not accessed.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

/* As above for 64-bit elements (vpmaskmovq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

/* 128-bit masked load of 32-bit elements.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
					     (__v4si)__M);
}

/* 128-bit masked load of 64-bit elements.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
					     (__v2di)__M);
}
1147 
/* Conditionally store the 32-bit elements of __Y to __X: only
   elements whose mask sign bit is set are written (vpmaskmovd).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

/* As above for 64-bit elements (vpmaskmovq).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

/* 128-bit masked store of 32-bit elements.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

/* 128-bit masked store of 64-bit elements.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
}
1175 
/* Per-element variable shifts: each element of __X is shifted by the
   count in the corresponding element of __Y.
   sllv = shift left logical, srav = shift right arithmetic,
   srlv = shift right logical.  AVX2 provides no 64-bit arithmetic
   right shift, hence no srav_epi64 here.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}
1245 
/* Gather intrinsics, double-precision.  The unmasked forms build an
   all-ones mask via x == x on a zero vector (every lane compares
   equal, producing all-1 bits), so every element is gathered; the
   source operand is then irrelevant and may be undefined/zero.
   __scale must be a compile-time constant (1, 2, 4 or 8 per the ISA),
   hence the inline functions exist only under __OPTIMIZE__.  */
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
{
  __v2df __zero = _mm_setzero_pd ();
  __v2df __mask = _mm_cmpeq_pd (__zero, __zero);

  return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
						__base,
						(__v4si)__index,
						__mask,
						__scale);
}

/* Masked gather: lanes whose mask sign bit is clear keep the value
   from __src; set lanes load from __base + __index * __scale.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
		       __m128d __mask, const int __scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
						__base,
						(__v4si)__index,
						(__v2df)__mask,
						__scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
{
  __v4df __zero = _mm256_setzero_pd ();
  __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
						__base,
						(__v4si)__index,
						__mask,
						__scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d __src, double const *__base,
			  __m128i __index, __m256d __mask, const int __scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
						__base,
						(__v4si)__index,
						(__v4df)__mask,
						__scale);
}

/* 64-bit-index variants (vgatherqpd).  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
{
  __v2df __src = _mm_setzero_pd ();
  __v2df __mask = _mm_cmpeq_pd (__src, __src);

  return (__m128d) __builtin_ia32_gatherdiv2df (__src,
						__base,
						(__v2di)__index,
						__mask,
						__scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
		       __m128d __mask, const int __scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
						__base,
						(__v2di)__index,
						(__v2df)__mask,
						__scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
{
  __v4df __src = _mm256_setzero_pd ();
  __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (__src,
						__base,
						(__v4di)__index,
						__mask,
						__scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d __src, double const *__base,
			  __m256i __index, __m256d __mask, const int __scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
						__base,
						(__v4di)__index,
						(__v4df)__mask,
						__scale);
}
1350 
/* Gather intrinsics, single-precision.  Same all-ones-mask idiom as
   the _pd forms above.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
{
  __v4sf __src = _mm_setzero_ps ();
  __v4sf __mask = _mm_cmpeq_ps (__src, __src);

  return (__m128) __builtin_ia32_gathersiv4sf (__src,
					       __base,
					       (__v4si)__index,
					       __mask,
					       __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
		       __m128 __mask, const int __scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
					       __base,
					       (__v4si)__index,
					       (__v4sf)__mask,
					       __scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
{
  __v8sf __src = _mm256_setzero_ps ();
  __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (__src,
					       __base,
					       (__v8si)__index,
					       __mask,
					       __scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 __src, float const *__base,
			  __m256i __index, __m256 __mask, const int __scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
					       __base,
					       (__v8si)__index,
					       (__v8sf)__mask,
					       __scale);
}

/* 64-bit-index variants.  Four 64-bit indices yield only four floats,
   so the 256-index forms still return __m128.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
{
  __v4sf __src = _mm_setzero_ps ();
  __v4sf __mask = _mm_cmpeq_ps (__src, __src);

  return (__m128) __builtin_ia32_gatherdiv4sf (__src,
					       __base,
					       (__v2di)__index,
					       __mask,
					       __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
		       __m128 __mask, const int __scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
						__base,
						(__v2di)__index,
						(__v4sf)__mask,
						__scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
{
  __v4sf __src = _mm_setzero_ps ();
  __v4sf __mask = _mm_cmpeq_ps (__src, __src);

  return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
						  __base,
						  (__v4di)__index,
						  __mask,
						  __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 __src, float const *__base,
			  __m256i __index, __m128 __mask, const int __scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
						  __base,
						  (__v4di)__index,
						  (__v4sf)__mask,
						  __scale);
}
1454 
/* Gather intrinsics, 64-bit integers.  The all-ones mask is built
   directly with vector literals of ~0 rather than a compare.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *__base,
		     __m128i __index, const int __scale)
{
  __v2di __src = __extension__ (__v2di){ 0, 0 };
  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (__src,
						__base,
						(__v4si)__index,
						__mask,
						__scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
			  __m128i __index, __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
						__base,
						(__v4si)__index,
						(__v2di)__mask,
						__scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *__base,
			__m128i __index, const int __scale)
{
  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (__src,
						__base,
						(__v4si)__index,
						__mask,
						__scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
			     __m128i __index, __m256i __mask,
			     const int __scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
						__base,
						(__v4si)__index,
						(__v4di)__mask,
						__scale);
}

/* 64-bit-index variants (vpgatherqq).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *__base,
		     __m128i __index, const int __scale)
{
  __v2di __src = __extension__ (__v2di){ 0, 0 };
  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (__src,
						__base,
						(__v2di)__index,
						__mask,
						__scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
			  __m128i __index, __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
						__base,
						(__v2di)__index,
						(__v2di)__mask,
						__scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *__base,
			__m256i __index, const int __scale)
{
  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (__src,
						__base,
						(__v4di)__index,
						__mask,
						__scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
			     __m256i __index, __m256i __mask,
			     const int __scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
						__base,
						(__v4di)__index,
						(__v4di)__mask,
						__scale);
}
1564 
/* Gather intrinsics, 32-bit integers.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
{
  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (__src,
						__base,
						(__v4si)__index,
						__mask,
						__scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
			  __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
						__base,
						(__v4si)__index,
						(__v4si)__mask,
						__scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
{
  __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (__src,
						__base,
						(__v8si)__index,
						__mask,
						__scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
			     __m256i __index, __m256i __mask,
			     const int __scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
						__base,
						(__v8si)__index,
						(__v8si)__mask,
						__scale);
}

/* 64-bit-index variants; four 64-bit indices yield four 32-bit
   results, so the 256-index forms return __m128i.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
{
  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si (__src,
						__base,
						(__v2di)__index,
						__mask,
						__scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
			  __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
						__base,
						(__v2di)__index,
						(__v4si)__mask,
						__scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
{
  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
						   __base,
						   (__v4di)__index,
						   __mask,
						   __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
			     __m256i __index, __m128i __mask,
			     const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
						   __base,
						   (__v4di)__index,
						   (__v4si)__mask,
						   __scale);
}
#else /* __OPTIMIZE__ */
/* Non-optimizing builds cannot rely on inlining to keep SCALE a
   compile-time constant, so the gathers are macros.  The all-ones
   mask is produced by set1 of -1 reinterpreted as the element type
   ((double)(long long int) -1 / (float)(int) -1 round-trips are just
   a way to spell an all-set lane).  */

#define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

#define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)

#define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v2di)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

#define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4di)(__m256i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4di)(__m256i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)

#define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v4si)(__m128i)INDEX,		\
					_mm_set1_ps ((float)(int) -1),	\
					(int)SCALE)
1737 
/* Masked gather of four floats at 32-bit indices.  SRC and MASK are
   single-precision (__m128); the original routed them through a
   (__m128d) cast, which is a bit-preserving vector reinterpret but
   type-inconsistent with the _ps interface — cast via __m128 to match
   the inline-function form.  */
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,	 \
					(float const *)BASE,	 \
					(__v4si)(__m128i)INDEX,	 \
					(__v4sf)(__m128)MASK,	 \
					(int)SCALE)
1744 
/* Gather eight floats at 32-bit indices with an all-ones mask
   (every lane loaded); zero source is ignored.  */
#define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *)BASE,	       \
					(__v8si)(__m256i)INDEX,	       \
					(__v8sf)_mm256_set1_ps (       \
					  (float)(int) -1),	       \
					(int)SCALE)
1752 
/* Masked gather of eight floats at 32-bit indices.  MASK is
   single-precision (__m256); the original cast it through __m256d —
   bit-preserving, but inconsistent with the SRC cast one line above
   and with the inline-function form.  Use __m256 throughout.  */
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
					(float const *)BASE,	\
					(__v8si)(__m256i)INDEX, \
					(__v8sf)(__m256)MASK,	\
					(int)SCALE)
1759 
/* Gather four floats at 64-bit indices with an all-ones mask.  The
   zero source was spelled _mm_setzero_pd () (double precision) and
   reinterpreted; the zero bit pattern is identical, but use the
   single-precision _mm_setzero_ps () for type consistency with the
   other _ps gathers.  */
#define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v2di)(__m128i)INDEX,		\
					(__v4sf)_mm_set1_ps (		\
					  (float)(int) -1),		\
					(int)SCALE)
1767 
/* Masked gather of four floats at 64-bit indices.  MASK is
   single-precision (__m128); the original cast it through __m128d —
   bit-preserving, but inconsistent with the adjacent SRC cast and
   the inline-function form.  Use __m128 throughout.  */
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
					(float const *)BASE,	 \
					(__v2di)(__m128i)INDEX,	 \
					(__v4sf)(__m128)MASK,	 \
					(int)SCALE)
1774 
/* Gathers at four 64-bit indices yield only four floats, hence the
   __m128 result despite the 256-bit index vector.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
					   (float const *)BASE,		\
					   (__v4di)(__m256i)INDEX,	\
					   (__v4sf)_mm_set1_ps(		\
					     (float)(int) -1),		\
					   (int)SCALE)

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
					   (float const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK,   \
					   (int)SCALE)
1789 
1790 #define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
1791   (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
1792 					 (long long const *)BASE,	\
1793 					 (__v4si)(__m128i)INDEX,	\
1794 					 (__v2di)_mm_set1_epi64x (-1),	\
1795 					 (int)SCALE)
1796 
1797 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
1798   (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
1799 					 (long long const *)BASE, \
1800 					 (__v4si)(__m128i)INDEX,  \
1801 					 (__v2di)(__m128i)MASK,	  \
1802 					 (int)SCALE)
1803 
1804 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
1805   (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
1806 					 (long long const *)BASE,	   \
1807 					 (__v4si)(__m128i)INDEX,	   \
1808 					 (__v4di)_mm256_set1_epi64x (-1),  \
1809 					 (int)SCALE)
1810 
1811 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1812   (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
1813 					 (long long const *)BASE,  \
1814 					 (__v4si)(__m128i)INDEX,   \
1815 					 (__v4di)(__m256i)MASK,	   \
1816 					 (int)SCALE)
1817 
1818 #define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
1819   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
1820 					 (long long const *)BASE,	\
1821 					 (__v2di)(__m128i)INDEX,	\
1822 					 (__v2di)_mm_set1_epi64x (-1),	\
1823 					 (int)SCALE)
1824 
1825 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
1826   (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
1827 					 (long long const *)BASE, \
1828 					 (__v2di)(__m128i)INDEX,  \
1829 					 (__v2di)(__m128i)MASK,	  \
1830 					 (int)SCALE)
1831 
1832 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
1833   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
1834 					 (long long const *)BASE,	   \
1835 					 (__v4di)(__m256i)INDEX,	   \
1836 					 (__v4di)_mm256_set1_epi64x (-1),  \
1837 					 (int)SCALE)
1838 
1839 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1840   (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
1841 					 (long long const *)BASE,  \
1842 					 (__v4di)(__m256i)INDEX,   \
1843 					 (__v4di)(__m256i)MASK,	   \
1844 					 (int)SCALE)
1845 
/* Gather four 32-bit elements from BASE, addressed by the four 32-bit
   indices in INDEX, each index scaled by SCALE.  All elements are
   loaded (zero source, all-ones mask).  Arguments parenthesized and
   the expansion wrapped for macro hygiene.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE)				  \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
					  (int const *) (BASE),		  \
					  (__v4si)(__m128i) (INDEX),	  \
					  (__v4si) _mm_set1_epi32 (-1),	  \
					  (int) (SCALE)))
1852 
/* Merge-gather of four 32-bit elements from BASE at 32-bit indices
   INDEX scaled by SCALE: elements whose MASK lane has its top bit set
   are loaded from memory, the others are copied from SRC.  Arguments
   parenthesized and the expansion wrapped for macro hygiene.  */
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)	   \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \
					  (int const *) (BASE),	   \
					  (__v4si)(__m128i) (INDEX), \
					  (__v4si)(__m128i) (MASK),  \
					  (int) (SCALE)))
1859 
/* Gather eight 32-bit elements from BASE, addressed by the eight
   32-bit indices in INDEX, each index scaled by SCALE.  All elements
   are loaded (zero source, all-ones mask).  Arguments parenthesized
   and the expansion wrapped for macro hygiene.  */
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			     \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					  (int const *) (BASE),		     \
					  (__v8si)(__m256i) (INDEX),	     \
					  (__v8si) _mm256_set1_epi32 (-1),   \
					  (int) (SCALE)))
1866 
/* Merge-gather of eight 32-bit elements from BASE at 32-bit indices
   INDEX scaled by SCALE: elements whose MASK lane has its top bit set
   are loaded from memory, the others are copied from SRC.  Arguments
   parenthesized and the expansion wrapped for macro hygiene.  */
#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)  \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC),  \
					  (int const *) (BASE),	    \
					  (__v8si)(__m256i) (INDEX), \
					  (__v8si)(__m256i) (MASK),  \
					  (int) (SCALE)))
1873 
/* Gather two 32-bit elements from BASE, addressed by the two 64-bit
   indices in INDEX, each index scaled by SCALE; the upper half of the
   result is zeroed.  All selected elements are loaded (zero source,
   all-ones mask).  Arguments parenthesized and the expansion wrapped
   for macro hygiene.  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE)				  \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
					  (int const *) (BASE),		  \
					  (__v2di)(__m128i) (INDEX),	  \
					  (__v4si) _mm_set1_epi32 (-1),	  \
					  (int) (SCALE)))
1880 
/* Merge-gather of two 32-bit elements from BASE at 64-bit indices
   INDEX scaled by SCALE: elements whose MASK lane has its top bit set
   are loaded from memory, the others are copied from SRC.  Arguments
   parenthesized and the expansion wrapped for macro hygiene.  */
#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)	   \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \
					  (int const *) (BASE),	   \
					  (__v2di)(__m128i) (INDEX), \
					  (__v4si)(__m128i) (MASK),  \
					  (int) (SCALE)))
1887 
/* Gather four 32-bit elements from BASE, addressed by the four 64-bit
   indices in INDEX, each index scaled by SCALE; the result is 128
   bits wide.  All elements are loaded (zero source, all-ones mask).
   Arguments parenthesized and the expansion wrapped for macro
   hygiene; also restore the GNU space before the _mm_set1_epi32 call
   parenthesis for consistency with the surrounding macros.  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			        \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					     (int const *) (BASE),	        \
					     (__v4di)(__m256i) (INDEX),	        \
					     (__v4si) _mm_set1_epi32 (-1),      \
					     (int) (SCALE)))
1894 
/* Merge-gather of four 32-bit elements from BASE at 64-bit indices
   INDEX scaled by SCALE: elements whose MASK lane has its top bit set
   are loaded from memory, the others are copied from the 128-bit SRC.
   Arguments parenthesized and the expansion wrapped for macro
   hygiene.  */
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)      \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC),   \
					     (int const *) (BASE),	\
					     (__v4di)(__m256i) (INDEX), \
					     (__v4si)(__m128i) (MASK),  \
					     (int) (SCALE)))
1901 #endif  /* __OPTIMIZE__ */
1902 
1903 #ifdef __DISABLE_AVX2__
1904 #undef __DISABLE_AVX2__
1905 #pragma GCC pop_options
1906 #endif /* __DISABLE_AVX2__ */
1907 
1908 #endif /* _AVX2INTRIN_H_INCLUDED */
1909