1 /* Copyright (C) 2011-2018 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 GCC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 Under Section 7 of GPL version 3, you are granted additional 16 permissions described in the GCC Runtime Library Exception, version 17 3.1, as published by the Free Software Foundation. 18 19 You should have received a copy of the GNU General Public License and 20 a copy of the GCC Runtime Library Exception along with this program; 21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22 <http://www.gnu.org/licenses/>. */ 23 24 #ifndef _IMMINTRIN_H_INCLUDED 25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 26 #endif 27 28 #ifndef _AVX2INTRIN_H_INCLUDED 29 #define _AVX2INTRIN_H_INCLUDED 30 31 #ifndef __AVX2__ 32 #pragma GCC push_options 33 #pragma GCC target("avx2") 34 #define __DISABLE_AVX2__ 35 #endif /* __AVX2__ */ 36 37 /* Sum absolute 8-bit integer difference of adjacent groups of 4 38 byte integers in the first 2 operands. Starting offsets within 39 operands are determined by the 3rd mask operand. */ 40 #ifdef __OPTIMIZE__ 41 extern __inline __m256i 42 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 43 _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M) 44 { 45 return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X, 46 (__v32qi)__Y, __M); 47 } 48 #else 49 #define _mm256_mpsadbw_epu8(X, Y, M) \ 50 ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \ 51 (__v32qi)(__m256i)(Y), (int)(M))) 52 #endif 53 54 extern __inline __m256i 55 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 56 _mm256_abs_epi8 (__m256i __A) 57 { 58 return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A); 59 } 60 61 extern __inline __m256i 62 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 63 _mm256_abs_epi16 (__m256i __A) 64 { 65 return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A); 66 } 67 68 extern __inline __m256i 69 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 70 _mm256_abs_epi32 (__m256i __A) 71 { 72 return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A); 73 } 74 75 extern __inline __m256i 76 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 77 _mm256_packs_epi32 (__m256i __A, __m256i __B) 78 { 79 return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B); 80 } 81 82 extern __inline __m256i 83 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 84 _mm256_packs_epi16 (__m256i __A, __m256i __B) 85 { 86 return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B); 87 } 88 89 extern __inline __m256i 90 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 91 _mm256_packus_epi32 (__m256i __A, __m256i __B) 92 { 93 return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B); 94 } 95 96 extern __inline __m256i 97 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 98 _mm256_packus_epi16 (__m256i __A, __m256i __B) 99 { 100 return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B); 101 } 102 103 extern __inline __m256i 104 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 105 _mm256_add_epi8 (__m256i __A, __m256i __B) 106 { 107 return (__m256i) ((__v32qu)__A + (__v32qu)__B); 108 } 109 110 extern __inline __m256i 111 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 112 _mm256_add_epi16 (__m256i __A, __m256i __B) 113 { 114 return (__m256i) ((__v16hu)__A + (__v16hu)__B); 115 } 116 117 extern __inline __m256i 118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 119 _mm256_add_epi32 (__m256i __A, __m256i __B) 120 { 121 return (__m256i) ((__v8su)__A + (__v8su)__B); 122 } 123 124 extern __inline __m256i 125 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 126 _mm256_add_epi64 (__m256i __A, __m256i __B) 127 { 128 return (__m256i) ((__v4du)__A + (__v4du)__B); 129 } 130 131 extern __inline __m256i 132 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 133 _mm256_adds_epi8 (__m256i __A, __m256i __B) 134 { 135 return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B); 136 } 137 138 extern __inline __m256i 139 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 140 _mm256_adds_epi16 (__m256i __A, __m256i __B) 141 { 142 return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B); 143 } 144 145 extern __inline __m256i 146 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 147 _mm256_adds_epu8 (__m256i __A, __m256i __B) 148 { 149 return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B); 150 } 151 152 extern __inline __m256i 153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 154 _mm256_adds_epu16 (__m256i __A, __m256i __B) 155 { 156 return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B); 157 } 158 159 #ifdef __OPTIMIZE__ 160 extern __inline __m256i 161 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 162 _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N) 163 { 164 return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A, 165 (__v4di)__B, 166 __N * 8); 167 } 168 #else 169 /* In that case (__N*8) will be in vreg, and insn will not be matched. */ 170 /* Use define instead */ 171 #define _mm256_alignr_epi8(A, B, N) \ 172 ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \ 173 (__v4di)(__m256i)(B), \ 174 (int)(N) * 8)) 175 #endif 176 177 extern __inline __m256i 178 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 179 _mm256_and_si256 (__m256i __A, __m256i __B) 180 { 181 return (__m256i) ((__v4du)__A & (__v4du)__B); 182 } 183 184 extern __inline __m256i 185 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 186 _mm256_andnot_si256 (__m256i __A, __m256i __B) 187 { 188 return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B); 189 } 190 191 extern __inline __m256i 192 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 193 _mm256_avg_epu8 (__m256i __A, __m256i __B) 194 { 195 return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B); 196 } 197 198 extern __inline __m256i 199 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 200 _mm256_avg_epu16 (__m256i __A, __m256i __B) 201 { 202 return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B); 203 } 204 205 extern __inline __m256i 206 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 207 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M) 208 { 209 return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X, 210 (__v32qi)__Y, 211 (__v32qi)__M); 212 } 213 214 #ifdef __OPTIMIZE__ 215 extern __inline __m256i 216 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 217 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M) 218 { 219 return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X, 220 (__v16hi)__Y, 221 __M); 222 } 223 #else 224 #define _mm256_blend_epi16(X, Y, M) \ 225 ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \ 226 (__v16hi)(__m256i)(Y), (int)(M))) 227 #endif 228 229 extern __inline __m256i 230 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 231 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B) 232 { 233 return (__m256i) ((__v32qi)__A == (__v32qi)__B); 234 } 235 236 extern __inline __m256i 237 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 238 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B) 239 { 240 return (__m256i) ((__v16hi)__A == (__v16hi)__B); 241 } 242 243 extern __inline __m256i 244 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 245 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B) 246 { 247 return (__m256i) ((__v8si)__A == (__v8si)__B); 248 } 249 250 extern __inline __m256i 251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 252 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B) 253 { 254 return (__m256i) ((__v4di)__A == (__v4di)__B); 255 } 256 257 extern __inline __m256i 258 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 259 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B) 260 { 261 return (__m256i) ((__v32qi)__A > (__v32qi)__B); 262 } 263 264 extern __inline __m256i 265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 266 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B) 267 { 268 return (__m256i) ((__v16hi)__A > (__v16hi)__B); 269 } 270 271 extern __inline __m256i 272 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 273 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B) 274 { 275 return (__m256i) ((__v8si)__A > (__v8si)__B); 276 } 277 278 extern __inline __m256i 279 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 280 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B) 281 { 282 return (__m256i) ((__v4di)__A > (__v4di)__B); 283 } 284 285 extern __inline __m256i 286 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 287 _mm256_hadd_epi16 (__m256i __X, __m256i __Y) 288 { 289 return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X, 290 (__v16hi)__Y); 291 } 292 293 extern __inline __m256i 294 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 295 _mm256_hadd_epi32 (__m256i __X, __m256i __Y) 296 { 297 return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y); 298 } 299 300 extern __inline __m256i 301 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 302 _mm256_hadds_epi16 (__m256i __X, __m256i __Y) 303 { 304 return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X, 305 (__v16hi)__Y); 306 } 307 308 extern __inline __m256i 309 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 310 _mm256_hsub_epi16 (__m256i __X, __m256i __Y) 311 { 312 return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X, 313 (__v16hi)__Y); 314 } 315 316 extern __inline __m256i 317 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 318 _mm256_hsub_epi32 (__m256i __X, __m256i __Y) 319 { 320 return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y); 321 } 322 323 extern __inline __m256i 324 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 325 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y) 326 { 327 return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X, 328 (__v16hi)__Y); 329 } 330 331 extern __inline __m256i 332 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 333 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y) 334 { 335 return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X, 336 (__v32qi)__Y); 337 } 338 339 extern __inline __m256i 340 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 341 _mm256_madd_epi16 (__m256i __A, __m256i __B) 342 { 343 return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A, 344 (__v16hi)__B); 345 } 346 347 extern __inline __m256i 348 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 349 _mm256_max_epi8 (__m256i __A, __m256i __B) 350 { 351 return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B); 352 } 353 354 extern __inline __m256i 355 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 356 _mm256_max_epi16 (__m256i __A, __m256i __B) 357 { 358 return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B); 359 } 360 361 extern __inline __m256i 362 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 363 _mm256_max_epi32 (__m256i __A, __m256i __B) 364 { 365 return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B); 366 } 367 368 extern __inline __m256i 369 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 370 _mm256_max_epu8 (__m256i __A, __m256i __B) 371 { 372 return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B); 373 } 374 375 extern __inline __m256i 376 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 377 _mm256_max_epu16 (__m256i __A, __m256i __B) 378 { 379 return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B); 380 } 381 382 extern __inline __m256i 383 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 384 _mm256_max_epu32 (__m256i __A, __m256i __B) 385 { 386 return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B); 387 } 388 389 extern __inline __m256i 390 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 391 _mm256_min_epi8 (__m256i __A, __m256i __B) 392 { 393 return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B); 394 } 395 396 extern __inline __m256i 397 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 398 _mm256_min_epi16 (__m256i __A, __m256i __B) 399 { 400 return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B); 401 } 402 403 extern __inline __m256i 404 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 405 _mm256_min_epi32 (__m256i __A, __m256i __B) 406 { 407 return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B); 408 } 409 410 extern __inline __m256i 411 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 412 _mm256_min_epu8 (__m256i __A, __m256i __B) 413 { 414 return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B); 415 } 416 417 extern __inline __m256i 418 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 419 _mm256_min_epu16 (__m256i __A, __m256i __B) 420 { 421 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B); 422 } 423 424 extern __inline __m256i 425 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 426 _mm256_min_epu32 (__m256i __A, __m256i __B) 427 { 428 return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B); 429 } 430 431 extern __inline int 432 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 433 _mm256_movemask_epi8 (__m256i __A) 434 { 435 return __builtin_ia32_pmovmskb256 ((__v32qi)__A); 436 } 437 438 extern __inline __m256i 439 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 440 _mm256_cvtepi8_epi16 (__m128i __X) 441 { 442 return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X); 443 } 444 445 extern __inline __m256i 446 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 447 _mm256_cvtepi8_epi32 (__m128i __X) 448 { 449 return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X); 450 } 451 452 extern __inline __m256i 453 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 454 _mm256_cvtepi8_epi64 (__m128i __X) 455 { 456 return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X); 457 } 458 459 extern __inline __m256i 460 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 461 _mm256_cvtepi16_epi32 (__m128i __X) 462 { 463 return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X); 464 } 465 466 extern __inline __m256i 467 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 468 _mm256_cvtepi16_epi64 (__m128i __X) 469 { 470 return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X); 471 } 472 473 extern __inline __m256i 474 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 475 _mm256_cvtepi32_epi64 (__m128i __X) 476 { 477 return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X); 478 } 479 480 extern __inline __m256i 481 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 482 _mm256_cvtepu8_epi16 (__m128i __X) 483 { 484 return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X); 485 } 486 487 extern __inline __m256i 488 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 489 _mm256_cvtepu8_epi32 (__m128i __X) 490 { 491 return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X); 492 } 493 494 extern __inline __m256i 495 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 496 _mm256_cvtepu8_epi64 (__m128i __X) 497 { 498 return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X); 499 } 500 501 extern __inline __m256i 502 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 503 _mm256_cvtepu16_epi32 (__m128i __X) 504 { 505 return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X); 506 } 507 508 extern __inline __m256i 509 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 510 _mm256_cvtepu16_epi64 (__m128i __X) 511 { 512 return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X); 513 } 514 515 extern __inline __m256i 516 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 517 _mm256_cvtepu32_epi64 (__m128i __X) 518 { 519 return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X); 520 } 521 522 extern __inline __m256i 523 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 524 _mm256_mul_epi32 (__m256i __X, __m256i __Y) 525 { 526 return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y); 527 } 528 529 extern __inline __m256i 530 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 531 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y) 532 { 533 return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X, 534 (__v16hi)__Y); 535 } 536 537 extern __inline __m256i 538 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 539 _mm256_mulhi_epu16 (__m256i __A, __m256i __B) 540 { 541 return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B); 542 } 543 544 extern __inline __m256i 545 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 546 _mm256_mulhi_epi16 (__m256i __A, __m256i __B) 547 { 548 return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B); 549 } 550 551 extern __inline __m256i 552 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 553 _mm256_mullo_epi16 (__m256i __A, __m256i __B) 554 { 555 return (__m256i) ((__v16hu)__A * (__v16hu)__B); 556 } 557 558 extern __inline __m256i 559 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 560 _mm256_mullo_epi32 (__m256i __A, __m256i __B) 561 { 562 return (__m256i) ((__v8su)__A * (__v8su)__B); 563 } 564 565 extern __inline __m256i 566 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 567 _mm256_mul_epu32 (__m256i __A, __m256i __B) 568 { 569 return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B); 570 } 571 572 extern __inline __m256i 573 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 574 _mm256_or_si256 (__m256i __A, __m256i __B) 575 { 576 return (__m256i) ((__v4du)__A | (__v4du)__B); 577 } 578 579 extern __inline __m256i 580 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 581 _mm256_sad_epu8 (__m256i __A, __m256i __B) 582 { 583 return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B); 584 } 585 586 extern __inline __m256i 587 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 588 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y) 589 { 590 return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X, 591 (__v32qi)__Y); 592 } 593 594 #ifdef __OPTIMIZE__ 595 extern __inline __m256i 596 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 597 _mm256_shuffle_epi32 (__m256i __A, const int __mask) 598 { 599 return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask); 600 } 601 602 extern __inline __m256i 603 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 604 _mm256_shufflehi_epi16 (__m256i __A, const int __mask) 605 { 606 return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask); 607 } 608 609 extern __inline __m256i 610 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 611 _mm256_shufflelo_epi16 (__m256i __A, const int __mask) 612 { 613 return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask); 614 } 615 #else 616 #define _mm256_shuffle_epi32(A, N) \ 617 ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N))) 618 #define _mm256_shufflehi_epi16(A, N) \ 619 ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N))) 620 #define _mm256_shufflelo_epi16(A, N) \ 621 ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N))) 622 #endif 623 624 extern __inline __m256i 625 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 626 _mm256_sign_epi8 (__m256i __X, __m256i __Y) 627 { 628 return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y); 629 } 630 631 extern __inline __m256i 632 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 633 _mm256_sign_epi16 (__m256i __X, __m256i __Y) 634 { 635 return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y); 636 } 637 638 extern __inline __m256i 639 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 640 _mm256_sign_epi32 (__m256i __X, __m256i __Y) 641 { 642 return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y); 643 } 644 645 #ifdef __OPTIMIZE__ 646 extern __inline __m256i 647 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 648 _mm256_bslli_epi128 (__m256i __A, const int __N) 649 { 650 return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); 651 } 652 653 extern __inline __m256i 654 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 655 _mm256_slli_si256 (__m256i __A, const int __N) 656 { 657 return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); 658 } 659 #else 660 #define _mm256_bslli_epi128(A, N) \ 661 ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) 662 #define _mm256_slli_si256(A, N) \ 663 ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) 664 #endif 665 666 extern __inline __m256i 667 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 668 _mm256_slli_epi16 (__m256i __A, int __B) 669 { 670 return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); 671 } 672 673 extern __inline __m256i 674 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 675 _mm256_sll_epi16 (__m256i __A, __m128i __B) 676 { 677 return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); 678 } 679 680 extern __inline __m256i 681 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 682 _mm256_slli_epi32 (__m256i __A, int __B) 683 { 684 return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B); 685 } 686 687 extern __inline __m256i 688 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 689 _mm256_sll_epi32 (__m256i __A, __m128i __B) 690 { 691 return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); 692 } 693 694 extern __inline __m256i 695 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 696 _mm256_slli_epi64 (__m256i __A, int __B) 697 { 698 return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B); 699 } 700 701 extern __inline __m256i 702 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 703 _mm256_sll_epi64 (__m256i __A, __m128i __B) 704 { 705 return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); 706 } 707 708 extern __inline __m256i 709 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 710 _mm256_srai_epi16 (__m256i __A, int __B) 711 { 712 return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B); 713 } 714 715 extern __inline __m256i 716 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 717 _mm256_sra_epi16 (__m256i __A, __m128i __B) 718 { 719 return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B); 720 } 721 722 extern __inline __m256i 723 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 724 _mm256_srai_epi32 (__m256i __A, int __B) 725 { 726 return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B); 727 } 728 729 extern __inline __m256i 730 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 731 _mm256_sra_epi32 (__m256i __A, __m128i __B) 732 { 733 return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B); 734 } 735 736 #ifdef __OPTIMIZE__ 737 extern __inline __m256i 738 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 739 _mm256_bsrli_epi128 (__m256i __A, const int __N) 740 { 741 return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); 742 } 743 744 extern __inline __m256i 745 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 746 _mm256_srli_si256 (__m256i __A, const int __N) 747 { 748 return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); 749 } 750 #else 751 #define _mm256_bsrli_epi128(A, N) \ 752 ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) 753 #define _mm256_srli_si256(A, N) \ 754 ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) 755 #endif 756 757 extern __inline __m256i 758 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 759 _mm256_srli_epi16 (__m256i __A, int __B) 760 { 761 return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B); 762 } 763 764 extern __inline __m256i 765 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 766 _mm256_srl_epi16 (__m256i __A, __m128i __B) 767 { 768 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); 769 } 770 771 extern __inline __m256i 772 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 773 _mm256_srli_epi32 (__m256i __A, int __B) 774 { 775 return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B); 776 } 777 778 extern __inline __m256i 779 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 780 _mm256_srl_epi32 (__m256i __A, __m128i __B) 781 { 782 return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B); 783 } 784 785 extern __inline __m256i 786 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 787 _mm256_srli_epi64 (__m256i __A, int __B) 788 { 789 return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B); 790 } 791 792 extern __inline __m256i 793 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 794 _mm256_srl_epi64 (__m256i __A, __m128i __B) 795 { 796 return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B); 797 } 798 799 extern __inline __m256i 800 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 801 _mm256_sub_epi8 (__m256i __A, __m256i __B) 802 { 803 return (__m256i) ((__v32qu)__A - (__v32qu)__B); 804 } 805 806 extern __inline __m256i 807 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 808 _mm256_sub_epi16 (__m256i __A, __m256i __B) 809 { 810 return (__m256i) ((__v16hu)__A - (__v16hu)__B); 811 } 812 813 extern __inline __m256i 814 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 815 _mm256_sub_epi32 (__m256i __A, __m256i __B) 816 { 817 return (__m256i) ((__v8su)__A - (__v8su)__B); 818 } 819 820 extern __inline __m256i 821 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 822 _mm256_sub_epi64 (__m256i __A, __m256i __B) 823 { 824 return (__m256i) ((__v4du)__A - (__v4du)__B); 825 } 826 827 extern __inline __m256i 828 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 829 _mm256_subs_epi8 (__m256i __A, __m256i __B) 830 { 831 return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B); 832 } 833 834 extern __inline __m256i 835 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 836 _mm256_subs_epi16 (__m256i __A, __m256i __B) 837 { 838 return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B); 839 } 840 841 extern __inline __m256i 842 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 843 _mm256_subs_epu8 (__m256i __A, __m256i __B) 844 { 845 return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B); 846 } 847 848 extern __inline __m256i 849 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 850 _mm256_subs_epu16 (__m256i __A, __m256i __B) 851 { 852 return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B); 853 } 854 855 extern __inline __m256i 856 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 857 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B) 858 { 859 return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B); 860 } 861 862 extern __inline __m256i 863 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 864 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B) 865 { 866 return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B); 867 } 868 869 extern __inline __m256i 870 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 871 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B) 872 { 873 return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B); 874 } 875 876 extern __inline __m256i 877 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 878 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B) 879 { 880 return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B); 881 } 882 883 extern __inline __m256i 884 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 885 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B) 886 { 887 return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B); 888 } 889 890 extern __inline __m256i 891 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 892 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B) 893 { 894 return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B); 895 } 896 897 extern __inline __m256i 898 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 899 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B) 900 { 901 return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B); 902 } 903 904 extern __inline __m256i 905 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 906 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B) 907 { 908 return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B); 909 } 910 911 extern __inline __m256i 912 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 913 _mm256_xor_si256 (__m256i __A, __m256i __B) 914 { 915 return (__m256i) ((__v4du)__A ^ (__v4du)__B); 916 } 917 918 extern __inline __m256i 919 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 920 _mm256_stream_load_si256 (__m256i const *__X) 921 { 922 return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X); 923 } 924 925 extern __inline __m128 926 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 927 _mm_broadcastss_ps (__m128 __X) 928 { 929 return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X); 930 } 931 932 extern __inline __m256 933 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 934 _mm256_broadcastss_ps (__m128 __X) 935 { 936 return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X); 937 } 938 939 extern __inline __m256d 940 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 941 _mm256_broadcastsd_pd (__m128d __X) 942 { 943 return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X); 944 } 945 946 extern __inline __m256i 947 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 948 _mm256_broadcastsi128_si256 (__m128i __X) 949 { 950 return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X); 951 } 952 953 #ifdef __OPTIMIZE__ 954 extern __inline __m128i 955 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 956 _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M) 957 { 958 return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X, 959 (__v4si)__Y, 960 __M); 961 } 962 #else 963 #define _mm_blend_epi32(X, Y, M) \ 964 ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \ 965 (__v4si)(__m128i)(Y), (int)(M))) 966 #endif 967 968 #ifdef __OPTIMIZE__ 969 extern __inline __m256i 970 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 971 _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M) 972 { 973 return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X, 974 (__v8si)__Y, 975 __M); 976 } 977 #else 978 #define _mm256_blend_epi32(X, Y, M) \ 979 ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \ 980 (__v8si)(__m256i)(Y), (int)(M))) 981 #endif 982 983 extern __inline __m256i 984 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 985 _mm256_broadcastb_epi8 (__m128i __X) 986 { 987 return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X); 988 } 989 990 extern __inline __m256i 991 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 992 _mm256_broadcastw_epi16 (__m128i __X) 993 { 994 return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X); 995 } 996 997 extern __inline __m256i 998 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 999 _mm256_broadcastd_epi32 (__m128i __X) 1000 { 1001 return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X); 1002 } 1003 1004 extern __inline __m256i 1005 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1006 _mm256_broadcastq_epi64 (__m128i __X) 1007 { 1008 return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X); 1009 } 1010 1011 extern __inline __m128i 1012 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1013 _mm_broadcastb_epi8 (__m128i __X) 1014 { 1015 return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X); 1016 } 1017 1018 extern __inline __m128i 1019 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1020 _mm_broadcastw_epi16 (__m128i __X) 1021 { 1022 return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X); 1023 } 1024 1025 extern __inline __m128i 1026 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1027 _mm_broadcastd_epi32 (__m128i __X) 1028 { 1029 return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X); 1030 } 1031 1032 extern __inline __m128i 1033 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1034 _mm_broadcastq_epi64 (__m128i __X) 1035 { 1036 return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X); 1037 } 1038 1039 extern __inline __m256i 1040 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1041 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y) 1042 { 1043 return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y); 1044 } 1045 1046 #ifdef __OPTIMIZE__ 1047 extern __inline __m256d 1048 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1049 _mm256_permute4x64_pd (__m256d __X, const int __M) 1050 { 1051 return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M); 1052 } 1053 #else 1054 #define _mm256_permute4x64_pd(X, M) \ 1055 ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M))) 1056 #endif 1057 1058 extern __inline __m256 1059 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1060 _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y) 1061 { 1062 return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y); 1063 } 1064 1065 #ifdef __OPTIMIZE__ 1066 extern __inline __m256i 1067 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1068 _mm256_permute4x64_epi64 (__m256i __X, const int __M) 1069 { 1070 return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M); 1071 } 1072 #else 1073 #define _mm256_permute4x64_epi64(X, M) \ 1074 ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M))) 1075 #endif 1076 1077 1078 #ifdef __OPTIMIZE__ 1079 extern __inline __m256i 1080 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1081 _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M) 1082 { 1083 return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M); 1084 } 1085 #else 1086 #define _mm256_permute2x128_si256(X, Y, M) \ 1087 ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M))) 1088 #endif 1089 1090 #ifdef __OPTIMIZE__ 1091 extern __inline __m128i 1092 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1093 _mm256_extracti128_si256 (__m256i __X, const int __M) 1094 { 1095 return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M); 1096 } 1097 #else 1098 #define _mm256_extracti128_si256(X, M) \ 1099 ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M))) 1100 #endif 1101 1102 #ifdef __OPTIMIZE__ 1103 extern __inline __m256i 1104 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1105 _mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M) 1106 { 1107 return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M); 1108 } 1109 #else 1110 #define _mm256_inserti128_si256(X, Y, M) \ 1111 ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \ 1112 (__v2di)(__m128i)(Y), \ 1113 (int)(M))) 1114 #endif 1115 1116 extern __inline __m256i 1117 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1118 _mm256_maskload_epi32 (int const *__X, __m256i __M ) 1119 { 1120 return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X, 1121 (__v8si)__M); 1122 } 1123 1124 extern __inline __m256i 1125 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1126 _mm256_maskload_epi64 (long long const *__X, __m256i __M ) 1127 { 1128 return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X, 1129 (__v4di)__M); 1130 } 1131 1132 extern __inline __m128i 1133 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1134 _mm_maskload_epi32 (int const *__X, __m128i __M ) 1135 { 1136 return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X, 1137 (__v4si)__M); 1138 } 1139 1140 extern __inline __m128i 1141 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1142 _mm_maskload_epi64 (long long const *__X, __m128i __M ) 1143 { 1144 return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X, 1145 (__v2di)__M); 1146 } 1147 1148 extern __inline void 1149 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1150 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y ) 1151 { 1152 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 1153 } 1154 1155 extern __inline void 1156 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1157 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y ) 1158 { 1159 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 1160 } 1161 1162 extern __inline void 1163 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1164 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y ) 1165 { 1166 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 1167 } 1168 1169 extern __inline void 1170 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1171 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y ) 1172 { 1173 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 1174 } 1175 1176 extern __inline __m256i 1177 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1178 _mm256_sllv_epi32 (__m256i __X, __m256i __Y) 1179 { 1180 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y); 1181 } 1182 1183 extern __inline __m128i 1184 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1185 _mm_sllv_epi32 (__m128i __X, __m128i __Y) 1186 { 1187 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y); 1188 } 1189 1190 extern __inline __m256i 1191 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1192 _mm256_sllv_epi64 (__m256i __X, __m256i __Y) 1193 { 1194 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y); 1195 } 1196 1197 extern __inline __m128i 1198 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1199 _mm_sllv_epi64 (__m128i __X, __m128i __Y) 1200 { 1201 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y); 1202 } 1203 1204 extern __inline __m256i 1205 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1206 _mm256_srav_epi32 (__m256i __X, __m256i __Y) 1207 { 1208 return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y); 1209 } 1210 1211 extern __inline __m128i 1212 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1213 _mm_srav_epi32 (__m128i __X, __m128i __Y) 1214 { 1215 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y); 1216 } 1217 1218 extern __inline __m256i 1219 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1220 _mm256_srlv_epi32 (__m256i __X, __m256i __Y) 1221 { 1222 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y); 1223 } 1224 1225 extern __inline __m128i 1226 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1227 _mm_srlv_epi32 (__m128i __X, __m128i __Y) 1228 { 1229 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y); 1230 } 1231 1232 extern __inline __m256i 1233 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1234 _mm256_srlv_epi64 (__m256i __X, __m256i __Y) 1235 { 1236 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y); 1237 } 1238 1239 extern __inline __m128i 1240 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1241 _mm_srlv_epi64 (__m128i __X, __m128i __Y) 1242 { 1243 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y); 1244 } 1245 1246 #ifdef __OPTIMIZE__ 1247 extern __inline __m128d 1248 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1249 _mm_i32gather_pd (double const *__base, __m128i __index, const int __scale) 1250 { 1251 __v2df __zero = _mm_setzero_pd (); 1252 __v2df __mask = _mm_cmpeq_pd (__zero, __zero); 1253 1254 return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (), 1255 __base, 1256 (__v4si)__index, 1257 __mask, 1258 __scale); 1259 } 1260 1261 extern __inline __m128d 1262 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1263 _mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index, 1264 __m128d __mask, const int __scale) 1265 { 1266 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src, 1267 __base, 1268 (__v4si)__index, 1269 (__v2df)__mask, 1270 __scale); 1271 } 1272 1273 extern __inline __m256d 1274 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1275 _mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale) 1276 { 1277 __v4df __zero = _mm256_setzero_pd (); 1278 __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ); 1279 1280 return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (), 1281 __base, 1282 (__v4si)__index, 1283 __mask, 1284 __scale); 1285 } 1286 1287 extern __inline __m256d 1288 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1289 _mm256_mask_i32gather_pd (__m256d __src, double const *__base, 1290 __m128i __index, __m256d __mask, const int __scale) 1291 { 1292 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src, 1293 __base, 1294 (__v4si)__index, 1295 (__v4df)__mask, 1296 __scale); 1297 } 1298 1299 extern __inline __m128d 1300 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1301 _mm_i64gather_pd (double const *__base, __m128i __index, const int __scale) 1302 { 1303 __v2df __src = _mm_setzero_pd (); 1304 __v2df __mask = _mm_cmpeq_pd (__src, __src); 1305 1306 return (__m128d) __builtin_ia32_gatherdiv2df (__src, 1307 __base, 1308 (__v2di)__index, 1309 __mask, 1310 __scale); 1311 } 1312 1313 extern __inline __m128d 1314 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1315 _mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index, 1316 __m128d __mask, const int __scale) 1317 { 1318 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src, 1319 __base, 1320 (__v2di)__index, 1321 (__v2df)__mask, 1322 __scale); 1323 } 1324 1325 extern __inline __m256d 1326 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1327 _mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale) 1328 { 1329 __v4df __src = _mm256_setzero_pd (); 1330 __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ); 1331 1332 return (__m256d) __builtin_ia32_gatherdiv4df (__src, 1333 __base, 1334 (__v4di)__index, 1335 __mask, 1336 __scale); 1337 } 1338 1339 extern __inline __m256d 1340 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1341 _mm256_mask_i64gather_pd (__m256d __src, double const *__base, 1342 __m256i __index, __m256d __mask, const int __scale) 1343 { 1344 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src, 1345 __base, 1346 (__v4di)__index, 1347 (__v4df)__mask, 1348 __scale); 1349 } 1350 1351 extern __inline __m128 1352 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1353 _mm_i32gather_ps (float const *__base, __m128i __index, const int __scale) 1354 { 1355 __v4sf __src = _mm_setzero_ps (); 1356 __v4sf __mask = _mm_cmpeq_ps (__src, __src); 1357 1358 return (__m128) __builtin_ia32_gathersiv4sf (__src, 1359 __base, 1360 (__v4si)__index, 1361 __mask, 1362 __scale); 1363 } 1364 1365 extern __inline __m128 1366 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1367 _mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index, 1368 __m128 __mask, const int __scale) 1369 { 1370 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src, 1371 __base, 1372 (__v4si)__index, 1373 (__v4sf)__mask, 1374 __scale); 1375 } 1376 1377 extern __inline __m256 1378 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1379 _mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale) 1380 { 1381 __v8sf __src = _mm256_setzero_ps (); 1382 __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ); 1383 1384 return (__m256) __builtin_ia32_gathersiv8sf (__src, 1385 __base, 1386 (__v8si)__index, 1387 __mask, 1388 __scale); 1389 } 1390 1391 extern __inline __m256 1392 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1393 _mm256_mask_i32gather_ps (__m256 __src, float const *__base, 1394 __m256i __index, __m256 __mask, const int __scale) 1395 { 1396 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src, 1397 __base, 1398 (__v8si)__index, 1399 (__v8sf)__mask, 1400 __scale); 1401 } 1402 1403 extern __inline __m128 1404 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1405 _mm_i64gather_ps (float const *__base, __m128i __index, const int __scale) 1406 { 1407 __v4sf __src = _mm_setzero_ps (); 1408 __v4sf __mask = _mm_cmpeq_ps (__src, __src); 1409 1410 return (__m128) __builtin_ia32_gatherdiv4sf (__src, 1411 __base, 1412 (__v2di)__index, 1413 __mask, 1414 __scale); 1415 } 1416 1417 extern __inline __m128 1418 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1419 _mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index, 1420 __m128 __mask, const int __scale) 1421 { 1422 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src, 1423 __base, 1424 (__v2di)__index, 1425 (__v4sf)__mask, 1426 __scale); 1427 } 1428 1429 extern __inline __m128 1430 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1431 _mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale) 1432 { 1433 __v4sf __src = _mm_setzero_ps (); 1434 __v4sf __mask = _mm_cmpeq_ps (__src, __src); 1435 1436 return (__m128) __builtin_ia32_gatherdiv4sf256 (__src, 1437 __base, 1438 (__v4di)__index, 1439 __mask, 1440 __scale); 1441 } 1442 1443 extern __inline __m128 1444 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1445 _mm256_mask_i64gather_ps (__m128 __src, float const *__base, 1446 __m256i __index, __m128 __mask, const int __scale) 1447 { 1448 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src, 1449 __base, 1450 (__v4di)__index, 1451 (__v4sf)__mask, 1452 __scale); 1453 } 1454 1455 extern __inline __m128i 1456 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1457 _mm_i32gather_epi64 (long long int const *__base, 1458 __m128i __index, const int __scale) 1459 { 1460 __v2di __src = __extension__ (__v2di){ 0, 0 }; 1461 __v2di __mask = __extension__ (__v2di){ ~0, ~0 }; 1462 1463 return (__m128i) __builtin_ia32_gathersiv2di (__src, 1464 __base, 1465 (__v4si)__index, 1466 __mask, 1467 __scale); 1468 } 1469 1470 extern __inline __m128i 1471 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1472 _mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base, 1473 __m128i __index, __m128i __mask, const int __scale) 1474 { 1475 return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src, 1476 __base, 1477 (__v4si)__index, 1478 (__v2di)__mask, 1479 __scale); 1480 } 1481 1482 extern __inline __m256i 1483 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1484 _mm256_i32gather_epi64 (long long int const *__base, 1485 __m128i __index, const int __scale) 1486 { 1487 __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1488 __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1489 1490 return (__m256i) __builtin_ia32_gathersiv4di (__src, 1491 __base, 1492 (__v4si)__index, 1493 __mask, 1494 __scale); 1495 } 1496 1497 extern __inline __m256i 1498 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1499 _mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base, 1500 __m128i __index, __m256i __mask, 1501 const int __scale) 1502 { 1503 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src, 1504 __base, 1505 (__v4si)__index, 1506 (__v4di)__mask, 1507 __scale); 1508 } 1509 1510 extern __inline __m128i 1511 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1512 _mm_i64gather_epi64 (long long int const *__base, 1513 __m128i __index, const int __scale) 1514 { 1515 __v2di __src = __extension__ (__v2di){ 0, 0 }; 1516 __v2di __mask = __extension__ (__v2di){ ~0, ~0 }; 1517 1518 return (__m128i) __builtin_ia32_gatherdiv2di (__src, 1519 __base, 1520 (__v2di)__index, 1521 __mask, 1522 __scale); 1523 } 1524 1525 extern __inline __m128i 1526 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1527 _mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base, 1528 __m128i __index, __m128i __mask, const int __scale) 1529 { 1530 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src, 1531 __base, 1532 (__v2di)__index, 1533 (__v2di)__mask, 1534 __scale); 1535 } 1536 1537 extern __inline __m256i 1538 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1539 _mm256_i64gather_epi64 (long long int const *__base, 1540 __m256i __index, const int __scale) 1541 { 1542 __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1543 __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1544 1545 return (__m256i) __builtin_ia32_gatherdiv4di (__src, 1546 __base, 1547 (__v4di)__index, 1548 __mask, 1549 __scale); 1550 } 1551 1552 extern __inline __m256i 1553 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1554 _mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base, 1555 __m256i __index, __m256i __mask, 1556 const int __scale) 1557 { 1558 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src, 1559 __base, 1560 (__v4di)__index, 1561 (__v4di)__mask, 1562 __scale); 1563 } 1564 1565 extern __inline __m128i 1566 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1567 _mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale) 1568 { 1569 __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1570 __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1571 1572 return (__m128i) __builtin_ia32_gathersiv4si (__src, 1573 __base, 1574 (__v4si)__index, 1575 __mask, 1576 __scale); 1577 } 1578 1579 extern __inline __m128i 1580 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1581 _mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index, 1582 __m128i __mask, const int __scale) 1583 { 1584 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src, 1585 __base, 1586 (__v4si)__index, 1587 (__v4si)__mask, 1588 __scale); 1589 } 1590 1591 extern __inline __m256i 1592 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1593 _mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale) 1594 { 1595 __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; 1596 __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; 1597 1598 return (__m256i) __builtin_ia32_gathersiv8si (__src, 1599 __base, 1600 (__v8si)__index, 1601 __mask, 1602 __scale); 1603 } 1604 1605 extern __inline __m256i 1606 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1607 _mm256_mask_i32gather_epi32 (__m256i __src, int const *__base, 1608 __m256i __index, __m256i __mask, 1609 const int __scale) 1610 { 1611 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src, 1612 __base, 1613 (__v8si)__index, 1614 (__v8si)__mask, 1615 __scale); 1616 } 1617 1618 extern __inline __m128i 1619 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1620 _mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale) 1621 { 1622 __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1623 __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1624 1625 return (__m128i) __builtin_ia32_gatherdiv4si (__src, 1626 __base, 1627 (__v2di)__index, 1628 __mask, 1629 __scale); 1630 } 1631 1632 extern __inline __m128i 1633 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1634 _mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index, 1635 __m128i __mask, const int __scale) 1636 { 1637 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src, 1638 __base, 1639 (__v2di)__index, 1640 (__v4si)__mask, 1641 __scale); 1642 } 1643 1644 extern __inline __m128i 1645 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1646 _mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale) 1647 { 1648 __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1649 __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1650 1651 return (__m128i) __builtin_ia32_gatherdiv4si256 (__src, 1652 __base, 1653 (__v4di)__index, 1654 __mask, 1655 __scale); 1656 } 1657 1658 extern __inline __m128i 1659 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1660 _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, 1661 __m256i __index, __m128i __mask, 1662 const int __scale) 1663 { 1664 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src, 1665 __base, 1666 (__v4di)__index, 1667 (__v4si)__mask, 1668 __scale); 1669 } 1670 #else /* __OPTIMIZE__ */ 1671 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \ 1672 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \ 1673 (double const *)BASE, \ 1674 (__v4si)(__m128i)INDEX, \ 1675 (__v2df)_mm_set1_pd( \ 1676 (double)(long long int) -1), \ 1677 (int)SCALE) 1678 1679 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1680 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \ 1681 (double const *)BASE, \ 1682 (__v4si)(__m128i)INDEX, \ 1683 (__v2df)(__m128d)MASK, \ 1684 (int)SCALE) 1685 1686 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ 1687 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \ 1688 (double const *)BASE, \ 1689 (__v4si)(__m128i)INDEX, \ 1690 (__v4df)_mm256_set1_pd( \ 1691 (double)(long long int) -1), \ 1692 (int)SCALE) 1693 1694 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1695 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \ 1696 (double const *)BASE, \ 1697 (__v4si)(__m128i)INDEX, \ 1698 (__v4df)(__m256d)MASK, \ 1699 (int)SCALE) 1700 1701 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \ 1702 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ 1703 (double const *)BASE, \ 1704 (__v2di)(__m128i)INDEX, \ 1705 (__v2df)_mm_set1_pd( \ 1706 (double)(long long int) -1), \ 1707 (int)SCALE) 1708 1709 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1710 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \ 1711 (double const *)BASE, \ 1712 (__v2di)(__m128i)INDEX, \ 1713 (__v2df)(__m128d)MASK, \ 1714 (int)SCALE) 1715 1716 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ 1717 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ 1718 (double const *)BASE, \ 1719 (__v4di)(__m256i)INDEX, \ 1720 (__v4df)_mm256_set1_pd( \ 1721 (double)(long long int) -1), \ 1722 (int)SCALE) 1723 1724 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1725 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \ 1726 (double const *)BASE, \ 1727 (__v4di)(__m256i)INDEX, \ 1728 (__v4df)(__m256d)MASK, \ 1729 (int)SCALE) 1730 1731 #define _mm_i32gather_ps(BASE, INDEX, SCALE) \ 1732 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ 1733 (float const *)BASE, \ 1734 (__v4si)(__m128i)INDEX, \ 1735 _mm_set1_ps ((float)(int) -1), \ 1736 (int)SCALE) 1737 1738 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1739 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \ 1740 (float const *)BASE, \ 1741 (__v4si)(__m128i)INDEX, \ 1742 (__v4sf)(__m128d)MASK, \ 1743 (int)SCALE) 1744 1745 #define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ 1746 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ 1747 (float const *)BASE, \ 1748 (__v8si)(__m256i)INDEX, \ 1749 (__v8sf)_mm256_set1_ps ( \ 1750 (float)(int) -1), \ 1751 (int)SCALE) 1752 1753 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1754 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \ 1755 (float const *)BASE, \ 1756 (__v8si)(__m256i)INDEX, \ 1757 (__v8sf)(__m256d)MASK, \ 1758 (int)SCALE) 1759 1760 #define _mm_i64gather_ps(BASE, INDEX, SCALE) \ 1761 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ 1762 (float const *)BASE, \ 1763 (__v2di)(__m128i)INDEX, \ 1764 (__v4sf)_mm_set1_ps ( \ 1765 (float)(int) -1), \ 1766 (int)SCALE) 1767 1768 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1769 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \ 1770 (float const *)BASE, \ 1771 (__v2di)(__m128i)INDEX, \ 1772 (__v4sf)(__m128d)MASK, \ 1773 (int)SCALE) 1774 1775 #define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ 1776 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ 1777 (float const *)BASE, \ 1778 (__v4di)(__m256i)INDEX, \ 1779 (__v4sf)_mm_set1_ps( \ 1780 (float)(int) -1), \ 1781 (int)SCALE) 1782 1783 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1784 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \ 1785 (float const *)BASE, \ 1786 (__v4di)(__m256i)INDEX, \ 1787 (__v4sf)(__m128)MASK, \ 1788 (int)SCALE) 1789 1790 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ 1791 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ 1792 (long long const *)BASE, \ 1793 (__v4si)(__m128i)INDEX, \ 1794 (__v2di)_mm_set1_epi64x (-1), \ 1795 (int)SCALE) 1796 1797 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1798 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \ 1799 (long long const *)BASE, \ 1800 (__v4si)(__m128i)INDEX, \ 1801 (__v2di)(__m128i)MASK, \ 1802 (int)SCALE) 1803 1804 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ 1805 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ 1806 (long long const *)BASE, \ 1807 (__v4si)(__m128i)INDEX, \ 1808 (__v4di)_mm256_set1_epi64x (-1), \ 1809 (int)SCALE) 1810 1811 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1812 (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \ 1813 (long long const *)BASE, \ 1814 (__v4si)(__m128i)INDEX, \ 1815 (__v4di)(__m256i)MASK, \ 1816 (int)SCALE) 1817 1818 #define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ 1819 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \ 1820 (long long const *)BASE, \ 1821 (__v2di)(__m128i)INDEX, \ 1822 (__v2di)_mm_set1_epi64x (-1), \ 1823 (int)SCALE) 1824 1825 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1826 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \ 1827 (long long const *)BASE, \ 1828 (__v2di)(__m128i)INDEX, \ 1829 (__v2di)(__m128i)MASK, \ 1830 (int)SCALE) 1831 1832 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ 1833 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \ 1834 (long long const *)BASE, \ 1835 (__v4di)(__m256i)INDEX, \ 1836 (__v4di)_mm256_set1_epi64x (-1), \ 1837 (int)SCALE) 1838 1839 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1840 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \ 1841 (long long const *)BASE, \ 1842 (__v4di)(__m256i)INDEX, \ 1843 (__v4di)(__m256i)MASK, \ 1844 (int)SCALE) 1845 1846 #define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ 1847 (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \ 1848 (int const *)BASE, \ 1849 (__v4si)(__m128i)INDEX, \ 1850 (__v4si)_mm_set1_epi32 (-1), \ 1851 (int)SCALE) 1852 1853 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1854 (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \ 1855 (int const *)BASE, \ 1856 (__v4si)(__m128i)INDEX, \ 1857 (__v4si)(__m128i)MASK, \ 1858 (int)SCALE) 1859 1860 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ 1861 (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \ 1862 (int const *)BASE, \ 1863 (__v8si)(__m256i)INDEX, \ 1864 (__v8si)_mm256_set1_epi32 (-1), \ 1865 (int)SCALE) 1866 1867 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1868 (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \ 1869 (int const *)BASE, \ 1870 (__v8si)(__m256i)INDEX, \ 1871 (__v8si)(__m256i)MASK, \ 1872 (int)SCALE) 1873 1874 #define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ 1875 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \ 1876 (int const *)BASE, \ 1877 (__v2di)(__m128i)INDEX, \ 1878 (__v4si)_mm_set1_epi32 (-1), \ 1879 (int)SCALE) 1880 1881 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1882 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \ 1883 (int const *)BASE, \ 1884 (__v2di)(__m128i)INDEX, \ 1885 (__v4si)(__m128i)MASK, \ 1886 (int)SCALE) 1887 1888 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ 1889 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \ 1890 (int const *)BASE, \ 1891 (__v4di)(__m256i)INDEX, \ 1892 (__v4si)_mm_set1_epi32(-1), \ 1893 (int)SCALE) 1894 1895 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1896 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \ 1897 (int const *)BASE, \ 1898 (__v4di)(__m256i)INDEX, \ 1899 (__v4si)(__m128i)MASK, \ 1900 (int)SCALE) 1901 #endif /* __OPTIMIZE__ */ 1902 1903 #ifdef __DISABLE_AVX2__ 1904 #undef __DISABLE_AVX2__ 1905 #pragma GCC pop_options 1906 #endif /* __DISABLE_AVX2__ */ 1907 1908 #endif /* _AVX2INTRIN_H_INCLUDED */ 1909