/* Copyright (C) 2011
   Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header only works through <immintrin.h>, which sets up the
   vector types and target support this file relies on.  */
#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  The mask must be
   a compile-time constant; without __OPTIMIZE__ the inline function
   cannot guarantee that, so a macro form is used instead.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif

/* Absolute value of packed signed 8-, 16- and 32-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}

/* Pack with signed (packs) or unsigned (packus) saturation: narrow
   each source element to half its width, concatenating per lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}

/* Packed integer addition, modular (add) and saturating (adds),
   the latter in signed (epi) and unsigned (epu) flavours.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Per-lane byte-wise right shift of the concatenation of the two
   operands; __N is in bytes, the builtin wants bits.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* Without optimization (__N * 8) would end up in a vector register and
   the insn would not be matched, so use a macro instead.  */
#define _mm256_alignr_epi8(A, B, N)					\
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),		\
					(__v4di)(__m256i)(B),		\
					(int)(N) * 8))
#endif

/* Bitwise logic on the whole 256-bit register.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

/* Rounded averages of unsigned 8/16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Byte blend controlled by the sign bit of each byte of __M.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
					       (__v32qi)__Y,
					       (__v32qi)__M);
}

/* Word blend with an immediate mask (constant required, hence the
   macro fallback).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					      __M);
}
#else
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif

/* Element-wise equality comparison, producing all-ones/all-zeros
   element masks.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
}

/* Element-wise signed greater-than comparison.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
					     (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
					     (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
					     (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
}

/* Horizontal add/subtract of adjacent element pairs, with a signed
   saturating variant (hadds/hsubs) for 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* Multiply-accumulate: maddubs multiplies unsigned bytes of __X by
   signed bytes of __Y and adds adjacent products with saturation;
   madd multiplies signed words and adds adjacent products to dwords.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
						(__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
					     (__v16hi)__B);
}

/* Element-wise maximum, signed and unsigned.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

/* Element-wise minimum, signed and unsigned.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}

/* Collect the sign bit of each byte into a 32-bit scalar mask.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}

/* Sign-extending widening conversions from the low elements of a
   128-bit source.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

/* Zero-extending widening conversions from the low elements of a
   128-bit source.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}

/* Multiplies: full 32x32->64 products of the even elements (signed
   mul_epi32, unsigned mul_epu32), high/low halves of 16-bit products,
   rounded-scaled products (mulhrs), and low 32-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
					       (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
}

/* Sum of absolute byte differences, accumulated per 8-byte group.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* Byte shuffle within each 128-bit lane, control taken from __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
					     (__v32qi)__Y);
}

/* Immediate-controlled shuffles (mask must be constant, hence the
   macro fallback without __OPTIMIZE__).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N)					\
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N)					\
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N)					\
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif

/* Negate, pass through or zero each element of __X according to the
   sign of the corresponding element of __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}

/* Byte-wise left shift of each 128-bit lane; __N is in bytes, the
   builtin takes bits.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_slli_si256(A, N)						\
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* Logical left shifts: slli takes an immediate count, sll takes the
   count from the low 64 bits of a vector.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

/* Arithmetic right shifts (sign-propagating).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}

/* Byte-wise right shift of each 128-bit lane; __N is in bytes.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_srli_si256(A, N)						\
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* Logical right shifts, immediate (srli) and vector-count (srl).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}

/* Packed integer subtraction, modular (sub) and saturating (subs).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Interleave the high halves of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

/* Interleave the low halves of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
}

/* Non-temporal aligned load (bypasses the cache hierarchy).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

/* Broadcast the lowest element of the source to every element of the
   destination.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}

/* Dword blend with an immediate mask, 128- and 256-bit forms.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif

/* Broadcast the lowest 8/16/32/64-bit element across 256 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

/* Broadcast the lowest 8/16/32/64-bit element across 128 bits.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}

/* Cross-lane permutes: variable dword permute and immediate qword
   permute (constant required for the latter, hence the macros).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)					\
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)					\
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


/* Select/shuffle two 128-bit lanes out of the pair of sources.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

/* Extract or insert a 128-bit lane, selected by immediate __M.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)					\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X),	\
					   (__v2di)(__m128i)(Y),	\
					   (int)(M)))
#endif

/* Masked loads: elements whose mask sign bit is clear are zeroed and
   their memory is not touched.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i)
__builtin_ia32_maskloadd ((const __v4si *)__X, 1114 (__v4si)__M); 1115 } 1116 1117 extern __inline __m128i 1118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1119 _mm_maskload_epi64 (long long const *__X, __m128i __M ) 1120 { 1121 return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X, 1122 (__v2di)__M); 1123 } 1124 1125 extern __inline void 1126 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1127 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y ) 1128 { 1129 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 1130 } 1131 1132 extern __inline void 1133 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1134 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y ) 1135 { 1136 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 1137 } 1138 1139 extern __inline void 1140 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1141 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y ) 1142 { 1143 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 1144 } 1145 1146 extern __inline void 1147 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1148 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y ) 1149 { 1150 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 1151 } 1152 1153 extern __inline __m256i 1154 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1155 _mm256_sllv_epi32 (__m256i __X, __m256i __Y) 1156 { 1157 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y); 1158 } 1159 1160 extern __inline __m128i 1161 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1162 _mm_sllv_epi32 (__m128i __X, __m128i __Y) 1163 { 1164 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y); 1165 } 1166 1167 extern __inline __m256i 1168 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1169 
_mm256_sllv_epi64 (__m256i __X, __m256i __Y) 1170 { 1171 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y); 1172 } 1173 1174 extern __inline __m128i 1175 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1176 _mm_sllv_epi64 (__m128i __X, __m128i __Y) 1177 { 1178 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y); 1179 } 1180 1181 extern __inline __m256i 1182 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1183 _mm256_srav_epi32 (__m256i __X, __m256i __Y) 1184 { 1185 return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y); 1186 } 1187 1188 extern __inline __m128i 1189 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1190 _mm_srav_epi32 (__m128i __X, __m128i __Y) 1191 { 1192 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y); 1193 } 1194 1195 extern __inline __m256i 1196 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1197 _mm256_srlv_epi32 (__m256i __X, __m256i __Y) 1198 { 1199 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y); 1200 } 1201 1202 extern __inline __m128i 1203 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1204 _mm_srlv_epi32 (__m128i __X, __m128i __Y) 1205 { 1206 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y); 1207 } 1208 1209 extern __inline __m256i 1210 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1211 _mm256_srlv_epi64 (__m256i __X, __m256i __Y) 1212 { 1213 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y); 1214 } 1215 1216 extern __inline __m128i 1217 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1218 _mm_srlv_epi64 (__m128i __X, __m128i __Y) 1219 { 1220 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y); 1221 } 1222 1223 #ifdef __OPTIMIZE__ 1224 extern __inline __m128d 1225 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1226 _mm_i32gather_pd (double 
const *base, __m128i index, const int scale) 1227 { 1228 __v2df src = _mm_setzero_pd (); 1229 __v2df mask = _mm_cmpeq_pd (src, src); 1230 1231 return (__m128d) __builtin_ia32_gathersiv2df (src, 1232 base, 1233 (__v4si)index, 1234 mask, 1235 scale); 1236 } 1237 1238 extern __inline __m128d 1239 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1240 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index, 1241 __m128d mask, const int scale) 1242 { 1243 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src, 1244 base, 1245 (__v4si)index, 1246 (__v2df)mask, 1247 scale); 1248 } 1249 1250 extern __inline __m256d 1251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1252 _mm256_i32gather_pd (double const *base, __m128i index, const int scale) 1253 { 1254 __v4df src = _mm256_setzero_pd (); 1255 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ); 1256 1257 return (__m256d) __builtin_ia32_gathersiv4df (src, 1258 base, 1259 (__v4si)index, 1260 mask, 1261 scale); 1262 } 1263 1264 extern __inline __m256d 1265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1266 _mm256_mask_i32gather_pd (__m256d src, double const *base, 1267 __m128i index, __m256d mask, const int scale) 1268 { 1269 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src, 1270 base, 1271 (__v4si)index, 1272 (__v4df)mask, 1273 scale); 1274 } 1275 1276 extern __inline __m128d 1277 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1278 _mm_i64gather_pd (double const *base, __m128i index, const int scale) 1279 { 1280 __v2df src = _mm_setzero_pd (); 1281 __v2df mask = _mm_cmpeq_pd (src, src); 1282 1283 return (__m128d) __builtin_ia32_gatherdiv2df (src, 1284 base, 1285 (__v2di)index, 1286 mask, 1287 scale); 1288 } 1289 1290 extern __inline __m128d 1291 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1292 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index, 1293 __m128d mask, const int 
scale) 1294 { 1295 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src, 1296 base, 1297 (__v2di)index, 1298 (__v2df)mask, 1299 scale); 1300 } 1301 1302 extern __inline __m256d 1303 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1304 _mm256_i64gather_pd (double const *base, __m256i index, const int scale) 1305 { 1306 __v4df src = _mm256_setzero_pd (); 1307 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ); 1308 1309 return (__m256d) __builtin_ia32_gatherdiv4df (src, 1310 base, 1311 (__v4di)index, 1312 mask, 1313 scale); 1314 } 1315 1316 extern __inline __m256d 1317 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1318 _mm256_mask_i64gather_pd (__m256d src, double const *base, 1319 __m256i index, __m256d mask, const int scale) 1320 { 1321 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src, 1322 base, 1323 (__v4di)index, 1324 (__v4df)mask, 1325 scale); 1326 } 1327 1328 extern __inline __m128 1329 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1330 _mm_i32gather_ps (float const *base, __m128i index, const int scale) 1331 { 1332 __v4sf src = _mm_setzero_ps (); 1333 __v4sf mask = _mm_cmpeq_ps (src, src); 1334 1335 return (__m128) __builtin_ia32_gathersiv4sf (src, 1336 base, 1337 (__v4si)index, 1338 mask, 1339 scale); 1340 } 1341 1342 extern __inline __m128 1343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1344 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index, 1345 __m128 mask, const int scale) 1346 { 1347 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src, 1348 base, 1349 (__v4si)index, 1350 (__v4sf)mask, 1351 scale); 1352 } 1353 1354 extern __inline __m256 1355 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1356 _mm256_i32gather_ps (float const *base, __m256i index, const int scale) 1357 { 1358 __v8sf src = _mm256_setzero_ps (); 1359 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); 1360 1361 return (__m256) 
__builtin_ia32_gathersiv8sf (src, 1362 base, 1363 (__v8si)index, 1364 mask, 1365 scale); 1366 } 1367 1368 extern __inline __m256 1369 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1370 _mm256_mask_i32gather_ps (__m256 src, float const *base, 1371 __m256i index, __m256 mask, const int scale) 1372 { 1373 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src, 1374 base, 1375 (__v8si)index, 1376 (__v8sf)mask, 1377 scale); 1378 } 1379 1380 extern __inline __m128 1381 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1382 _mm_i64gather_ps (float const *base, __m128i index, const int scale) 1383 { 1384 __v4sf src = _mm_setzero_ps (); 1385 __v4sf mask = _mm_cmpeq_ps (src, src); 1386 1387 return (__m128) __builtin_ia32_gatherdiv4sf (src, 1388 base, 1389 (__v2di)index, 1390 mask, 1391 scale); 1392 } 1393 1394 extern __inline __m128 1395 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1396 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index, 1397 __m128 mask, const int scale) 1398 { 1399 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src, 1400 base, 1401 (__v2di)index, 1402 (__v4sf)mask, 1403 scale); 1404 } 1405 1406 extern __inline __m128 1407 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1408 _mm256_i64gather_ps (float const *base, __m256i index, const int scale) 1409 { 1410 __v4sf src = _mm_setzero_ps (); 1411 __v4sf mask = _mm_cmpeq_ps (src, src); 1412 1413 return (__m128) __builtin_ia32_gatherdiv4sf256 (src, 1414 base, 1415 (__v4di)index, 1416 mask, 1417 scale); 1418 } 1419 1420 extern __inline __m128 1421 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1422 _mm256_mask_i64gather_ps (__m128 src, float const *base, 1423 __m256i index, __m128 mask, const int scale) 1424 { 1425 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src, 1426 base, 1427 (__v4di)index, 1428 (__v4sf)mask, 1429 scale); 1430 } 1431 1432 extern __inline __m128i 1433 
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1434 _mm_i32gather_epi64 (long long int const *base, 1435 __m128i index, const int scale) 1436 { 1437 __v2di src = __extension__ (__v2di){ 0, 0 }; 1438 __v2di mask = __extension__ (__v2di){ ~0, ~0 }; 1439 1440 return (__m128i) __builtin_ia32_gathersiv2di (src, 1441 base, 1442 (__v4si)index, 1443 mask, 1444 scale); 1445 } 1446 1447 extern __inline __m128i 1448 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1449 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base, 1450 __m128i index, __m128i mask, const int scale) 1451 { 1452 return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src, 1453 base, 1454 (__v4si)index, 1455 (__v2di)mask, 1456 scale); 1457 } 1458 1459 extern __inline __m256i 1460 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1461 _mm256_i32gather_epi64 (long long int const *base, 1462 __m128i index, const int scale) 1463 { 1464 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1465 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1466 1467 return (__m256i) __builtin_ia32_gathersiv4di (src, 1468 base, 1469 (__v4si)index, 1470 mask, 1471 scale); 1472 } 1473 1474 extern __inline __m256i 1475 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1476 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base, 1477 __m128i index, __m256i mask, const int scale) 1478 { 1479 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src, 1480 base, 1481 (__v4si)index, 1482 (__v4di)mask, 1483 scale); 1484 } 1485 1486 extern __inline __m128i 1487 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1488 _mm_i64gather_epi64 (long long int const *base, 1489 __m128i index, const int scale) 1490 { 1491 __v2di src = __extension__ (__v2di){ 0, 0 }; 1492 __v2di mask = __extension__ (__v2di){ ~0, ~0 }; 1493 1494 return (__m128i) __builtin_ia32_gatherdiv2di (src, 1495 base, 1496 (__v2di)index, 1497 mask, 1498 
scale); 1499 } 1500 1501 extern __inline __m128i 1502 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1503 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index, 1504 __m128i mask, const int scale) 1505 { 1506 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src, 1507 base, 1508 (__v2di)index, 1509 (__v2di)mask, 1510 scale); 1511 } 1512 1513 extern __inline __m256i 1514 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1515 _mm256_i64gather_epi64 (long long int const *base, 1516 __m256i index, const int scale) 1517 { 1518 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1519 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1520 1521 return (__m256i) __builtin_ia32_gatherdiv4di (src, 1522 base, 1523 (__v4di)index, 1524 mask, 1525 scale); 1526 } 1527 1528 extern __inline __m256i 1529 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1530 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base, 1531 __m256i index, __m256i mask, const int scale) 1532 { 1533 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src, 1534 base, 1535 (__v4di)index, 1536 (__v4di)mask, 1537 scale); 1538 } 1539 1540 extern __inline __m128i 1541 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1542 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale) 1543 { 1544 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1545 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1546 1547 return (__m128i) __builtin_ia32_gathersiv4si (src, 1548 base, 1549 (__v4si)index, 1550 mask, 1551 scale); 1552 } 1553 1554 extern __inline __m128i 1555 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1556 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index, 1557 __m128i mask, const int scale) 1558 { 1559 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src, 1560 base, 1561 (__v4si)index, 1562 (__v4si)mask, 1563 scale); 1564 } 1565 
1566 extern __inline __m256i 1567 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1568 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale) 1569 { 1570 __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; 1571 __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; 1572 1573 return (__m256i) __builtin_ia32_gathersiv8si (src, 1574 base, 1575 (__v8si)index, 1576 mask, 1577 scale); 1578 } 1579 1580 extern __inline __m256i 1581 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1582 _mm256_mask_i32gather_epi32 (__m256i src, int const *base, 1583 __m256i index, __m256i mask, const int scale) 1584 { 1585 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src, 1586 base, 1587 (__v8si)index, 1588 (__v8si)mask, 1589 scale); 1590 } 1591 1592 extern __inline __m128i 1593 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1594 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale) 1595 { 1596 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1597 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1598 1599 return (__m128i) __builtin_ia32_gatherdiv4si (src, 1600 base, 1601 (__v2di)index, 1602 mask, 1603 scale); 1604 } 1605 1606 extern __inline __m128i 1607 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1608 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index, 1609 __m128i mask, const int scale) 1610 { 1611 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src, 1612 base, 1613 (__v2di)index, 1614 (__v4si)mask, 1615 scale); 1616 } 1617 1618 extern __inline __m128i 1619 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1620 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale) 1621 { 1622 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1623 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1624 1625 return (__m128i) __builtin_ia32_gatherdiv4si256 (src, 1626 base, 1627 
(__v4di)index, 1628 mask, 1629 scale); 1630 } 1631 1632 extern __inline __m128i 1633 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1634 _mm256_mask_i64gather_epi32 (__m128i src, int const *base, 1635 __m256i index, __m128i mask, const int scale) 1636 { 1637 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src, 1638 base, 1639 (__v4di)index, 1640 (__v4si)mask, 1641 scale); 1642 } 1643 #else /* __OPTIMIZE__ */ 1644 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \ 1645 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \ 1646 (double const *)BASE, \ 1647 (__v4si)(__m128i)INDEX, \ 1648 (__v2df)_mm_set1_pd( \ 1649 (double)(long long int) -1), \ 1650 (int)SCALE) 1651 1652 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1653 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \ 1654 (double const *)BASE, \ 1655 (__v4si)(__m128i)INDEX, \ 1656 (__v2df)(__m128d)MASK, \ 1657 (int)SCALE) 1658 1659 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ 1660 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \ 1661 (double const *)BASE, \ 1662 (__v4si)(__m128i)INDEX, \ 1663 (__v4df)_mm256_set1_pd( \ 1664 (double)(long long int) -1), \ 1665 (int)SCALE) 1666 1667 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1668 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \ 1669 (double const *)BASE, \ 1670 (__v4si)(__m128i)INDEX, \ 1671 (__v4df)(__m256d)MASK, \ 1672 (int)SCALE) 1673 1674 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \ 1675 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ 1676 (double const *)BASE, \ 1677 (__v2di)(__m128i)INDEX, \ 1678 (__v2df)_mm_set1_pd( \ 1679 (double)(long long int) -1), \ 1680 (int)SCALE) 1681 1682 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1683 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \ 1684 (double const *)BASE, \ 1685 (__v2di)(__m128i)INDEX, \ 1686 (__v2df)(__m128d)MASK, \ 1687 
(int)SCALE) 1688 1689 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ 1690 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ 1691 (double const *)BASE, \ 1692 (__v4di)(__m256i)INDEX, \ 1693 (__v4df)_mm256_set1_pd( \ 1694 (double)(long long int) -1), \ 1695 (int)SCALE) 1696 1697 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1698 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \ 1699 (double const *)BASE, \ 1700 (__v4di)(__m256i)INDEX, \ 1701 (__v4df)(__m256d)MASK, \ 1702 (int)SCALE) 1703 1704 #define _mm_i32gather_ps(BASE, INDEX, SCALE) \ 1705 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ 1706 (float const *)BASE, \ 1707 (__v4si)(__m128i)INDEX, \ 1708 _mm_set1_ps ((float)(int) -1), \ 1709 (int)SCALE) 1710 1711 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1712 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \ 1713 (float const *)BASE, \ 1714 (__v4si)(__m128i)INDEX, \ 1715 (__v4sf)(__m128d)MASK, \ 1716 (int)SCALE) 1717 1718 #define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ 1719 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ 1720 (float const *)BASE, \ 1721 (__v8si)(__m256i)INDEX, \ 1722 (__v8sf)_mm256_set1_ps ( \ 1723 (float)(int) -1), \ 1724 (int)SCALE) 1725 1726 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1727 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \ 1728 (float const *)BASE, \ 1729 (__v8si)(__m256i)INDEX, \ 1730 (__v8sf)(__m256d)MASK, \ 1731 (int)SCALE) 1732 1733 #define _mm_i64gather_ps(BASE, INDEX, SCALE) \ 1734 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ 1735 (float const *)BASE, \ 1736 (__v2di)(__m128i)INDEX, \ 1737 (__v4sf)_mm_set1_ps ( \ 1738 (float)(int) -1), \ 1739 (int)SCALE) 1740 1741 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1742 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \ 1743 (float const *)BASE, \ 1744 
(__v2di)(__m128i)INDEX, \ 1745 (__v4sf)(__m128d)MASK, \ 1746 (int)SCALE) 1747 1748 #define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ 1749 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ 1750 (float const *)BASE, \ 1751 (__v4di)(__m256i)INDEX, \ 1752 (__v4sf)_mm_set1_ps( \ 1753 (float)(int) -1), \ 1754 (int)SCALE) 1755 1756 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1757 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \ 1758 (float const *)BASE, \ 1759 (__v4di)(__m256i)INDEX, \ 1760 (__v4sf)(__m128)MASK, \ 1761 (int)SCALE) 1762 1763 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ 1764 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ 1765 (long long const *)BASE, \ 1766 (__v4si)(__m128i)INDEX, \ 1767 (__v2di)_mm_set1_epi64x (-1), \ 1768 (int)SCALE) 1769 1770 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1771 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \ 1772 (long long const *)BASE, \ 1773 (__v4si)(__m128i)INDEX, \ 1774 (__v2di)(__m128i)MASK, \ 1775 (int)SCALE) 1776 1777 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ 1778 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ 1779 (long long const *)BASE, \ 1780 (__v4si)(__m128i)INDEX, \ 1781 (__v4di)_mm256_set1_epi64x (-1), \ 1782 (int)SCALE) 1783 1784 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1785 (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \ 1786 (long long const *)BASE, \ 1787 (__v4si)(__m128i)INDEX, \ 1788 (__v4di)(__m256i)MASK, \ 1789 (int)SCALE) 1790 1791 #define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ 1792 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \ 1793 (long long const *)BASE, \ 1794 (__v2di)(__m128i)INDEX, \ 1795 (__v2di)_mm_set1_epi64x (-1), \ 1796 (int)SCALE) 1797 1798 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1799 (__m128i) __builtin_ia32_gatherdiv2di 
((__v2di)(__m128i)SRC, \ 1800 (long long const *)BASE, \ 1801 (__v2di)(__m128i)INDEX, \ 1802 (__v2di)(__m128i)MASK, \ 1803 (int)SCALE) 1804 1805 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ 1806 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \ 1807 (long long const *)BASE, \ 1808 (__v4di)(__m256i)INDEX, \ 1809 (__v4di)_mm256_set1_epi64x (-1), \ 1810 (int)SCALE) 1811 1812 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1813 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \ 1814 (long long const *)BASE, \ 1815 (__v4di)(__m256i)INDEX, \ 1816 (__v4di)(__m256i)MASK, \ 1817 (int)SCALE) 1818 1819 #define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ 1820 (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \ 1821 (int const *)BASE, \ 1822 (__v4si)(__m128i)INDEX, \ 1823 (__v4si)_mm_set1_epi32 (-1), \ 1824 (int)SCALE) 1825 1826 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1827 (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \ 1828 (int const *)BASE, \ 1829 (__v4si)(__m128i)INDEX, \ 1830 (__v4si)(__m128i)MASK, \ 1831 (int)SCALE) 1832 1833 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ 1834 (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \ 1835 (int const *)BASE, \ 1836 (__v8si)(__m256i)INDEX, \ 1837 (__v8si)_mm256_set1_epi32 (-1), \ 1838 (int)SCALE) 1839 1840 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1841 (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \ 1842 (int const *)BASE, \ 1843 (__v8si)(__m256i)INDEX, \ 1844 (__v8si)(__m256i)MASK, \ 1845 (int)SCALE) 1846 1847 #define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ 1848 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \ 1849 (int const *)BASE, \ 1850 (__v2di)(__m128i)INDEX, \ 1851 (__v4si)_mm_set1_epi32 (-1), \ 1852 (int)SCALE) 1853 1854 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1855 (__m128i) 
__builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \ 1856 (int const *)BASE, \ 1857 (__v2di)(__m128i)INDEX, \ 1858 (__v4si)(__m128i)MASK, \ 1859 (int)SCALE) 1860 1861 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ 1862 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \ 1863 (int const *)BASE, \ 1864 (__v4di)(__m256i)INDEX, \ 1865 (__v4si)_mm_set1_epi32(-1), \ 1866 (int)SCALE) 1867 1868 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1869 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \ 1870 (int const *)BASE, \ 1871 (__v4di)(__m256i)INDEX, \ 1872 (__v4si)(__m128i)MASK, \ 1873 (int)SCALE) 1874 #endif /* __OPTIMIZE__ */ 1875