;;
;; Copyright (c) 2019-2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "include/reg_sizes.asm"

section .data
default rel

align 16
idx_tab8:
        db      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        db      0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF

align 16
add_16:
        db      0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
        db      0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10

align 16
idx_tab16:
        dw      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7

align 16
add_8:
        dw      0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8

align 16
idx_tab32:
        dd      0x0, 0x1, 0x2, 0x3

align 16
add_4:
        dd      0x4, 0x4, 0x4, 0x4

align 16
idx_tab64:
        dq      0x0, 0x1

align 16
add_2:
        dq      0x2, 0x2

align 16
bcast_mask:
        db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
        db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01

align 64
idx_rows_avx:
        dd      0x00000000, 0x00000000, 0x00000000, 0x00000000
        dd      0x10101010, 0x10101010, 0x10101010, 0x10101010
        dd      0x20202020, 0x20202020, 0x20202020, 0x20202020
        dd      0x30303030, 0x30303030, 0x30303030, 0x30303030
        dd      0x40404040, 0x40404040, 0x40404040, 0x40404040
        dd      0x50505050, 0x50505050, 0x50505050, 0x50505050
        dd      0x60606060, 0x60606060, 0x60606060, 0x60606060
        dd      0x70707070, 0x70707070, 0x70707070, 0x70707070
        dd      0x80808080, 0x80808080, 0x80808080, 0x80808080
        dd      0x90909090, 0x90909090, 0x90909090, 0x90909090
        dd      0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0
        dd      0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0
        dd      0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0
        dd      0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
        dd      0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0
        dd      0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0

align 64
idx_rows_avx2:
        dd      0x00000000, 0x00000000, 0x00000000, 0x00000000
        dd      0x00000000, 0x00000000, 0x00000000, 0x00000000
        dd      0x10101010, 0x10101010, 0x10101010, 0x10101010
        dd      0x10101010, 0x10101010, 0x10101010, 0x10101010
        dd      0x20202020, 0x20202020, 0x20202020, 0x20202020
        dd      0x20202020, 0x20202020, 0x20202020, 0x20202020
        dd      0x30303030, 0x30303030, 0x30303030, 0x30303030
        dd      0x30303030, 0x30303030, 0x30303030, 0x30303030
        dd      0x40404040, 0x40404040, 0x40404040, 0x40404040
        dd      0x40404040, 0x40404040, 0x40404040, 0x40404040
        dd      0x50505050, 0x50505050, 0x50505050, 0x50505050
        dd      0x50505050, 0x50505050, 0x50505050, 0x50505050
        dd      0x60606060, 0x60606060, 0x60606060, 0x60606060
        dd      0x60606060, 0x60606060, 0x60606060, 0x60606060
        dd      0x70707070, 0x70707070, 0x70707070, 0x70707070
        dd      0x70707070, 0x70707070, 0x70707070, 0x70707070
        dd      0x80808080, 0x80808080, 0x80808080, 0x80808080
        dd      0x80808080, 0x80808080, 0x80808080, 0x80808080
        dd      0x90909090, 0x90909090, 0x90909090, 0x90909090
        dd      0x90909090, 0x90909090, 0x90909090, 0x90909090
        dd      0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0
        dd      0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0
        dd      0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0
        dd      0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0
        dd      0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0
        dd      0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0
        dd      0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
        dd      0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
        dd      0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0
        dd      0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0
        dd      0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0
        dd      0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0
        ;; extra
        dd      0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
        dd      0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f

section .text

%ifdef LINUX
        %define arg1    rdi
        %define arg2    rsi
        %define arg3    rdx
%else
        %define arg1    rcx
        %define arg2    rdx
        %define arg3    r8
%endif

%define bcast_idx xmm0
%define xadd      xmm1
%define accum_val xmm2
%define xindices  xmm3
%define xtmp      xmm4
%define xtmp2     xmm5
%define tmp       r9
%define offset    r10

%define table   arg1
%define idx     arg2
%define size    arg3

; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up (multiple of 16 bytes)
align 32
MKGLOBAL(lookup_8bit_sse,function,internal)
lookup_8bit_sse:

        ;; Number of loop iters = matrix size / 16 (number of values in XMM)
        shr     size, 4
        je      exit8_sse

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        pxor    xtmp, xtmp
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp

        movdqa  xadd,     [rel add_16]
        movdqa  xindices, [rel idx_tab8]

loop8_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        pcmpeqb xtmp, bcast_idx

        ;; Load next 16 values
        movdqa  xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 16 indices
        paddb   xindices, xadd

        add     offset, 16
        dec     size

        jne     loop8_sse

        ;; Extract value from XMM register
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8         ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4         ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2         ; shift left by 16 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 1         ; shift left by 8 bits
        por     accum_val, xtmp

        pextrb  rax, accum_val, 15

exit8_sse:
        ret

; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up (multiple of 16 bytes)
align 32
MKGLOBAL(lookup_8bit_avx,function,internal)
lookup_8bit_avx:
        ;; Number of loop iters = matrix size / 16 (number of values in XMM)
        shr     size, 4
        je      exit8_avx

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vpxor   xtmp, xtmp
        vpxor   accum_val, accum_val
        vpshufb bcast_idx, xtmp

        vmovdqa xadd,     [rel add_16]
        vmovdqa xindices, [rel idx_tab8]

loop8_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        vpcmpeqb xtmp, xindices, bcast_idx

        ;; Load next 16 values
        vmovdqa xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 16 indices
        vpaddb  xindices, xadd

        add     offset, 16
        dec     size

        jne     loop8_avx

        ;; Extract value from XMM register
        vpslldq xtmp, accum_val, 8      ; shift left by 64 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 4      ; shift left by 32 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 2      ; shift left by 16 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 1      ; shift left by 8 bits
        vpor    accum_val, xtmp

        vpextrb rax, accum_val, 15

exit8_avx:

        ret

; uint16_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up
align 32
MKGLOBAL(lookup_16bit_sse,function,internal)
lookup_16bit_sse:

        ;; Number of loop iters = matrix size / 8 (number of values in XMM)
        shr     size, 3
        je      exit16_sse

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        movdqa  xtmp, [rel bcast_mask]
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp

        movdqa  xadd,     [rel add_8]
        movdqa  xindices, [rel idx_tab16]

loop16_sse:

        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        pcmpeqw xtmp, bcast_idx

        ;; Load next 8 values
        movdqa  xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 8 indices
        paddw   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop16_sse

        ;; Extract value from XMM register
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8         ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4         ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2         ; shift left by 16 bits
        por     accum_val, xtmp

        pextrw  rax, accum_val, 7

exit16_sse:
        ret

; uint16_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up
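; Note (derived from the loop below): size is the number of 16-bit entries
; and must be a multiple of 8, and the table must be 16-byte aligned, since
; each iteration reads 8 entries with an aligned vmovdqa load.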
align 32
MKGLOBAL(lookup_16bit_avx,function,internal)
lookup_16bit_avx:

        ;; Number of loop iters = matrix size / 8 (number of values in XMM)
        shr     size, 3
        je      exit16_avx

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vmovdqa xtmp, [rel bcast_mask]
        vpxor   accum_val, accum_val
        vpshufb bcast_idx, xtmp

        vmovdqa xadd,     [rel add_8]
        vmovdqa xindices, [rel idx_tab16]

loop16_avx:

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        vpcmpeqw xtmp, xindices, bcast_idx

        ;; Load next 8 values
        vmovdqa xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 8 indices
        vpaddw  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop16_avx

        ;; Extract value from XMM register
        vpslldq xtmp, accum_val, 8      ; shift left by 64 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 4      ; shift left by 32 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 2      ; shift left by 16 bits
        vpor    accum_val, xtmp

        vpextrw rax, accum_val, 7

exit16_avx:
        ret

; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up
align 32
MKGLOBAL(lookup_32bit_sse,function,internal)
lookup_32bit_sse:

        ;; Number of loop iters = matrix size / 4 (number of values in XMM)
        shr     size, 2
        je      exit32_sse

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        pxor    accum_val, accum_val
        pshufd  bcast_idx, bcast_idx, 0

        movdqa  xadd,     [rel add_4]
        movdqa  xindices, [rel idx_tab32]

loop32_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        pcmpeqd xtmp, bcast_idx

        ;; Load next 4 values
        movdqa  xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 4 indices
        paddd   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop32_sse

        ;; Extract value from XMM register
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8         ; shift right by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        psrldq  xtmp, 4         ; shift right by 32 bits
        por     accum_val, xtmp

        movd    eax, accum_val

exit32_sse:
        ret


; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up
align 32
MKGLOBAL(lookup_32bit_avx,function,internal)
lookup_32bit_avx:
        ;; Number of loop iters = matrix size / 4 (number of values in XMM)
        shr     size, 2
        je      exit32_avx

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vpxor   accum_val, accum_val
        vpshufd bcast_idx, bcast_idx, 0

        vmovdqa xadd,     [rel add_4]
        vmovdqa xindices, [rel idx_tab32]

loop32_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        vpcmpeqd xtmp, xindices, bcast_idx

        ;; Load next 4 values
        vmovdqa xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 4 indices
        vpaddd  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop32_avx

        ;; Extract value from XMM register
        vpsrldq xtmp, accum_val, 8      ; shift right by 64 bits
        vpor    accum_val, xtmp

        vpsrldq xtmp, accum_val, 4      ; shift right by 32 bits
        vpor    accum_val, xtmp

        vmovd   eax, accum_val

exit32_avx:
        ret


; uint64_t lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up
align 32
MKGLOBAL(lookup_64bit_sse,function,internal)
lookup_64bit_sse:
        ;; Number of loop iters = matrix size / 2 (number of values in XMM)
        shr     size, 1
        je      exit64_sse

        xor     offset, offset

        ;; Broadcast idx to look up
        movq    bcast_idx, idx
        pxor    accum_val, accum_val
        pinsrq  bcast_idx, idx, 1

        movdqa  xadd,     [rel add_2]
        movdqa  xindices, [rel idx_tab64]

loop64_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        pcmpeqq xtmp, bcast_idx

        ;; Load next 2 values
        movdqa  xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 2 indices
        paddq   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop64_sse

        ;; Extract value from XMM register
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8         ; shift right by 64 bits
        por     accum_val, xtmp

        movq    rax, accum_val

exit64_sse:
        ret


; uint64_t lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up
; arg 2 : index to look up
; arg 3 : size of table to look up
align 32
MKGLOBAL(lookup_64bit_avx,function,internal)
lookup_64bit_avx:
        ;; Number of loop iters = matrix size / 2 (number of values in XMM)
        shr     size, 1
        je      exit64_avx

        xor     offset, offset

        vmovq   bcast_idx, idx
        vpxor   accum_val, accum_val
        vpinsrq bcast_idx, idx, 1

        vmovdqa xadd,     [rel add_2]
        vmovdqa xindices, [rel idx_tab64]

loop64_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s there)
        vpcmpeqq xtmp, xindices, bcast_idx

        ;; Load next 2 values
        vmovdqa xtmp2, [table + offset]

        ;; This zeroes all values except the one at the matching index
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 2 indices
        vpaddq  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop64_avx

        ;; Extract value from XMM register
        vpsrldq xtmp, accum_val, 8      ; shift right by 64 bits
        vpor    accum_val, xtmp

        vmovq   rax, accum_val

exit64_avx:
        ret

; __m128i lookup_16x8bit_sse(const __m128i indexes, const void *table)
; arg 1 : vector with 16 8-bit indexes to be looked up
; arg 2 : pointer to a 256 element table
align 32
MKGLOBAL(lookup_16x8bit_sse,function,internal)
lookup_16x8bit_sse:
%define arg_indexes xmm0
%define arg_return  xmm0
%define arg_table   arg1

%ifndef LINUX
%undef arg_table
%define arg_table arg2

        ; Read indexes from memory, as __m128i parameters are stored on the
        ; stack (aligned to 16 bytes) and their address is passed through a
        ; GP register on Windows
        movdqa  arg_indexes, [arg1]
        mov     rax, rsp
        sub     rsp, (10 * 16)
        and     rsp, ~15
        ;; xmm6:xmm15 need to be maintained for Windows
        movdqa  [rsp + 0*16], xmm6
        movdqa  [rsp + 1*16], xmm7
        movdqa  [rsp + 2*16], xmm8
        movdqa  [rsp + 3*16], xmm9
        movdqa  [rsp + 4*16], xmm10
        movdqa  [rsp + 5*16], xmm11
        movdqa  [rsp + 6*16], xmm12
        movdqa  [rsp + 7*16], xmm13
        movdqa  [rsp + 8*16], xmm14
        movdqa  [rsp + 9*16], xmm15
%endif
        ;; Split each index into its high and low nibble: pshufb on a 16-byte
        ;; table row selects bytes by the low nibble, and a pcmpeqb mask on
        ;; the high nibble keeps only bytes whose index falls in that row;
        ;; OR-ing all 16 rows yields the final lookup result.
        movdqa  xmm15, [rel idx_rows_avx + (15 * 16)]
        movdqa  xmm14, xmm15
        psrlq   xmm14, 4
        movdqa  xmm1, arg_indexes
        movdqa  xmm2, arg_indexes
        pand    xmm1, xmm15     ;; top nibble part of the index
        pand    xmm2, xmm14     ;; low nibble part of the index

        movdqa  xmm9,  xmm1
        movdqa  xmm10, xmm1
        movdqa  xmm11, xmm1
        movdqa  xmm12, xmm1
        movdqa  xmm13, xmm1
        movdqa  xmm14, xmm1
        pcmpeqb xmm9,  [rel idx_rows_avx + (0 * 16)]
        movdqa  xmm3,  [arg_table + (0 * 16)]
        pcmpeqb xmm10, [rel idx_rows_avx + (1 * 16)]
        movdqa  xmm4,  [arg_table + (1 * 16)]
        pcmpeqb xmm11, [rel idx_rows_avx + (2 * 16)]
        movdqa  xmm5,  [arg_table + (2 * 16)]
        pcmpeqb xmm12, [rel idx_rows_avx + (3 * 16)]
        movdqa  xmm6,  [arg_table + (3 * 16)]
        pcmpeqb xmm13, [rel idx_rows_avx + (4 * 16)]
        movdqa  xmm7,  [arg_table + (4 * 16)]
        pcmpeqb xmm14, [rel idx_rows_avx + (5 * 16)]
        movdqa  xmm8,  [arg_table + (5 * 16)]

        pshufb  xmm3, xmm2
        pshufb  xmm4, xmm2
        pshufb  xmm5, xmm2
        pshufb  xmm6, xmm2
        pshufb  xmm7, xmm2
        pshufb  xmm8, xmm2

        pand    xmm9,  xmm3
        pand    xmm10, xmm4
        pand    xmm11, xmm5
        pand    xmm12, xmm6
        pand    xmm13, xmm7
        pand    xmm14, xmm8

        por     xmm9,  xmm10
        por     xmm11, xmm12
        por     xmm14, xmm13
        movdqa  arg_return, xmm9
        por     arg_return, xmm11

        ;; arg_return & xmm14 carry the current OR result from now on.
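        ;; The same row-batch pattern repeats below for table rows 6-10 and
        ;; 11-15: compare high nibbles against each row id, pshufb the row
        ;; by the low nibbles, mask, and OR into the running result.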

        movdqa  xmm9,  xmm1
        movdqa  xmm10, xmm1
        movdqa  xmm11, xmm1
        movdqa  xmm12, xmm1
        movdqa  xmm13, xmm1

        pcmpeqb xmm9,  [rel idx_rows_avx + (6 * 16)]
        movdqa  xmm3,  [arg_table + (6 * 16)]
        pcmpeqb xmm10, [rel idx_rows_avx + (7 * 16)]
        movdqa  xmm4,  [arg_table + (7 * 16)]
        pcmpeqb xmm11, [rel idx_rows_avx + (8 * 16)]
        movdqa  xmm5,  [arg_table + (8 * 16)]
        pcmpeqb xmm12, [rel idx_rows_avx + (9 * 16)]
        movdqa  xmm6,  [arg_table + (9 * 16)]
        pcmpeqb xmm13, [rel idx_rows_avx + (10 * 16)]
        movdqa  xmm7,  [arg_table + (10 * 16)]

        pshufb  xmm3, xmm2
        pshufb  xmm4, xmm2
        pshufb  xmm5, xmm2
        pshufb  xmm6, xmm2
        pshufb  xmm7, xmm2

        pand    xmm9,  xmm3
        pand    xmm10, xmm4
        pand    xmm11, xmm5
        pand    xmm12, xmm6
        pand    xmm13, xmm7

        por     xmm9,  xmm10
        por     xmm11, xmm12
        por     xmm14, xmm13
        por     arg_return, xmm9
        por     xmm14, xmm11

        ;; arg_return & xmm14 carry current OR result

        movdqa  xmm9,  xmm1
        movdqa  xmm10, xmm1
        movdqa  xmm11, xmm1
        movdqa  xmm12, xmm1
        movdqa  xmm13, xmm1

        pcmpeqb xmm9,  [rel idx_rows_avx + (11 * 16)]
        movdqa  xmm3,  [arg_table + (11 * 16)]
        pcmpeqb xmm10, [rel idx_rows_avx + (12 * 16)]
        movdqa  xmm4,  [arg_table + (12 * 16)]
        pcmpeqb xmm11, [rel idx_rows_avx + (13 * 16)]
        movdqa  xmm5,  [arg_table + (13 * 16)]
        pcmpeqb xmm12, [rel idx_rows_avx + (14 * 16)]
        movdqa  xmm6,  [arg_table + (14 * 16)]
        pcmpeqb xmm13, [rel idx_rows_avx + (15 * 16)]
        movdqa  xmm7,  [arg_table + (15 * 16)]

        pshufb  xmm3, xmm2
        pshufb  xmm4, xmm2
        pshufb  xmm5, xmm2
        pshufb  xmm6, xmm2
        pshufb  xmm7, xmm2

        pand    xmm9,  xmm3
        pand    xmm10, xmm4
        pand    xmm11, xmm5
        pand    xmm12, xmm6
        pand    xmm13, xmm7

        por     xmm9,  xmm10
        por     xmm11, xmm12
        por     xmm14, xmm13
        por     arg_return, xmm9
        por     xmm14, xmm11
        por     arg_return, xmm14

%ifndef LINUX
        movdqa  xmm15, [rsp + 9*16]
        movdqa  xmm14, [rsp + 8*16]
        movdqa  xmm13, [rsp + 7*16]
        movdqa  xmm12, [rsp + 6*16]
        movdqa  xmm11, [rsp + 5*16]
        movdqa  xmm10, [rsp + 4*16]
        movdqa  xmm9,  [rsp + 3*16]
        movdqa  xmm8,  [rsp + 2*16]
        movdqa  xmm7,  [rsp + 1*16]
        movdqa  xmm6,  [rsp + 0*16]
%ifdef SAFE_DATA
        ;; Clear the stack copies of the callee-saved registers
        pxor    xmm5, xmm5
        movdqa  [rsp + 0*16], xmm5
        movdqa  [rsp + 1*16], xmm5
        movdqa  [rsp + 2*16], xmm5
        movdqa  [rsp + 3*16], xmm5
        movdqa  [rsp + 4*16], xmm5
        movdqa  [rsp + 5*16], xmm5
        movdqa  [rsp + 6*16], xmm5
        movdqa  [rsp + 7*16], xmm5
        movdqa  [rsp + 8*16], xmm5
        movdqa  [rsp + 9*16], xmm5
%endif ; SAFE_DATA
        mov     rsp, rax
%endif ; !LINUX
        ret
%undef arg_indexes
%undef arg_return
%undef arg_table

; __m128i lookup_16x8bit_avx(const __m128i indexes, const void *table)
; arg 1 : vector with 16 8-bit indexes to be looked up
; arg 2 : pointer to a 256 element table
align 32
MKGLOBAL(lookup_16x8bit_avx,function,internal)
lookup_16x8bit_avx:
%define arg_indexes xmm0
%define arg_return  xmm0
%define arg_table   arg1

%ifndef LINUX
%undef arg_table
%define arg_table arg2

        ; Read indexes from memory, as __m128i parameters are stored on the
        ; stack (aligned to 16 bytes) and their address is passed through a
        ; GP register on Windows
        vmovdqa arg_indexes, [arg1]
        mov     rax, rsp
        sub     rsp, (10 * 16)
        and     rsp, ~15
        ;; xmm6:xmm15 need to be maintained for Windows
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        vmovdqa [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
%endif ; !LINUX

        vmovdqa xmm15, [rel idx_rows_avx + (15 * 16)]
        vpsrlq  xmm2, xmm15, 4

        vpand   xmm1, xmm15, arg_indexes        ;; top nibble part of the index
        vpand   xmm2, xmm2, arg_indexes         ;; low nibble part of the index

        vpcmpeqb xmm9,  xmm1, [rel idx_rows_avx + (0 * 16)]
        vmovdqa  xmm3, [arg_table + (0 * 16)]
        vpcmpeqb xmm10, xmm1, [rel idx_rows_avx + (1 * 16)]
        vmovdqa  xmm4, [arg_table + (1 * 16)]
        vpcmpeqb xmm11, xmm1, [rel idx_rows_avx + (2 * 16)]
        vmovdqa  xmm5, [arg_table + (2 * 16)]
        vpcmpeqb xmm12, xmm1, [rel idx_rows_avx + (3 * 16)]
        vmovdqa  xmm6, [arg_table + (3 * 16)]
        vpcmpeqb xmm13, xmm1, [rel idx_rows_avx + (4 * 16)]
        vmovdqa  xmm7, [arg_table + (4 * 16)]
        vpcmpeqb xmm14, xmm1, [rel idx_rows_avx + (5 * 16)]
        vmovdqa  xmm8, [arg_table + (5 * 16)]

        vpshufb xmm3, xmm3, xmm2
        vpshufb xmm4, xmm4, xmm2
        vpshufb xmm5, xmm5, xmm2
        vpshufb xmm6, xmm6, xmm2
        vpshufb xmm7, xmm7, xmm2
        vpshufb xmm8, xmm8, xmm2

        vpand   xmm9,  xmm9,  xmm3
        vpand   xmm10, xmm10, xmm4
        vpand   xmm11, xmm11, xmm5
        vpand   xmm12, xmm12, xmm6
        vpand   xmm13, xmm13, xmm7
        vpand   xmm14, xmm14, xmm8

        vpor    xmm9,  xmm9,  xmm10
        vpor    xmm11, xmm11, xmm12
        vpor    xmm14, xmm13, xmm14
        vpor    arg_return, xmm9, xmm11

        ;; xmm8 and xmm14 are used for final OR result from now on.
        ;; arg_return & xmm14 carry current OR result.

        vpcmpeqb xmm9,  xmm1, [rel idx_rows_avx + (6 * 16)]
        vmovdqa  xmm3, [arg_table + (6 * 16)]
        vpcmpeqb xmm10, xmm1, [rel idx_rows_avx + (7 * 16)]
        vmovdqa  xmm4, [arg_table + (7 * 16)]
        vpcmpeqb xmm11, xmm1, [rel idx_rows_avx + (8 * 16)]
        vmovdqa  xmm5, [arg_table + (8 * 16)]
        vpcmpeqb xmm12, xmm1, [rel idx_rows_avx + (9 * 16)]
        vmovdqa  xmm6, [arg_table + (9 * 16)]
        vpcmpeqb xmm13, xmm1, [rel idx_rows_avx + (10 * 16)]
        vmovdqa  xmm7, [arg_table + (10 * 16)]

        vpshufb xmm3, xmm3, xmm2
        vpshufb xmm4, xmm4, xmm2
        vpshufb xmm5, xmm5, xmm2
        vpshufb xmm6, xmm6, xmm2
        vpshufb xmm7, xmm7, xmm2

        vpand   xmm9,  xmm9,  xmm3
        vpand   xmm10, xmm10, xmm4
        vpand   xmm11, xmm11, xmm5
        vpand   xmm12, xmm12, xmm6
        vpand   xmm13, xmm13, xmm7

        vpor    xmm9,  xmm9,  xmm10
        vpor    xmm11, xmm11, xmm12
        vpor    xmm15, xmm9,  xmm11
        vpor    xmm8,  xmm14, xmm13

        ;; arg_return, xmm15 & xmm8 carry current OR result

        vpcmpeqb xmm9,  xmm1, [rel idx_rows_avx + (11 * 16)]
        vmovdqa  xmm3, [arg_table + (11 * 16)]
        vpcmpeqb xmm10, xmm1, [rel idx_rows_avx + (12 * 16)]
        vmovdqa  xmm4, [arg_table + (12 * 16)]
        vpcmpeqb xmm11, xmm1, [rel idx_rows_avx + (13 * 16)]
        vmovdqa  xmm5, [arg_table + (13 * 16)]
        vpcmpeqb xmm12, xmm1, [rel idx_rows_avx + (14 * 16)]
        vmovdqa  xmm6, [arg_table + (14 * 16)]
        vpcmpeqb xmm13, xmm1, [rel idx_rows_avx + (15 * 16)]
        vmovdqa  xmm7, [arg_table + (15 * 16)]

        vpshufb xmm3, xmm3, xmm2
        vpshufb xmm4, xmm4, xmm2
        vpshufb xmm5, xmm5, xmm2
        vpshufb xmm6, xmm6, xmm2
        vpshufb xmm7, xmm7, xmm2

        vpand   xmm9,  xmm9,  xmm3
        vpand   xmm10, xmm10, xmm4
        vpand   xmm11, xmm11, xmm5
        vpand   xmm12, xmm12, xmm6
        vpand   xmm13, xmm13, xmm7

        vpor    xmm14, xmm15, xmm8
        vpor    xmm9,  xmm9,  xmm10
        vpor    xmm11, xmm11, xmm12
        vpor    xmm13, xmm13, xmm14
        vpor    xmm15, xmm9,  xmm11
        vpor    arg_return, arg_return, xmm13
        vpor    arg_return, arg_return, xmm15
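        ;; all 16 looked-up bytes are now in arg_return (xmm0)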

%ifndef LINUX
        vmovdqa xmm15, [rsp + 9*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm9,  [rsp + 3*16]
        vmovdqa xmm8,  [rsp + 2*16]
        vmovdqa xmm7,  [rsp + 1*16]
        vmovdqa xmm6,  [rsp + 0*16]
%ifdef SAFE_DATA
        vpxor   xmm5, xmm5, xmm5
        vmovdqa [rsp + 0*16], xmm5
        vmovdqa [rsp + 1*16], xmm5
        vmovdqa [rsp + 2*16], xmm5
        vmovdqa [rsp + 3*16], xmm5
        vmovdqa [rsp + 4*16], xmm5
        vmovdqa [rsp + 5*16], xmm5
        vmovdqa [rsp + 6*16], xmm5
        vmovdqa [rsp + 7*16], xmm5
        vmovdqa [rsp + 8*16], xmm5
        vmovdqa [rsp + 9*16], xmm5
%endif
        mov     rsp, rax
%endif ; !LINUX
        ret
%undef arg_indexes
%undef arg_return
%undef arg_table

; __m256i lookup_32x8bit_avx2(const __m256i indexes, const void *table)
; arg 1 : vector with 32 8-bit indexes to be looked up
; arg 2 : pointer to a 256 element table
align 32
MKGLOBAL(lookup_32x8bit_avx2,function,internal)
lookup_32x8bit_avx2:
%define arg_indexes ymm0
%define arg_return  ymm0
%define arg_table   arg1

%ifndef LINUX
%undef arg_table
%define arg_table arg2

        ; Read indexes from memory, as __m256i parameters are stored on the
        ; stack and their address is passed through a GP register on Windows
        ; (mirrors the __m128i variants above; unaligned load as 32-byte
        ; alignment of the stack copy is not assumed)
        vmovdqu arg_indexes, [arg1]
        mov     rax, rsp
        sub     rsp, (10 * 16)
        and     rsp, ~31
        ;; xmm6:xmm15 need to be maintained for Windows
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        vmovdqa [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
%endif ; !LINUX

        vmovdqa ymm15, [rel idx_rows_avx2 + (15 * 32)]
        vpsrlq  ymm2, ymm15, 4

        vpand   ymm1, ymm15, arg_indexes        ;; top nibble part of the index
        vpand   ymm2, ymm2, arg_indexes         ;; low nibble part of the index

        vpcmpeqb ymm9,  ymm1, [rel idx_rows_avx2 + (0 * 32)]
        vbroadcastf128 ymm3, [arg_table + (0 * 16)]
        vpcmpeqb ymm10, ymm1, [rel idx_rows_avx2 + (1 * 32)]
        vbroadcastf128 ymm4, [arg_table + (1 * 16)]
        vpcmpeqb ymm11, ymm1, [rel idx_rows_avx2 + (2 * 32)]
        vbroadcastf128 ymm5, [arg_table + (2 * 16)]
        vpcmpeqb ymm12, ymm1, [rel idx_rows_avx2 + (3 * 32)]
        vbroadcastf128 ymm6, [arg_table + (3 * 16)]
        vpcmpeqb ymm13, ymm1, [rel idx_rows_avx2 + (4 * 32)]
        vbroadcastf128 ymm7, [arg_table + (4 * 16)]
        vpcmpeqb ymm14, ymm1, [rel idx_rows_avx2 + (5 * 32)]
        vbroadcastf128 ymm8, [arg_table + (5 * 16)]

        vpshufb ymm3, ymm3, ymm2
        vpshufb ymm4, ymm4, ymm2
        vpshufb ymm5, ymm5, ymm2
        vpshufb ymm6, ymm6, ymm2
        vpshufb ymm7, ymm7, ymm2
        vpshufb ymm8, ymm8, ymm2

        vpand   ymm9,  ymm9,  ymm3
        vpand   ymm10, ymm10, ymm4
        vpand   ymm11, ymm11, ymm5
        vpand   ymm12, ymm12, ymm6
        vpand   ymm13, ymm13, ymm7
        vpand   ymm14, ymm14, ymm8

        vpor    ymm9,  ymm9,  ymm10
        vpor    ymm11, ymm11, ymm12
        vpor    ymm14, ymm13, ymm14
        vpor    arg_return, ymm9, ymm11

        ;; ymm8 and ymm14 are used for final OR result from now on.
        ;; arg_return & ymm14 carry current OR result.
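        ;; Note: vpshufb on YMM registers shuffles within each 128-bit lane,
        ;; so vbroadcastf128 duplicates every 16-byte table row into both
        ;; lanes (and idx_rows_avx2 duplicates each row mask) to look up all
        ;; 32 indexes at once. The same row-batch pattern continues below.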

        vpcmpeqb ymm9,  ymm1, [rel idx_rows_avx2 + (6 * 32)]
        vbroadcastf128 ymm3, [arg_table + (6 * 16)]
        vpcmpeqb ymm10, ymm1, [rel idx_rows_avx2 + (7 * 32)]
        vbroadcastf128 ymm4, [arg_table + (7 * 16)]
        vpcmpeqb ymm11, ymm1, [rel idx_rows_avx2 + (8 * 32)]
        vbroadcastf128 ymm5, [arg_table + (8 * 16)]
        vpcmpeqb ymm12, ymm1, [rel idx_rows_avx2 + (9 * 32)]
        vbroadcastf128 ymm6, [arg_table + (9 * 16)]
        vpcmpeqb ymm13, ymm1, [rel idx_rows_avx2 + (10 * 32)]
        vbroadcastf128 ymm7, [arg_table + (10 * 16)]

        vpshufb ymm3, ymm3, ymm2
        vpshufb ymm4, ymm4, ymm2
        vpshufb ymm5, ymm5, ymm2
        vpshufb ymm6, ymm6, ymm2
        vpshufb ymm7, ymm7, ymm2

        vpand   ymm9,  ymm9,  ymm3
        vpand   ymm10, ymm10, ymm4
        vpand   ymm11, ymm11, ymm5
        vpand   ymm12, ymm12, ymm6
        vpand   ymm13, ymm13, ymm7

        vpor    ymm9,  ymm9,  ymm10
        vpor    ymm11, ymm11, ymm12
        vpor    ymm15, ymm9,  ymm11
        vpor    ymm8,  ymm14, ymm13

        ;; arg_return, ymm15 & ymm8 carry current OR result

        vpcmpeqb ymm9,  ymm1, [rel idx_rows_avx2 + (11 * 32)]
        vbroadcastf128 ymm3, [arg_table + (11 * 16)]
        vpcmpeqb ymm10, ymm1, [rel idx_rows_avx2 + (12 * 32)]
        vbroadcastf128 ymm4, [arg_table + (12 * 16)]
        vpcmpeqb ymm11, ymm1, [rel idx_rows_avx2 + (13 * 32)]
        vbroadcastf128 ymm5, [arg_table + (13 * 16)]
        vpcmpeqb ymm12, ymm1, [rel idx_rows_avx2 + (14 * 32)]
        vbroadcastf128 ymm6, [arg_table + (14 * 16)]
        vpcmpeqb ymm13, ymm1, [rel idx_rows_avx2 + (15 * 32)]
        vbroadcastf128 ymm7, [arg_table + (15 * 16)]

        vpshufb ymm3, ymm3, ymm2
        vpshufb ymm4, ymm4, ymm2
        vpshufb ymm5, ymm5, ymm2
        vpshufb ymm6, ymm6, ymm2
        vpshufb ymm7, ymm7, ymm2

        vpand   ymm9,  ymm9,  ymm3
        vpand   ymm10, ymm10, ymm4
        vpand   ymm11, ymm11, ymm5
        vpand   ymm12, ymm12, ymm6
        vpand   ymm13, ymm13, ymm7

        vpor    ymm14, ymm15, ymm8
        vpor    ymm9,  ymm9,  ymm10
        vpor    ymm11, ymm11, ymm12
        vpor    ymm13, ymm13, ymm14
        vpor    ymm15, ymm9,  ymm11
        vpor    arg_return, arg_return, ymm13
        vpor    arg_return, arg_return, ymm15

%ifndef LINUX
        vmovdqa xmm15, [rsp + 9*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm9,  [rsp + 3*16]
        vmovdqa xmm8,  [rsp + 2*16]
        vmovdqa xmm7,  [rsp + 1*16]
        vmovdqa xmm6,  [rsp + 0*16]
%ifdef SAFE_DATA
        vpxor   ymm5, ymm5, ymm5
        vmovdqa [rsp + 0*16], ymm5
        vmovdqa [rsp + 2*16], ymm5
        vmovdqa [rsp + 4*16], ymm5
        vmovdqa [rsp + 6*16], ymm5
        vmovdqa [rsp + 8*16], ymm5
%endif
        mov     rsp, rax
%endif ; !LINUX
        ret
%undef arg_indexes
%undef arg_return
%undef arg_table

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
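
;; ---------------------------------------------------------------------
;; Usage sketch (C side), based on the prototypes documented above each
;; function; the wrapper name lookup_sbox_ct() is hypothetical and only
;; illustrates the calling contract:
;;
;;     #include <stdint.h>
;;
;;     extern uint8_t lookup_8bit_sse(const void *table, const uint32_t idx,
;;                                    const uint32_t size);
;;
;;     /* table must be 16-byte aligned (the loop uses movdqa loads) and
;;      * its size a multiple of 16 bytes */
;;     static uint8_t lookup_sbox_ct(const uint8_t *sbox256, uint8_t idx)
;;     {
;;             /* scans all 256 entries, so the memory access pattern is
;;              * independent of the (potentially secret) index value */
;;             return lookup_8bit_sse(sbox256, idx, 256);
;;     }
;; ---------------------------------------------------------------------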