; This file is generated from a similarly-named Perl script in the BoringSSL
; source tree. Do not edit by hand.

%ifdef BORINGSSL_PREFIX
%include "boringssl_prefix_symbols_nasm.inc"
%endif
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes .00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
$@feat.00 equ 1
%endif
section	.text	code align=64
%else
section	.text	code
%endif

;-----------------------------------------------------------------------
; _GFp_ChaCha20_ctr32 -- XOR a buffer with a ChaCha20 keystream.
;
; Syntax: NASM, 32-bit x86.  All five arguments are read from the stack;
; offsets below are relative to esp after the four pushes in the prologue:
;   [20+esp]  output pointer
;   [24+esp]  input pointer
;   [28+esp]  length in bytes (zero returns immediately)
;   [32+esp]  pointer to 8 key dwords (32 bytes)
;   [36+esp]  pointer to 4 dwords; the first is the block counter, which
;             is incremented once per 64-byte block
; Saved/restored: ebp, ebx, esi, edi.  Clobbered: eax, ecx, edx, flags
; (and xmm0-xmm7 on the SSSE3 path).
;
; Scalar-path frame after "sub esp,132":
;   [0..60+esp]     16-dword working state
;   [80..108+esp]   copy of the 8 key dwords
;   [112..124+esp]  copy of the 4 counter/nonce dwords
;   [128+esp]       remaining iterations of L$004loop
;   [152+esp], [156+esp], [160+esp]
;                   the caller's output/input/length argument slots,
;                   overwritten with the advancing pointers and length
;-----------------------------------------------------------------------
global	_GFp_ChaCha20_ctr32
align	16
_GFp_ChaCha20_ctr32:
L$_GFp_ChaCha20_ctr32_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
; Nothing to do for a zero length.
	xor	eax,eax
	cmp	eax,DWORD [28+esp]
	je	NEAR L$000no_data
; eax = run-time address of L$pic_point; the SSSE3 path uses it to locate
; L$ssse3_data.
	call	L$pic_point
L$pic_point:
	pop	eax
; Take the SSSE3 path only when bit 24 of capability word 0 and bit 9 of
; capability word 1 are both set.
; NOTE(review): presumably the FXSR and SSSE3 CPUID flags -- confirm
; against the definition of the capability vector.
	lea	ebp,[_GFp_ia32cap_P]
	test	DWORD [ebp],16777216
	jz	NEAR L$001x86
	test	DWORD [4+ebp],512
	jz	NEAR L$001x86
	jmp	NEAR L$ssse3_shortcut
L$001x86:
; Scalar path: copy key and counter/nonce into the local frame.
	mov	esi,DWORD [32+esp]
	mov	edi,DWORD [36+esp]
	sub	esp,132
	mov	eax,DWORD [esi]
	mov	ebx,DWORD [4+esi]
	mov	ecx,DWORD [8+esi]
	mov	edx,DWORD [12+esi]
	mov	DWORD [80+esp],eax
	mov	DWORD [84+esp],ebx
	mov	DWORD [88+esp],ecx
	mov	DWORD [92+esp],edx
	mov	eax,DWORD [16+esi]
	mov	ebx,DWORD [20+esi]
	mov	ecx,DWORD [24+esi]
	mov	edx,DWORD [28+esi]
	mov	DWORD [96+esp],eax
	mov	DWORD [100+esp],ebx
	mov	DWORD [104+esp],ecx
	mov	DWORD [108+esp],edx
	mov	eax,DWORD [edi]
	mov	ebx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	edx,DWORD [12+edi]
; Pre-decrement the counter: L$002entry adds 1 before every block, so the
; first block uses the caller's value.
	sub	eax,1
	mov	DWORD [112+esp],eax
	mov	DWORD [116+esp],ebx
	mov	DWORD [120+esp],ecx
	mov	DWORD [124+esp],edx
	jmp	NEAR L$002entry
align	16
L$003outer_loop:
; Reached after a full 64-byte block: ebx = next input, eax = next output,
; ecx = bytes remaining.  Park them in the argument slots.
	mov	DWORD [156+esp],ebx
	mov	DWORD [152+esp],eax
	mov	DWORD [160+esp],ecx
L$002entry:
; Build the working state for one block.  The four immediates are the
; ChaCha constant words (ASCII "expand 32-byte k", little-endian).
; State words 0, 4, 8, 9 and 12 start in eax, ebp, ecx, esi and edx; the
; others are stored in the working-state slots.
	mov	eax,1634760805
	mov	DWORD [4+esp],857760878
	mov	DWORD [8+esp],2036477234
	mov	DWORD [12+esp],1797285236
	mov	ebx,DWORD [84+esp]
	mov	ebp,DWORD [88+esp]
	mov	ecx,DWORD [104+esp]
	mov	esi,DWORD [108+esp]
	mov	edx,DWORD [116+esp]
	mov	edi,DWORD [120+esp]
	mov	DWORD [20+esp],ebx
	mov	DWORD [24+esp],ebp
	mov	DWORD [40+esp],ecx
	mov	DWORD [44+esp],esi
	mov	DWORD [52+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebx,DWORD [92+esp]
	mov	edi,DWORD [124+esp]
	mov	edx,DWORD [112+esp]
	mov	ebp,DWORD [80+esp]
	mov	ecx,DWORD [96+esp]
	mov	esi,DWORD [100+esp]
; Advance the block counter and write it back for the next block.
	add	edx,1
	mov	DWORD [28+esp],ebx
	mov	DWORD [60+esp],edi
	mov	DWORD [112+esp],edx
; 10 iterations of L$004loop per block.
	mov	ebx,10
	jmp	NEAR L$004loop
align	16
L$004loop:
; One iteration = eight interleaved add/xor/rol sequences with rotate
; counts 16, 12, 8, 7.  ebx doubles as a data register, so the iteration
; count lives in [128+esp] for the duration of the body.
	add	eax,ebp
	mov	DWORD [128+esp],ebx
	mov	ebx,ebp
	xor	edx,eax
	rol	edx,16
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [52+esp]
	rol	ebx,12
	mov	ebp,DWORD [20+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [48+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [32+esp],ecx
	rol	edi,16
	mov	DWORD [16+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [40+esp]
	xor	ebp,esi
	mov	edx,DWORD [56+esp]
	rol	ebp,12
	mov	ebx,DWORD [24+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [52+esp],edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [36+esp],esi
	rol	edx,16
	mov	DWORD [20+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [44+esp]
	xor	ebx,ecx
	mov	edi,DWORD [60+esp]
	rol	ebx,12
	mov	ebp,DWORD [28+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [56+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [24+esp],ebx
	add	esi,edi
	xor	ebp,esi
	rol	ebp,12
	mov	ebx,DWORD [20+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	edx,edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	rol	edx,16
	mov	DWORD [28+esp],ebp
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [48+esp]
	rol	ebx,12
	mov	ebp,DWORD [24+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [60+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [40+esp],ecx
	rol	edi,16
	mov	DWORD [20+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [32+esp]
	xor	ebp,esi
	mov	edx,DWORD [52+esp]
	rol	ebp,12
	mov	ebx,DWORD [28+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [48+esp],edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [44+esp],esi
	rol	edx,16
	mov	DWORD [24+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [36+esp]
	xor	ebx,ecx
	mov	edi,DWORD [56+esp]
	rol	ebx,12
	mov	ebp,DWORD [16+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [52+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [28+esp],ebx
	add	esi,edi
	xor	ebp,esi
	mov	edx,DWORD [48+esp]
	rol	ebp,12
	mov	ebx,DWORD [128+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	DWORD [56+esp],edi
	xor	ebp,esi
	rol	ebp,7
	dec	ebx
	jnz	NEAR L$004loop
; Rounds done.  Live registers: eax/ebp/ecx/esi/edx/edi = state words
; 0/4/8/9/12/14.  Add the input state back in, then either XOR a whole
; 64-byte block straight from input to output, or go to the tail.
	mov	ebx,DWORD [160+esp]
	add	eax,1634760805
	add	ebp,DWORD [80+esp]
	add	ecx,DWORD [96+esp]
	add	esi,DWORD [100+esp]
	cmp	ebx,64
	jb	NEAR L$005tail
; Full block: ebx = input pointer, eax (after word 0 is parked) = output
; pointer.
	mov	ebx,DWORD [156+esp]
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	xor	eax,DWORD [ebx]
	xor	ebp,DWORD [16+ebx]
	mov	DWORD [esp],eax
	mov	eax,DWORD [152+esp]
	xor	ecx,DWORD [32+ebx]
	xor	esi,DWORD [36+ebx]
	xor	edx,DWORD [48+ebx]
	xor	edi,DWORD [56+ebx]
	mov	DWORD [16+eax],ebp
	mov	DWORD [32+eax],ecx
	mov	DWORD [36+eax],esi
	mov	DWORD [48+eax],edx
	mov	DWORD [56+eax],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	xor	ebp,DWORD [4+ebx]
	xor	ecx,DWORD [8+ebx]
	xor	esi,DWORD [12+ebx]
	xor	edx,DWORD [20+ebx]
	xor	edi,DWORD [24+ebx]
	mov	DWORD [4+eax],ebp
	mov	DWORD [8+eax],ecx
	mov	DWORD [12+eax],esi
	mov	DWORD [20+eax],edx
	mov	DWORD [24+eax],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	xor	ebp,DWORD [28+ebx]
	xor	ecx,DWORD [40+ebx]
	xor	esi,DWORD [44+ebx]
	xor	edx,DWORD [52+ebx]
	xor	edi,DWORD [60+ebx]
; Advance input/output by one block and loop while bytes remain.
	lea	ebx,[64+ebx]
	mov	DWORD [28+eax],ebp
	mov	ebp,DWORD [esp]
	mov	DWORD [40+eax],ecx
	mov	ecx,DWORD [160+esp]
	mov	DWORD [44+eax],esi
	mov	DWORD [52+eax],edx
	mov	DWORD [60+eax],edi
	mov	DWORD [eax],ebp
	lea	eax,[64+eax]
	sub	ecx,64
	jnz	NEAR L$003outer_loop
	jmp	NEAR L$006done
L$005tail:
; Fewer than 64 bytes left (ebx = remaining count, non-zero): materialise
; the whole keystream block in [0..60+esp], then XOR it byte by byte.
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	mov	DWORD [esp],eax
	mov	DWORD [16+esp],ebp
	mov	DWORD [32+esp],ecx
	mov	DWORD [36+esp],esi
	mov	DWORD [48+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	mov	DWORD [4+esp],ebp
	mov	DWORD [8+esp],ecx
	mov	DWORD [12+esp],esi
	mov	DWORD [20+esp],edx
	mov	DWORD [24+esp],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	mov	DWORD [28+esp],ebp
	mov	ebp,DWORD [156+esp]
	mov	DWORD [40+esp],ecx
	mov	ecx,DWORD [152+esp]
	mov	DWORD [44+esp],esi
	xor	esi,esi
	mov	DWORD [52+esp],edx
	mov	DWORD [60+esp],edi
	xor	eax,eax
	xor	edx,edx
L$007tail_loop:
; ebp = input, ecx = output, esi = byte index, ebx = bytes left.
	mov	al,BYTE [ebp*1+esi]
	mov	dl,BYTE [esi*1+esp]
	lea	esi,[1+esi]
	xor	al,dl
	mov	BYTE [esi*1+ecx-1],al
	dec	ebx
	jnz	NEAR L$007tail_loop
L$006done:
	add	esp,132
L$000no_data:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;-----------------------------------------------------------------------
; __ChaCha20_ssse3 -- SSSE3 implementation.  Reads the same five stack
; arguments as _GFp_ChaCha20_ctr32, which jumps to L$ssse3_shortcut after
; pushing the same four registers.
;
; NOTE(review): the body derives the address of L$ssse3_data from the
; value left in eax by the call/pop at L$pic_point.  The prologue at this
; label does not set eax, and nothing visible in this file enters through
; this label.
;
; Frame: esp is lowered by 524 and rounded down to a 64-byte boundary;
; the caller-side esp is kept at [512+esp] and restored at L$011done.
; On the 4-block path [516+esp] and [520+esp] hold the key and counter
; pointers.
;
; L$ssse3_data layout (eax-relative):
;    0  pshufb mask, rotate each dword left by 16
;   16  pshufb mask, rotate each dword left by 8
;   32  the four ChaCha constant words
;   48  {0,1,2,3}     64  {4,4,4,4}
;   80  {1,0,0,0}     96  {4,0,0,0}     112  {0,-1,-1,-1}
;-----------------------------------------------------------------------
align	16
__ChaCha20_ssse3:
	push	ebp
	push	ebx
	push	esi
	push	edi
L$ssse3_shortcut:
; edi = output, esi = input, ecx = length, edx = key, ebx = counter block.
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
	mov	edx,DWORD [32+esp]
	mov	ebx,DWORD [36+esp]
	mov	ebp,esp
	sub	esp,524
	and	esp,-64
	mov	DWORD [512+esp],ebp
	lea	eax,[(L$ssse3_data-L$pic_point)+eax]
	movdqu	xmm3,[ebx]
; Inputs shorter than 256 bytes go straight to the one-block-at-a-time code.
	cmp	ecx,256
	jb	NEAR L$0081x
; 4-block path.  Each xmm value holds one state word for four consecutive
; blocks.  ebp = esp+384 addresses the broadcast input state:
;   [ebp-128..ebp-80]  constants
;   [ebp-64..ebp+48]   key words 0..7
;   [ebp+64..ebp+112]  counter/nonce words
; ebx = esp+128 addresses the working state with the same layout.
	mov	DWORD [516+esp],edx
	mov	DWORD [520+esp],ebx
	sub	ecx,256
	lea	ebp,[384+esp]
	movdqu	xmm7,[edx]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
; Lane counters = counter + {0,1,2,3} - 4; the outer loop adds 4 before
; every use.
	paddd	xmm0,[48+eax]
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	psubd	xmm0,[64+eax]
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[64+ebp],xmm0
	movdqa	[80+ebp],xmm1
	movdqa	[96+ebp],xmm2
	movdqa	[112+ebp],xmm3
	movdqu	xmm3,[16+edx]
	movdqa	[ebp-64],xmm4
	movdqa	[ebp-48],xmm5
	movdqa	[ebp-32],xmm6
	movdqa	[ebp-16],xmm7
	movdqa	xmm7,[32+eax]
	lea	ebx,[128+esp]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[ebp],xmm0
	movdqa	[16+ebp],xmm1
	movdqa	[32+ebp],xmm2
	movdqa	[48+ebp],xmm3
	movdqa	[ebp-128],xmm4
	movdqa	[ebp-112],xmm5
	movdqa	[ebp-96],xmm6
	movdqa	[ebp-80],xmm7
; Bias both data pointers by +128; the output stage addresses the four
; blocks at -128, -64, 0 and +64.
	lea	esi,[128+esi]
	lea	edi,[128+edi]
	jmp	NEAR L$009outer_loop
align	16
L$009outer_loop:
; Copy the input state into the working area, bump the lane counters by 4
; (written back to [64+ebp]) and preload the first registers.
	movdqa	xmm1,[ebp-112]
	movdqa	xmm2,[ebp-96]
	movdqa	xmm3,[ebp-80]
	movdqa	xmm5,[ebp-48]
	movdqa	xmm6,[ebp-32]
	movdqa	xmm7,[ebp-16]
	movdqa	[ebx-112],xmm1
	movdqa	[ebx-96],xmm2
	movdqa	[ebx-80],xmm3
	movdqa	[ebx-48],xmm5
	movdqa	[ebx-32],xmm6
	movdqa	[ebx-16],xmm7
	movdqa	xmm2,[32+ebp]
	movdqa	xmm3,[48+ebp]
	movdqa	xmm4,[64+ebp]
	movdqa	xmm5,[80+ebp]
	movdqa	xmm6,[96+ebp]
	movdqa	xmm7,[112+ebp]
	paddd	xmm4,[64+eax]
	movdqa	[32+ebx],xmm2
	movdqa	[48+ebx],xmm3
	movdqa	[64+ebx],xmm4
	movdqa	[80+ebx],xmm5
	movdqa	[96+ebx],xmm6
	movdqa	[112+ebx],xmm7
	movdqa	[64+ebp],xmm4
	movdqa	xmm0,[ebp-128]
	movdqa	xmm6,xmm4
	movdqa	xmm3,[ebp-64]
	movdqa	xmm4,[ebp]
	movdqa	xmm5,[16+ebp]
	mov	edx,10
	nop
align	16
L$010loop:
; Vector form of the round function, 10 iterations (edx).  Rotates by 16
; and 8 use pshufb with the masks at [eax] and [16+eax]; rotates by 12
; and 7 use a pslld/psrld/por triple.
	paddd	xmm0,xmm3
	movdqa	xmm2,xmm3
	pxor	xmm6,xmm0
	pshufb	xmm6,[eax]
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-48]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[80+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[64+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-64],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[32+ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-32]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[96+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[80+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[16+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-48],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[48+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-16]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[112+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[96+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-32],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-48]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	xmm6,xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	pshufb	xmm6,[eax]
	movdqa	[ebx-16],xmm3
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-32]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[64+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[112+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[32+ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-48],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-16]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[80+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[64+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[48+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-32],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[16+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-64]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[96+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[80+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-16],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	movdqa	xmm6,[64+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[96+ebx],xmm7
	pxor	xmm3,xmm5
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	por	xmm3,xmm1
	dec	edx
	jnz	NEAR L$010loop
; Spill the words still held in registers so the whole working state is
; in memory.
	movdqa	[ebx-64],xmm3
	movdqa	[ebx],xmm4
	movdqa	[16+ebx],xmm5
	movdqa	[64+ebx],xmm6
	movdqa	[96+ebx],xmm7
; Output stage, repeated for state words 0-3, 4-7, 8-11 and 12-15:
; add the input state, transpose the 4x4 dwords with punpck* so each
; register carries 16 contiguous bytes of one block, XOR with the input
; at -128/-64/0/+64 and store at the same offsets of the output.
; Words 0-3:
	movdqa	xmm1,[ebx-112]
	movdqa	xmm2,[ebx-96]
	movdqa	xmm3,[ebx-80]
	paddd	xmm0,[ebp-128]
	paddd	xmm1,[ebp-112]
	paddd	xmm2,[ebp-96]
	paddd	xmm3,[ebp-80]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx-64]
	pxor	xmm5,xmm1
	movdqa	xmm1,[ebx-48]
	pxor	xmm6,xmm2
	movdqa	xmm2,[ebx-32]
	pxor	xmm7,xmm3
	movdqa	xmm3,[ebx-16]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
; Words 4-7:
	paddd	xmm0,[ebp-64]
	paddd	xmm1,[ebp-48]
	paddd	xmm2,[ebp-32]
	paddd	xmm3,[ebp-16]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[16+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[32+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[48+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
; Words 8-11:
	paddd	xmm0,[ebp]
	paddd	xmm1,[16+ebp]
	paddd	xmm2,[32+ebp]
	paddd	xmm3,[48+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[64+ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[80+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[96+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[112+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
; Words 12-15:
	paddd	xmm0,[64+ebp]
	paddd	xmm1,[80+ebp]
	paddd	xmm2,[96+ebp]
	paddd	xmm3,[112+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
; 3*16 + 208 = 256: the pointers now address the next four blocks.
	lea	esi,[208+esi]
	pxor	xmm4,xmm0
	pxor	xmm5,xmm1
	pxor	xmm6,xmm2
	pxor	xmm7,xmm3
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[208+edi]
	sub	ecx,256
	jnc	NEAR L$009outer_loop
; Fewer than 256 bytes remain.  Undo the last subtraction; if anything is
; left, remove the +128 pointer bias and rebuild a single counter block:
; dword 0 = lane-0 counter + 4, dwords 1-3 from the caller's block.
	add	ecx,256
	jz	NEAR L$011done
	mov	ebx,DWORD [520+esp]
	lea	esi,[esi-128]
	mov	edx,DWORD [516+esp]
	lea	edi,[edi-128]
	movd	xmm2,DWORD [64+ebp]
	movdqu	xmm3,[ebx]
	paddd	xmm2,[96+eax]
	pand	xmm3,[112+eax]
	por	xmm3,xmm2
L$0081x:
; One-block path.  xmm0 = constants, xmm1/xmm2 = key halves, xmm3 =
; counter block, xmm6/xmm7 = rotate-by-16/8 pshufb masks.  The input
; state is kept at [0..63+esp] for the final addition.
	movdqa	xmm0,[32+eax]
	movdqu	xmm1,[edx]
	movdqu	xmm2,[16+edx]
	movdqa	xmm6,[eax]
	movdqa	xmm7,[16+eax]
; This dword store is overwritten by the movdqa to [48+esp] just below.
	mov	DWORD [48+esp],ebp
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	mov	edx,10
	jmp	NEAR L$012loop1x
align	16
L$013outer1x:
; Next block: counter block += {1,0,0,0}, reload the input state.
	movdqa	xmm3,[80+eax]
	movdqa	xmm0,[esp]
	movdqa	xmm1,[16+esp]
	movdqa	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	mov	edx,10
	movdqa	[48+esp],xmm3
	jmp	NEAR L$012loop1x
align	16
L$012loop1x:
; 10 iterations (edx).  The state is held as four rows; the pshufd
; shuffles between the two halves rotate rows 1-3 and then rotate them
; back.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,222				; pshufb xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,223				; pshufb xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,57
	pshufd	xmm3,xmm3,147
	nop
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,222				; pshufb xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
db	102,15,56,0,223				; pshufb xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,147
	pshufd	xmm3,xmm3,57
	dec	edx
	jnz	NEAR L$012loop1x
; Add the input state to obtain the keystream block.
	paddd	xmm0,[esp]
	paddd	xmm1,[16+esp]
	paddd	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	cmp	ecx,64
	jb	NEAR L$014tail
; Whole block: XOR 64 input bytes and store them.
	movdqu	xmm4,[esi]
	movdqu	xmm5,[16+esi]
	pxor	xmm0,xmm4
	movdqu	xmm4,[32+esi]
	pxor	xmm1,xmm5
	movdqu	xmm5,[48+esi]
	pxor	xmm2,xmm4
	pxor	xmm3,xmm5
	lea	esi,[64+esi]
	movdqu	[edi],xmm0
	movdqu	[16+edi],xmm1
	movdqu	[32+edi],xmm2
	movdqu	[48+edi],xmm3
	lea	edi,[64+edi]
	sub	ecx,64
	jnz	NEAR L$013outer1x
	jmp	NEAR L$011done
L$014tail:
; Fewer than 64 bytes left (ecx non-zero): spill the keystream to the
; stack and XOR byte by byte.  eax and ebp are reused as scratch here.
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	xor	eax,eax
	xor	edx,edx
	xor	ebp,ebp
L$015tail_loop:
	mov	al,BYTE [ebp*1+esp]
	mov	dl,BYTE [ebp*1+esi]
	lea	ebp,[1+ebp]
	xor	al,dl
	mov	BYTE [ebp*1+edi-1],al
	dec	ecx
	jnz	NEAR L$015tail_loop
L$011done:
; Restore the stack pointer saved at L$ssse3_shortcut.
	mov	esp,DWORD [512+esp]
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; Constant table for the SSSE3 code; offsets are listed in the
; __ChaCha20_ssse3 header.
align	64
L$ssse3_data:
db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
dd	1634760805,857760878,2036477234,1797285236
dd	0,1,2,3
dd	4,4,4,4
dd	1,0,0,0
dd	4,0,0,0
dd	0,-1,-1,-1
align	64
; ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
db	114,103,62,0
segment	.bss
; CPU capability words tested at the top of _GFp_ChaCha20_ctr32.
common	_GFp_ia32cap_P 16