# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

/*
 * ChaCha20 keystream XOR for x86-64, AT&T syntax.  The .private_extern
 * directive and the L$-prefixed local labels suggest this is the Mach-O
 * (macOS) flavour of the generator output.
 *
 * There is one exported entry point, _GFp_ChaCha20_ctr32.  It selects one of
 * four code paths:
 *   scalar             general-purpose registers only
 *   L$ChaCha20_ssse3   one 64-byte block per pass, one state row per xmm
 *   L$ChaCha20_4x      four blocks per pass, one state word per xmm
 *   L$ChaCha20_8x      eight blocks per pass in ymm registers
 *
 * Register arguments, as the code below consumes them:
 *   %rdi   destination buffer
 *   %rsi   source buffer, XORed with the keystream
 *   %rdx   length in bytes (zero returns immediately)
 *   %rcx   pointer to 32 bytes loaded as state words 4..11
 *          (the key, in ChaCha20 terms)
 *   %r8    pointer to 16 bytes loaded as state words 12..15.  Word 12 is
 *          the block counter: it is the only word that gets incremented.
 *          Words 13..15 are presumably the nonce - confirm with the caller.
 */

/* ---- constant tables (kept in .text, read-only) ---- */

.p2align 6
L$zero:
/* four zero dwords; not referenced by the code visible in this part of the file */
.long 0,0,0,0
L$one:
/* counter step for the scalar and ssse3 paths: adds 1 to state word 12 */
.long 1,0,0,0
L$inc:
/* per-lane counter offsets for the 4x path */
.long 0,1,2,3
L$four:
/* counter step for the 4x path */
.long 4,4,4,4
L$incy:
/* per-lane counter offsets for the 8x path */
.long 0,2,4,6,1,3,5,7
L$eight:
/* counter step for the 8x path */
.long 8,8,8,8,8,8,8,8
L$rot16:
/* pshufb mask: rotate every dword left by 16 bits */
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
L$rot24:
/* pshufb mask: rotate every dword left by 8 bits (right by 24) */
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
L$sigma:
/* ASCII "expand 32-byte k" followed by a NUL: state words 0..3 */
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.p2align 6
/* The four 16-dword tables below are not referenced by the code visible in this part of the file. */
L$zeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
/* ASCII "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated */
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0

/*
 * _GFp_ChaCha20_ctr32 - exported entry point and scalar code path.
 *
 * Dispatch: a zero length returns at once.  Otherwise %r10 receives the 64
 * bits stored at _GFp_ia32cap_P+4, and bit 9 (mask 512) of its low dword
 * sends execution to L$ChaCha20_ssse3.  Going by that label this is the
 * SSSE3 capability flag - confirm against the code that fills
 * _GFp_ia32cap_P.  %r10 stays live for the further checks in L$ChaCha20_4x.
 *
 * Scalar frame: six callee-saved pushes plus 64+24 bytes.  With the usual
 * ABI entry alignment this leaves %rsp 16-byte aligned, which the movdqa
 * stores below rely on.
 *    0..15(%rsp)   unused until L$tail, then keystream words 0..3
 *   16..31(%rsp)   state words 4..7
 *   32..47(%rsp)   state words 8..11; inside L$oop this is the spill area
 *                  for the two of these words not held in %esi/%edi
 *   48..63(%rsp)   state words 12..15 (word 12 = counter of this block)
 *   64(%rsp)       remaining length      (%rbp is reused as round counter)
 *   72(%rsp)       source pointer        (%rsi is reused for state)
 *   80(%rsp)       destination pointer   (%rdi is reused for state)
 *
 * Register roles inside L$oop:
 *   %eax %ebx %ecx %edx   words 0..3
 *   %r8d..%r11d           words 4..7
 *   %esi %edi             two of words 8..11 at a time
 *   %r12d..%r15d          words 12..15
 *   %ebp                  double-round counter, 10 iterations
 *   %xmm2 = original words 8..11, %xmm3 = words 12..15 with the running
 *   counter, %xmm4 = L$one
 */
.globl _GFp_ChaCha20_ctr32
.private_extern _GFp_ChaCha20_ctr32

.p2align 6
_GFp_ChaCha20_ctr32:
	cmpq	$0,%rdx
	je	L$no_data
	movq	_GFp_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	L$ChaCha20_ssse3

	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$64+24,%rsp
L$ctr32_body:
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$one(%rip),%xmm4

	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp		/* %rbp = bytes remaining */
	jmp	L$oop_outer

/* One iteration per 64-byte block. */
.p2align 5
L$oop_outer:
	/* words 0..3: same values as the four dwords of L$sigma */
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d		/* current block counter */
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214		/* movq %xmm2,%rsi : words 8 and 9 */
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi		/* %esi = word 8, %edi = word 9 */
	jmp	L$oop

/*
 * One double round per iteration.  Every quarter round uses the rotate
 * amounts 16, 12, 8, 7; two quarter rounds are interleaved at a time.
 */
.p2align 5
L$oop:
	/* column round: (0,4,8,12) and (1,5,9,13) */
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	/* park words 8,9; bring in words 10,11 */
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	/* column round: (2,6,10,14) and (3,7,11,15) */
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	/* diagonal round: (0,5,10,15) and (1,6,11,12) */
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	/* park words 10,11; bring back words 8,9 */
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	/* diagonal round: (2,7,8,13) and (3,4,9,14) */
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	L$oop

	/* spill words 8,9 so 32..47(%rsp) holds all of the worked words 8..11 */
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1		/* original words 8..11 */
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3		/* counter for the next block */
	movq	64+16(%rsp),%rdi

	/* feed-forward: add the input state to the worked state */
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d		/* still the counter of this block */
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	L$tail

	/* full block: XOR 64 source bytes with the keystream */
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	/* re-arm the frame for the next block */
	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	L$oop_outer

	jmp	L$done

/* Fewer than 64 bytes left: dump the keystream block to 0..63(%rsp). */
.p2align 4
L$tail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx		/* %rbx = byte index */
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

/* byte-wise XOR of the remaining %rbp bytes */
L$oop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	L$oop_tail

L$done:
	/* %rsi = %rsp as it was right after the return address was pushed */
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$no_data:
	.byte	0xf3,0xc3		/* rep ret */

/*
 * L$ChaCha20_ssse3 - one block per pass, one state row per xmm register.
 *
 * Entered from _GFp_ChaCha20_ctr32 with the argument registers untouched
 * and %r10 holding the capability bits.  Lengths above 128 bytes go to
 * L$ChaCha20_4x.  L$do_sse3_after_all is also the re-entry point that
 * L$ChaCha20_4x jumps back to when it declines the request.
 *
 *   %xmm0..%xmm3   state rows 0..3
 *   %xmm4 %xmm5    scratch
 *   %xmm6 %xmm7    pshufb masks L$rot16 and L$rot24
 *   %r8            double-round counter, 10 iterations
 *   %r9            %rsp at entry, restored at L$done_ssse3
 *   0..63(%rsp)    copy of the input state; in the tail it is overwritten
 *                  with the keystream block
 * No callee-saved register is modified on this path.
 */
.p2align 5
ChaCha20_ssse3:
L$ChaCha20_ssse3:
	movq	%rsp,%r9
	cmpq	$128,%rdx
	ja	L$ChaCha20_4x

L$do_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	L$sigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$rot16(%rip),%xmm6
	movdqa	L$rot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	L$oop_ssse3

/* Next block: reload the state and step the counter in 48(%rsp) by one. */
.p2align 5
L$oop_outer_ssse3:
	movdqa	L$one(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	L$oop_ssse3

.p2align 5
L$oop_ssse3:
	/* column round on all four columns at once */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			/* pshufb %xmm6,%xmm3 : rotl 16 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			/* pshufb %xmm7,%xmm3 : rotl 8 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	/* rotate rows 1..3 by 1, 2 and 3 lanes so diagonals become columns */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	/* diagonal round */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			/* pshufb %xmm6,%xmm3 : rotl 16 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			/* pshufb %xmm7,%xmm3 : rotl 8 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	/* rotate the rows back into column order */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	L$oop_ssse3
	/* feed-forward */
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	L$tail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	L$oop_outer_ssse3

	jmp	L$done_ssse3

/* Fewer than 64 bytes left: spill the keystream and XOR byte by byte. */
.p2align 4
L$tail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8			/* %r8 = byte index */

L$oop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	L$oop_tail_ssse3

L$done_ssse3:
	leaq	(%r9),%rsp
L$ssse3_epilogue:
	.byte	0xf3,0xc3		/* rep ret */

/*
 * L$ChaCha20_4x - four blocks per pass; every xmm register carries the same
 * state word of four consecutive blocks.
 *
 * Dispatch on entry (%r10 = the 64 bits loaded from _GFp_ia32cap_P+4):
 *   - bit 5 of the upper dword set -> L$ChaCha20_8x
 *   - length <= 192 and, of bits 22 and 26 of the lower dword (mask
 *     71303168), only bit 22 is set -> back to L$do_sse3_after_all
 *   - anything else continues at L$proceed4x
 * NOTE(review): what these capability bits stand for is decided by the code
 * that fills _GFp_ia32cap_P; confirm there.
 *
 * Frame of 0x140+8 bytes, 16-byte aligned under the usual entry alignment:
 *     0..63(%rsp)    scratch: the two words of 8..11 that are not in
 *                    %xmm4/%xmm5, later the transposed words 0..3
 *    64..127(%rsp)   broadcast words 0..3 (from L$sigma)
 *   128..191(%rsp)   broadcast words 4..7
 *   192..255(%rsp)   broadcast words 8..11
 *   256..319(%rsp)   words 12..15; 256(%rsp) holds the four lane counters
 * Offsets from 128 upward are written as N-256(%rcx) with %rcx = %rsp+256.
 *
 * Register roles inside L$oop4x:
 *   %xmm8..%xmm11    words 0..3
 *   %xmm12..%xmm15   words 4..7
 *   %xmm4 %xmm5      two of words 8..11 at a time
 *   %xmm0..%xmm3     words 12..15
 *   %xmm6 %xmm7      scratch, alternately reloaded with the pshufb masks
 *   %r10 = address of L$rot16, %r11 = address of L$rot24
 *   %eax = double-round counter, %r9 = %rsp at entry
 */
.p2align 5
ChaCha20_4x:
L$ChaCha20_4x:
	movq	%rsp,%r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	L$ChaCha20_8x
	cmpq	$192,%rdx
	ja	L$proceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	L$do_sse3_after_all

L$proceed4x:
	subq	$0x140+8,%rsp
	movdqa	L$sigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	L$rot16(%rip),%r10
	leaq	L$rot24(%rip),%r11

	/* broadcast each state word across a register and save a copy */
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	L$inc(%rip),%xmm0	/* lane counters: +0,+1,+2,+3 */
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	L$oop_enter4x

/* Next batch of four blocks: reload the saved state, counters += 4. */
.p2align 5
L$oop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	L$four(%rip),%xmm0

L$oop_enter4x:
	/* words 10,11 go to scratch; %xmm6/%xmm7 become temporaries */
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)	/* remember the counters of this batch */
	jmp	L$oop4x

.p2align 5
L$oop4x:
	/* column round: (0,4,8,12) and (1,5,9,13) */
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199			/* pshufb %xmm7,%xmm0 */
.byte	102,15,56,0,207			/* pshufb %xmm7,%xmm1 */
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198			/* pshufb %xmm6,%xmm0 */
.byte	102,15,56,0,206			/* pshufb %xmm6,%xmm1 */
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	/* park words 8,9; bring in words 10,11 */
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	/* column round: (2,6,10,14) and (3,7,11,15) */
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215			/* pshufb %xmm7,%xmm2 */
.byte	102,15,56,0,223			/* pshufb %xmm7,%xmm3 */
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214			/* pshufb %xmm6,%xmm2 */
.byte	102,15,56,0,222			/* pshufb %xmm6,%xmm3 */
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	/* diagonal round: (0,5,10,15) and (1,6,11,12) */
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223			/* pshufb %xmm7,%xmm3 */
.byte	102,15,56,0,199			/* pshufb %xmm7,%xmm0 */
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222			/* pshufb %xmm6,%xmm3 */
.byte	102,15,56,0,198			/* pshufb %xmm6,%xmm0 */
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	/* park words 10,11; bring back words 8,9 */
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	/* diagonal round: (2,7,8,13) and (3,4,9,14) */
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207			/* pshufb %xmm7,%xmm1 */
.byte	102,15,56,0,215			/* pshufb %xmm7,%xmm2 */
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206			/* pshufb %xmm6,%xmm1 */
.byte	102,15,56,0,214			/* pshufb %xmm6,%xmm2 */
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	L$oop4x

	/*
	 * Feed-forward plus 4x4 dword transposes, one group of four state
	 * words at a time, so that each register ends up holding 16
	 * consecutive keystream bytes of a single block.
	 */
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	/* words 0..3 of blocks 0,1 go to scratch; fetch worked words 10,11 */
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	/* words 0..3 of blocks 2,3 go to scratch */
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	/*
	 * Keystream now sits in:
	 *   block 0:  0(%rsp), %xmm12, %xmm4,  %xmm0
	 *   block 1: 16(%rsp), %xmm13, %xmm5,  %xmm1
	 *   block 2: 32(%rsp), %xmm10, %xmm14, %xmm8
	 *   block 3: 48(%rsp), %xmm15, %xmm9,  %xmm3
	 */
	cmpq	$256,%rdx
	jb	L$tail4x

	/* full 256 bytes; %xmm6 %xmm11 %xmm2 %xmm7 carry the source data */
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	L$oop_outer4x

	jmp	L$done4x

/*
 * Fewer than 256 bytes left.  Each L$NNN_or_more4x handler emits the whole
 * blocks it can, then leaves the next keystream block in 0..63(%rsp) for
 * the byte loop.  The je after the stores still tests the flags of the cmpq
 * that selected the handler: the SSE moves, pxor and leaq in between do not
 * write flags.
 */
L$tail4x:
	cmpq	$192,%rdx
	jae	L$192_or_more4x
	cmpq	$128,%rdx
	jae	L$128_or_more4x
	cmpq	$64,%rdx
	jae	L$64_or_more4x

	/* under 64 bytes: 0(%rsp) already holds words 0..3 of block 0 */
	xorq	%r10,%r10
	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	L$oop_tail4x

.p2align 5
L$64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	/* stage block 1 for the byte loop */
	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	L$oop_tail4x

.p2align 5
L$128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	L$done4x

	/* stage block 2 for the byte loop */
	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	L$oop_tail4x

.p2align 5
L$192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	/* stage block 3 for the byte loop; pointers already moved by 128 */
	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

/* byte-wise XOR of the remaining %rdx bytes; %r10 = byte index */
L$oop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	L$oop_tail4x

L$done4x:
	leaq	(%r9),%rsp
L$4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */

/*
 * L$ChaCha20_8x - eight blocks per pass using 256-bit ymm registers.
 * %r9 keeps %rsp as it was at entry.  The frame is 0x280+8 bytes, after
 * which %rsp is rounded down to a 32-byte boundary for the vmovdqa
 * accesses that follow.
 */
.p2align 5
ChaCha20_8x:
L$ChaCha20_8x:
	movq	%rsp,%r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	L$sigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
vbroadcasti128 16(%rcx),%ymm15 1039 vbroadcasti128 (%r8),%ymm7 1040 leaq 256(%rsp),%rcx 1041 leaq 512(%rsp),%rax 1042 leaq L$rot16(%rip),%r10 1043 leaq L$rot24(%rip),%r11 1044 1045 vpshufd $0x00,%ymm11,%ymm8 1046 vpshufd $0x55,%ymm11,%ymm9 1047 vmovdqa %ymm8,128-256(%rcx) 1048 vpshufd $0xaa,%ymm11,%ymm10 1049 vmovdqa %ymm9,160-256(%rcx) 1050 vpshufd $0xff,%ymm11,%ymm11 1051 vmovdqa %ymm10,192-256(%rcx) 1052 vmovdqa %ymm11,224-256(%rcx) 1053 1054 vpshufd $0x00,%ymm3,%ymm0 1055 vpshufd $0x55,%ymm3,%ymm1 1056 vmovdqa %ymm0,256-256(%rcx) 1057 vpshufd $0xaa,%ymm3,%ymm2 1058 vmovdqa %ymm1,288-256(%rcx) 1059 vpshufd $0xff,%ymm3,%ymm3 1060 vmovdqa %ymm2,320-256(%rcx) 1061 vmovdqa %ymm3,352-256(%rcx) 1062 1063 vpshufd $0x00,%ymm15,%ymm12 1064 vpshufd $0x55,%ymm15,%ymm13 1065 vmovdqa %ymm12,384-512(%rax) 1066 vpshufd $0xaa,%ymm15,%ymm14 1067 vmovdqa %ymm13,416-512(%rax) 1068 vpshufd $0xff,%ymm15,%ymm15 1069 vmovdqa %ymm14,448-512(%rax) 1070 vmovdqa %ymm15,480-512(%rax) 1071 1072 vpshufd $0x00,%ymm7,%ymm4 1073 vpshufd $0x55,%ymm7,%ymm5 1074 vpaddd L$incy(%rip),%ymm4,%ymm4 1075 vpshufd $0xaa,%ymm7,%ymm6 1076 vmovdqa %ymm5,544-512(%rax) 1077 vpshufd $0xff,%ymm7,%ymm7 1078 vmovdqa %ymm6,576-512(%rax) 1079 vmovdqa %ymm7,608-512(%rax) 1080 1081 jmp L$oop_enter8x 1082 1083.p2align 5 1084L$oop_outer8x: 1085 vmovdqa 128-256(%rcx),%ymm8 1086 vmovdqa 160-256(%rcx),%ymm9 1087 vmovdqa 192-256(%rcx),%ymm10 1088 vmovdqa 224-256(%rcx),%ymm11 1089 vmovdqa 256-256(%rcx),%ymm0 1090 vmovdqa 288-256(%rcx),%ymm1 1091 vmovdqa 320-256(%rcx),%ymm2 1092 vmovdqa 352-256(%rcx),%ymm3 1093 vmovdqa 384-512(%rax),%ymm12 1094 vmovdqa 416-512(%rax),%ymm13 1095 vmovdqa 448-512(%rax),%ymm14 1096 vmovdqa 480-512(%rax),%ymm15 1097 vmovdqa 512-512(%rax),%ymm4 1098 vmovdqa 544-512(%rax),%ymm5 1099 vmovdqa 576-512(%rax),%ymm6 1100 vmovdqa 608-512(%rax),%ymm7 1101 vpaddd L$eight(%rip),%ymm4,%ymm4 1102 1103L$oop_enter8x: 1104 vmovdqa %ymm14,64(%rsp) 1105 vmovdqa %ymm15,96(%rsp) 1106 vbroadcasti128 (%r10),%ymm15 1107 
vmovdqa %ymm4,512-512(%rax) 1108 movl $10,%eax 1109 jmp L$oop8x 1110 1111.p2align 5 1112L$oop8x: 1113 vpaddd %ymm0,%ymm8,%ymm8 1114 vpxor %ymm4,%ymm8,%ymm4 1115 vpshufb %ymm15,%ymm4,%ymm4 1116 vpaddd %ymm1,%ymm9,%ymm9 1117 vpxor %ymm5,%ymm9,%ymm5 1118 vpshufb %ymm15,%ymm5,%ymm5 1119 vpaddd %ymm4,%ymm12,%ymm12 1120 vpxor %ymm0,%ymm12,%ymm0 1121 vpslld $12,%ymm0,%ymm14 1122 vpsrld $20,%ymm0,%ymm0 1123 vpor %ymm0,%ymm14,%ymm0 1124 vbroadcasti128 (%r11),%ymm14 1125 vpaddd %ymm5,%ymm13,%ymm13 1126 vpxor %ymm1,%ymm13,%ymm1 1127 vpslld $12,%ymm1,%ymm15 1128 vpsrld $20,%ymm1,%ymm1 1129 vpor %ymm1,%ymm15,%ymm1 1130 vpaddd %ymm0,%ymm8,%ymm8 1131 vpxor %ymm4,%ymm8,%ymm4 1132 vpshufb %ymm14,%ymm4,%ymm4 1133 vpaddd %ymm1,%ymm9,%ymm9 1134 vpxor %ymm5,%ymm9,%ymm5 1135 vpshufb %ymm14,%ymm5,%ymm5 1136 vpaddd %ymm4,%ymm12,%ymm12 1137 vpxor %ymm0,%ymm12,%ymm0 1138 vpslld $7,%ymm0,%ymm15 1139 vpsrld $25,%ymm0,%ymm0 1140 vpor %ymm0,%ymm15,%ymm0 1141 vbroadcasti128 (%r10),%ymm15 1142 vpaddd %ymm5,%ymm13,%ymm13 1143 vpxor %ymm1,%ymm13,%ymm1 1144 vpslld $7,%ymm1,%ymm14 1145 vpsrld $25,%ymm1,%ymm1 1146 vpor %ymm1,%ymm14,%ymm1 1147 vmovdqa %ymm12,0(%rsp) 1148 vmovdqa %ymm13,32(%rsp) 1149 vmovdqa 64(%rsp),%ymm12 1150 vmovdqa 96(%rsp),%ymm13 1151 vpaddd %ymm2,%ymm10,%ymm10 1152 vpxor %ymm6,%ymm10,%ymm6 1153 vpshufb %ymm15,%ymm6,%ymm6 1154 vpaddd %ymm3,%ymm11,%ymm11 1155 vpxor %ymm7,%ymm11,%ymm7 1156 vpshufb %ymm15,%ymm7,%ymm7 1157 vpaddd %ymm6,%ymm12,%ymm12 1158 vpxor %ymm2,%ymm12,%ymm2 1159 vpslld $12,%ymm2,%ymm14 1160 vpsrld $20,%ymm2,%ymm2 1161 vpor %ymm2,%ymm14,%ymm2 1162 vbroadcasti128 (%r11),%ymm14 1163 vpaddd %ymm7,%ymm13,%ymm13 1164 vpxor %ymm3,%ymm13,%ymm3 1165 vpslld $12,%ymm3,%ymm15 1166 vpsrld $20,%ymm3,%ymm3 1167 vpor %ymm3,%ymm15,%ymm3 1168 vpaddd %ymm2,%ymm10,%ymm10 1169 vpxor %ymm6,%ymm10,%ymm6 1170 vpshufb %ymm14,%ymm6,%ymm6 1171 vpaddd %ymm3,%ymm11,%ymm11 1172 vpxor %ymm7,%ymm11,%ymm7 1173 vpshufb %ymm14,%ymm7,%ymm7 1174 vpaddd %ymm6,%ymm12,%ymm12 1175 vpxor 
%ymm2,%ymm12,%ymm2 1176 vpslld $7,%ymm2,%ymm15 1177 vpsrld $25,%ymm2,%ymm2 1178 vpor %ymm2,%ymm15,%ymm2 1179 vbroadcasti128 (%r10),%ymm15 1180 vpaddd %ymm7,%ymm13,%ymm13 1181 vpxor %ymm3,%ymm13,%ymm3 1182 vpslld $7,%ymm3,%ymm14 1183 vpsrld $25,%ymm3,%ymm3 1184 vpor %ymm3,%ymm14,%ymm3 1185 vpaddd %ymm1,%ymm8,%ymm8 1186 vpxor %ymm7,%ymm8,%ymm7 1187 vpshufb %ymm15,%ymm7,%ymm7 1188 vpaddd %ymm2,%ymm9,%ymm9 1189 vpxor %ymm4,%ymm9,%ymm4 1190 vpshufb %ymm15,%ymm4,%ymm4 1191 vpaddd %ymm7,%ymm12,%ymm12 1192 vpxor %ymm1,%ymm12,%ymm1 1193 vpslld $12,%ymm1,%ymm14 1194 vpsrld $20,%ymm1,%ymm1 1195 vpor %ymm1,%ymm14,%ymm1 1196 vbroadcasti128 (%r11),%ymm14 1197 vpaddd %ymm4,%ymm13,%ymm13 1198 vpxor %ymm2,%ymm13,%ymm2 1199 vpslld $12,%ymm2,%ymm15 1200 vpsrld $20,%ymm2,%ymm2 1201 vpor %ymm2,%ymm15,%ymm2 1202 vpaddd %ymm1,%ymm8,%ymm8 1203 vpxor %ymm7,%ymm8,%ymm7 1204 vpshufb %ymm14,%ymm7,%ymm7 1205 vpaddd %ymm2,%ymm9,%ymm9 1206 vpxor %ymm4,%ymm9,%ymm4 1207 vpshufb %ymm14,%ymm4,%ymm4 1208 vpaddd %ymm7,%ymm12,%ymm12 1209 vpxor %ymm1,%ymm12,%ymm1 1210 vpslld $7,%ymm1,%ymm15 1211 vpsrld $25,%ymm1,%ymm1 1212 vpor %ymm1,%ymm15,%ymm1 1213 vbroadcasti128 (%r10),%ymm15 1214 vpaddd %ymm4,%ymm13,%ymm13 1215 vpxor %ymm2,%ymm13,%ymm2 1216 vpslld $7,%ymm2,%ymm14 1217 vpsrld $25,%ymm2,%ymm2 1218 vpor %ymm2,%ymm14,%ymm2 1219 vmovdqa %ymm12,64(%rsp) 1220 vmovdqa %ymm13,96(%rsp) 1221 vmovdqa 0(%rsp),%ymm12 1222 vmovdqa 32(%rsp),%ymm13 1223 vpaddd %ymm3,%ymm10,%ymm10 1224 vpxor %ymm5,%ymm10,%ymm5 1225 vpshufb %ymm15,%ymm5,%ymm5 1226 vpaddd %ymm0,%ymm11,%ymm11 1227 vpxor %ymm6,%ymm11,%ymm6 1228 vpshufb %ymm15,%ymm6,%ymm6 1229 vpaddd %ymm5,%ymm12,%ymm12 1230 vpxor %ymm3,%ymm12,%ymm3 1231 vpslld $12,%ymm3,%ymm14 1232 vpsrld $20,%ymm3,%ymm3 1233 vpor %ymm3,%ymm14,%ymm3 1234 vbroadcasti128 (%r11),%ymm14 1235 vpaddd %ymm6,%ymm13,%ymm13 1236 vpxor %ymm0,%ymm13,%ymm0 1237 vpslld $12,%ymm0,%ymm15 1238 vpsrld $20,%ymm0,%ymm0 1239 vpor %ymm0,%ymm15,%ymm0 1240 vpaddd %ymm3,%ymm10,%ymm10 1241 vpxor 
%ymm5,%ymm10,%ymm5 1242 vpshufb %ymm14,%ymm5,%ymm5 1243 vpaddd %ymm0,%ymm11,%ymm11 1244 vpxor %ymm6,%ymm11,%ymm6 1245 vpshufb %ymm14,%ymm6,%ymm6 1246 vpaddd %ymm5,%ymm12,%ymm12 1247 vpxor %ymm3,%ymm12,%ymm3 1248 vpslld $7,%ymm3,%ymm15 1249 vpsrld $25,%ymm3,%ymm3 1250 vpor %ymm3,%ymm15,%ymm3 1251 vbroadcasti128 (%r10),%ymm15 1252 vpaddd %ymm6,%ymm13,%ymm13 1253 vpxor %ymm0,%ymm13,%ymm0 1254 vpslld $7,%ymm0,%ymm14 1255 vpsrld $25,%ymm0,%ymm0 1256 vpor %ymm0,%ymm14,%ymm0 1257 decl %eax 1258 jnz L$oop8x 1259 1260 leaq 512(%rsp),%rax 1261 vpaddd 128-256(%rcx),%ymm8,%ymm8 1262 vpaddd 160-256(%rcx),%ymm9,%ymm9 1263 vpaddd 192-256(%rcx),%ymm10,%ymm10 1264 vpaddd 224-256(%rcx),%ymm11,%ymm11 1265 1266 vpunpckldq %ymm9,%ymm8,%ymm14 1267 vpunpckldq %ymm11,%ymm10,%ymm15 1268 vpunpckhdq %ymm9,%ymm8,%ymm8 1269 vpunpckhdq %ymm11,%ymm10,%ymm10 1270 vpunpcklqdq %ymm15,%ymm14,%ymm9 1271 vpunpckhqdq %ymm15,%ymm14,%ymm14 1272 vpunpcklqdq %ymm10,%ymm8,%ymm11 1273 vpunpckhqdq %ymm10,%ymm8,%ymm8 1274 vpaddd 256-256(%rcx),%ymm0,%ymm0 1275 vpaddd 288-256(%rcx),%ymm1,%ymm1 1276 vpaddd 320-256(%rcx),%ymm2,%ymm2 1277 vpaddd 352-256(%rcx),%ymm3,%ymm3 1278 1279 vpunpckldq %ymm1,%ymm0,%ymm10 1280 vpunpckldq %ymm3,%ymm2,%ymm15 1281 vpunpckhdq %ymm1,%ymm0,%ymm0 1282 vpunpckhdq %ymm3,%ymm2,%ymm2 1283 vpunpcklqdq %ymm15,%ymm10,%ymm1 1284 vpunpckhqdq %ymm15,%ymm10,%ymm10 1285 vpunpcklqdq %ymm2,%ymm0,%ymm3 1286 vpunpckhqdq %ymm2,%ymm0,%ymm0 1287 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1288 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1289 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1290 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1291 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1292 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1293 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1294 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1295 vmovdqa %ymm15,0(%rsp) 1296 vmovdqa %ymm9,32(%rsp) 1297 vmovdqa 64(%rsp),%ymm15 1298 vmovdqa 96(%rsp),%ymm9 1299 1300 vpaddd 384-512(%rax),%ymm12,%ymm12 1301 vpaddd 416-512(%rax),%ymm13,%ymm13 1302 vpaddd 448-512(%rax),%ymm15,%ymm15 1303 
vpaddd 480-512(%rax),%ymm9,%ymm9 1304 1305 vpunpckldq %ymm13,%ymm12,%ymm2 1306 vpunpckldq %ymm9,%ymm15,%ymm8 1307 vpunpckhdq %ymm13,%ymm12,%ymm12 1308 vpunpckhdq %ymm9,%ymm15,%ymm15 1309 vpunpcklqdq %ymm8,%ymm2,%ymm13 1310 vpunpckhqdq %ymm8,%ymm2,%ymm2 1311 vpunpcklqdq %ymm15,%ymm12,%ymm9 1312 vpunpckhqdq %ymm15,%ymm12,%ymm12 1313 vpaddd 512-512(%rax),%ymm4,%ymm4 1314 vpaddd 544-512(%rax),%ymm5,%ymm5 1315 vpaddd 576-512(%rax),%ymm6,%ymm6 1316 vpaddd 608-512(%rax),%ymm7,%ymm7 1317 1318 vpunpckldq %ymm5,%ymm4,%ymm15 1319 vpunpckldq %ymm7,%ymm6,%ymm8 1320 vpunpckhdq %ymm5,%ymm4,%ymm4 1321 vpunpckhdq %ymm7,%ymm6,%ymm6 1322 vpunpcklqdq %ymm8,%ymm15,%ymm5 1323 vpunpckhqdq %ymm8,%ymm15,%ymm15 1324 vpunpcklqdq %ymm6,%ymm4,%ymm7 1325 vpunpckhqdq %ymm6,%ymm4,%ymm4 1326 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1327 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1328 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1329 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1330 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1331 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1332 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1333 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1334 vmovdqa 0(%rsp),%ymm6 1335 vmovdqa 32(%rsp),%ymm12 1336 1337 cmpq $512,%rdx 1338 jb L$tail8x 1339 1340 vpxor 0(%rsi),%ymm6,%ymm6 1341 vpxor 32(%rsi),%ymm8,%ymm8 1342 vpxor 64(%rsi),%ymm1,%ymm1 1343 vpxor 96(%rsi),%ymm5,%ymm5 1344 leaq 128(%rsi),%rsi 1345 vmovdqu %ymm6,0(%rdi) 1346 vmovdqu %ymm8,32(%rdi) 1347 vmovdqu %ymm1,64(%rdi) 1348 vmovdqu %ymm5,96(%rdi) 1349 leaq 128(%rdi),%rdi 1350 1351 vpxor 0(%rsi),%ymm12,%ymm12 1352 vpxor 32(%rsi),%ymm13,%ymm13 1353 vpxor 64(%rsi),%ymm10,%ymm10 1354 vpxor 96(%rsi),%ymm15,%ymm15 1355 leaq 128(%rsi),%rsi 1356 vmovdqu %ymm12,0(%rdi) 1357 vmovdqu %ymm13,32(%rdi) 1358 vmovdqu %ymm10,64(%rdi) 1359 vmovdqu %ymm15,96(%rdi) 1360 leaq 128(%rdi),%rdi 1361 1362 vpxor 0(%rsi),%ymm14,%ymm14 1363 vpxor 32(%rsi),%ymm2,%ymm2 1364 vpxor 64(%rsi),%ymm3,%ymm3 1365 vpxor 96(%rsi),%ymm7,%ymm7 1366 leaq 128(%rsi),%rsi 1367 vmovdqu %ymm14,0(%rdi) 1368 vmovdqu 
%ymm2,32(%rdi) 1369 vmovdqu %ymm3,64(%rdi) 1370 vmovdqu %ymm7,96(%rdi) 1371 leaq 128(%rdi),%rdi 1372 1373 vpxor 0(%rsi),%ymm11,%ymm11 1374 vpxor 32(%rsi),%ymm9,%ymm9 1375 vpxor 64(%rsi),%ymm0,%ymm0 1376 vpxor 96(%rsi),%ymm4,%ymm4 1377 leaq 128(%rsi),%rsi 1378 vmovdqu %ymm11,0(%rdi) 1379 vmovdqu %ymm9,32(%rdi) 1380 vmovdqu %ymm0,64(%rdi) 1381 vmovdqu %ymm4,96(%rdi) 1382 leaq 128(%rdi),%rdi 1383 1384 subq $512,%rdx 1385 jnz L$oop_outer8x 1386 1387 jmp L$done8x 1388 1389L$tail8x: 1390 cmpq $448,%rdx 1391 jae L$448_or_more8x 1392 cmpq $384,%rdx 1393 jae L$384_or_more8x 1394 cmpq $320,%rdx 1395 jae L$320_or_more8x 1396 cmpq $256,%rdx 1397 jae L$256_or_more8x 1398 cmpq $192,%rdx 1399 jae L$192_or_more8x 1400 cmpq $128,%rdx 1401 jae L$128_or_more8x 1402 cmpq $64,%rdx 1403 jae L$64_or_more8x 1404 1405 xorq %r10,%r10 1406 vmovdqa %ymm6,0(%rsp) 1407 vmovdqa %ymm8,32(%rsp) 1408 jmp L$oop_tail8x 1409 1410.p2align 5 1411L$64_or_more8x: 1412 vpxor 0(%rsi),%ymm6,%ymm6 1413 vpxor 32(%rsi),%ymm8,%ymm8 1414 vmovdqu %ymm6,0(%rdi) 1415 vmovdqu %ymm8,32(%rdi) 1416 je L$done8x 1417 1418 leaq 64(%rsi),%rsi 1419 xorq %r10,%r10 1420 vmovdqa %ymm1,0(%rsp) 1421 leaq 64(%rdi),%rdi 1422 subq $64,%rdx 1423 vmovdqa %ymm5,32(%rsp) 1424 jmp L$oop_tail8x 1425 1426.p2align 5 1427L$128_or_more8x: 1428 vpxor 0(%rsi),%ymm6,%ymm6 1429 vpxor 32(%rsi),%ymm8,%ymm8 1430 vpxor 64(%rsi),%ymm1,%ymm1 1431 vpxor 96(%rsi),%ymm5,%ymm5 1432 vmovdqu %ymm6,0(%rdi) 1433 vmovdqu %ymm8,32(%rdi) 1434 vmovdqu %ymm1,64(%rdi) 1435 vmovdqu %ymm5,96(%rdi) 1436 je L$done8x 1437 1438 leaq 128(%rsi),%rsi 1439 xorq %r10,%r10 1440 vmovdqa %ymm12,0(%rsp) 1441 leaq 128(%rdi),%rdi 1442 subq $128,%rdx 1443 vmovdqa %ymm13,32(%rsp) 1444 jmp L$oop_tail8x 1445 1446.p2align 5 1447L$192_or_more8x: 1448 vpxor 0(%rsi),%ymm6,%ymm6 1449 vpxor 32(%rsi),%ymm8,%ymm8 1450 vpxor 64(%rsi),%ymm1,%ymm1 1451 vpxor 96(%rsi),%ymm5,%ymm5 1452 vpxor 128(%rsi),%ymm12,%ymm12 1453 vpxor 160(%rsi),%ymm13,%ymm13 1454 vmovdqu %ymm6,0(%rdi) 1455 vmovdqu 
%ymm8,32(%rdi) 1456 vmovdqu %ymm1,64(%rdi) 1457 vmovdqu %ymm5,96(%rdi) 1458 vmovdqu %ymm12,128(%rdi) 1459 vmovdqu %ymm13,160(%rdi) 1460 je L$done8x 1461 1462 leaq 192(%rsi),%rsi 1463 xorq %r10,%r10 1464 vmovdqa %ymm10,0(%rsp) 1465 leaq 192(%rdi),%rdi 1466 subq $192,%rdx 1467 vmovdqa %ymm15,32(%rsp) 1468 jmp L$oop_tail8x 1469 1470.p2align 5 1471L$256_or_more8x: 1472 vpxor 0(%rsi),%ymm6,%ymm6 1473 vpxor 32(%rsi),%ymm8,%ymm8 1474 vpxor 64(%rsi),%ymm1,%ymm1 1475 vpxor 96(%rsi),%ymm5,%ymm5 1476 vpxor 128(%rsi),%ymm12,%ymm12 1477 vpxor 160(%rsi),%ymm13,%ymm13 1478 vpxor 192(%rsi),%ymm10,%ymm10 1479 vpxor 224(%rsi),%ymm15,%ymm15 1480 vmovdqu %ymm6,0(%rdi) 1481 vmovdqu %ymm8,32(%rdi) 1482 vmovdqu %ymm1,64(%rdi) 1483 vmovdqu %ymm5,96(%rdi) 1484 vmovdqu %ymm12,128(%rdi) 1485 vmovdqu %ymm13,160(%rdi) 1486 vmovdqu %ymm10,192(%rdi) 1487 vmovdqu %ymm15,224(%rdi) 1488 je L$done8x 1489 1490 leaq 256(%rsi),%rsi 1491 xorq %r10,%r10 1492 vmovdqa %ymm14,0(%rsp) 1493 leaq 256(%rdi),%rdi 1494 subq $256,%rdx 1495 vmovdqa %ymm2,32(%rsp) 1496 jmp L$oop_tail8x 1497 1498.p2align 5 1499L$320_or_more8x: 1500 vpxor 0(%rsi),%ymm6,%ymm6 1501 vpxor 32(%rsi),%ymm8,%ymm8 1502 vpxor 64(%rsi),%ymm1,%ymm1 1503 vpxor 96(%rsi),%ymm5,%ymm5 1504 vpxor 128(%rsi),%ymm12,%ymm12 1505 vpxor 160(%rsi),%ymm13,%ymm13 1506 vpxor 192(%rsi),%ymm10,%ymm10 1507 vpxor 224(%rsi),%ymm15,%ymm15 1508 vpxor 256(%rsi),%ymm14,%ymm14 1509 vpxor 288(%rsi),%ymm2,%ymm2 1510 vmovdqu %ymm6,0(%rdi) 1511 vmovdqu %ymm8,32(%rdi) 1512 vmovdqu %ymm1,64(%rdi) 1513 vmovdqu %ymm5,96(%rdi) 1514 vmovdqu %ymm12,128(%rdi) 1515 vmovdqu %ymm13,160(%rdi) 1516 vmovdqu %ymm10,192(%rdi) 1517 vmovdqu %ymm15,224(%rdi) 1518 vmovdqu %ymm14,256(%rdi) 1519 vmovdqu %ymm2,288(%rdi) 1520 je L$done8x 1521 1522 leaq 320(%rsi),%rsi 1523 xorq %r10,%r10 1524 vmovdqa %ymm3,0(%rsp) 1525 leaq 320(%rdi),%rdi 1526 subq $320,%rdx 1527 vmovdqa %ymm7,32(%rsp) 1528 jmp L$oop_tail8x 1529 1530.p2align 5 1531L$384_or_more8x: 1532 vpxor 0(%rsi),%ymm6,%ymm6 1533 vpxor 
32(%rsi),%ymm8,%ymm8 1534 vpxor 64(%rsi),%ymm1,%ymm1 1535 vpxor 96(%rsi),%ymm5,%ymm5 1536 vpxor 128(%rsi),%ymm12,%ymm12 1537 vpxor 160(%rsi),%ymm13,%ymm13 1538 vpxor 192(%rsi),%ymm10,%ymm10 1539 vpxor 224(%rsi),%ymm15,%ymm15 1540 vpxor 256(%rsi),%ymm14,%ymm14 1541 vpxor 288(%rsi),%ymm2,%ymm2 1542 vpxor 320(%rsi),%ymm3,%ymm3 1543 vpxor 352(%rsi),%ymm7,%ymm7 1544 vmovdqu %ymm6,0(%rdi) 1545 vmovdqu %ymm8,32(%rdi) 1546 vmovdqu %ymm1,64(%rdi) 1547 vmovdqu %ymm5,96(%rdi) 1548 vmovdqu %ymm12,128(%rdi) 1549 vmovdqu %ymm13,160(%rdi) 1550 vmovdqu %ymm10,192(%rdi) 1551 vmovdqu %ymm15,224(%rdi) 1552 vmovdqu %ymm14,256(%rdi) 1553 vmovdqu %ymm2,288(%rdi) 1554 vmovdqu %ymm3,320(%rdi) 1555 vmovdqu %ymm7,352(%rdi) 1556 je L$done8x 1557 1558 leaq 384(%rsi),%rsi 1559 xorq %r10,%r10 1560 vmovdqa %ymm11,0(%rsp) 1561 leaq 384(%rdi),%rdi 1562 subq $384,%rdx 1563 vmovdqa %ymm9,32(%rsp) 1564 jmp L$oop_tail8x 1565 1566.p2align 5 1567L$448_or_more8x: 1568 vpxor 0(%rsi),%ymm6,%ymm6 1569 vpxor 32(%rsi),%ymm8,%ymm8 1570 vpxor 64(%rsi),%ymm1,%ymm1 1571 vpxor 96(%rsi),%ymm5,%ymm5 1572 vpxor 128(%rsi),%ymm12,%ymm12 1573 vpxor 160(%rsi),%ymm13,%ymm13 1574 vpxor 192(%rsi),%ymm10,%ymm10 1575 vpxor 224(%rsi),%ymm15,%ymm15 1576 vpxor 256(%rsi),%ymm14,%ymm14 1577 vpxor 288(%rsi),%ymm2,%ymm2 1578 vpxor 320(%rsi),%ymm3,%ymm3 1579 vpxor 352(%rsi),%ymm7,%ymm7 1580 vpxor 384(%rsi),%ymm11,%ymm11 1581 vpxor 416(%rsi),%ymm9,%ymm9 1582 vmovdqu %ymm6,0(%rdi) 1583 vmovdqu %ymm8,32(%rdi) 1584 vmovdqu %ymm1,64(%rdi) 1585 vmovdqu %ymm5,96(%rdi) 1586 vmovdqu %ymm12,128(%rdi) 1587 vmovdqu %ymm13,160(%rdi) 1588 vmovdqu %ymm10,192(%rdi) 1589 vmovdqu %ymm15,224(%rdi) 1590 vmovdqu %ymm14,256(%rdi) 1591 vmovdqu %ymm2,288(%rdi) 1592 vmovdqu %ymm3,320(%rdi) 1593 vmovdqu %ymm7,352(%rdi) 1594 vmovdqu %ymm11,384(%rdi) 1595 vmovdqu %ymm9,416(%rdi) 1596 je L$done8x 1597 1598 leaq 448(%rsi),%rsi 1599 xorq %r10,%r10 1600 vmovdqa %ymm0,0(%rsp) 1601 leaq 448(%rdi),%rdi 1602 subq $448,%rdx 1603 vmovdqa %ymm4,32(%rsp) 1604 
1605L$oop_tail8x: 1606 movzbl (%rsi,%r10,1),%eax 1607 movzbl (%rsp,%r10,1),%ecx 1608 leaq 1(%r10),%r10 1609 xorl %ecx,%eax 1610 movb %al,-1(%rdi,%r10,1) 1611 decq %rdx 1612 jnz L$oop_tail8x 1613 1614L$done8x: 1615 vzeroall 1616 leaq (%r9),%rsp 1617 1618L$8x_epilogue: 1619 .byte 0xf3,0xc3 1620 1621 1622#endif 1623