# ChaCha20 for x86_64 — CRYPTOGAMS by <appro@openssl.org>.
# Machine-generated from perlasm (chacha-x86_64.pl), Mach-O flavour
# (AT&T syntax, L$ local labels, underscore-prefixed global symbol).
#
# NOTE(review): this chunk arrived with spurious sequential line numbers
# interleaved into the instruction stream by a text-extraction step; they
# have been stripped to restore assemblable code.  Every instruction,
# operand, label, directive and .byte sequence is preserved verbatim.
#
# Entry point:
#   void ChaCha20_ctr32(unsigned char *out,      // %rdi
#                       const unsigned char *inp,// %rsi
#                       size_t len,              // %rdx
#                       const unsigned int key[8],   // %rcx
#                       const unsigned int counter[4])// %r8
# Dispatches on _OPENSSL_ia32cap_P feature bits to SSSE3 / 4x-SSSE3 /
# XOP code paths; falls back to the integer implementation below.
.text

# Constant pool (read-only tables used by all code paths).
.p2align 6
L$zero:
.long	0,0,0,0
L$one:
.long	1,0,0,0
L$inc:
.long	0,1,2,3
L$four:
.long	4,4,4,4
L$incy:
.long	0,2,4,6,1,3,5,7
L$eight:
.long	8,8,8,8,8,8,8,8
L$rot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
L$rot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
L$twoy:
.long	2,0,0,0, 2,0,0,0
.p2align 6
L$zeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
L$sigma:
# "expand 32-byte k" — the ChaCha20 sigma constant, NUL-terminated.
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
# CRYPTOGAMS banner string.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.globl	_ChaCha20_ctr32

#-----------------------------------------------------------------------
# Scalar (integer-register) implementation.  Processes 64 bytes per
# outer iteration; the 16-word state lives in eax/ebx/ecx/edx,
# r8d-r15d, esi/edi plus four stack slots that are swapped in and out.
#-----------------------------------------------------------------------
.p2align 6
_ChaCha20_ctr32:

	cmpq	$0,%rdx			# len == 0 -> nothing to do
	je	L$no_data
	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d		# SSSE3 available?
	jnz	L$ChaCha20_ssse3

	pushq	%rbx			# save callee-saved registers
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$64+24,%rsp		# 64-byte state block + spill area
L$ctr32_body:

	# Load key (2x16 bytes) and counter/nonce (16 bytes).
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$one(%rip),%xmm4	# counter increment

	# Stash key/counter rows of the state on the stack.
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp		# rbp = remaining length
	jmp	L$oop_outer

.p2align 5
L$oop_outer:
	# Reload the 16-word state: sigma constants in eax..edx, key in
	# r8d..r11d (+ stack), counter word in r12d, nonce in r13d..r15d.
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d		# current block counter
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)		# spill len
	movl	$10,%ebp		# 10 double-rounds
	movq	%rsi,64+8(%rsp)		# spill inp
.byte	102,72,15,126,214		# movq %xmm2,%rsi (state words 8-9)
	movq	%rdi,64+16(%rsp)	# spill out
	movq	%rsi,%rdi
	shrq	$32,%rdi		# esi/edi = state words 8 and 9
	jmp	L$oop

# One double-round per iteration: four column quarter-rounds followed by
# four diagonal quarter-rounds, with state words 10/11 swapped through
# 32/36(%rsp) because we are two registers short.
.p2align 5
L$oop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)		# swap out words 8/9,
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi		# swap in words 10/11
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	# Diagonal quarter-rounds.
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)		# swap words 10/11 back out,
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi		# words 8/9 back in
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	L$oop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp		# restore len
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi		# restore inp
	paddd	%xmm4,%xmm3		# bump block counter
	movq	64+16(%rsp),%rdi	# restore out

	# Feed-forward: add the input state back in.
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1		# words 8-11 via xmm

	cmpq	$64,%rbp		# partial final block?
	jb	L$tail

	# XOR a full 64-byte block of input with the keystream.
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)		# store updated counter word

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	L$oop_outer

	jmp	L$done

# Tail: write the keystream block to the stack and XOR byte-by-byte.
.p2align 4
L$tail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx		# rbx = byte index
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

L$oop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	L$oop_tail

L$done:
	leaq	64+24+48(%rsp),%rsi	# unwind to saved registers
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$no_data:
	.byte	0xf3,0xc3		# repz ret

#-----------------------------------------------------------------------
# SSSE3 one-block (64-byte) implementation; also the dispatcher for the
# wider SIMD paths (XOP / 128-byte / 4x).
#-----------------------------------------------------------------------
.p2align 5
ChaCha20_ssse3:

L$ChaCha20_ssse3:
	movq	%rsp,%r9		# frame pointer for epilogue
	testl	$2048,%r10d		# XOP available?
	jnz	L$ChaCha20_4xop
	cmpq	$128,%rdx
	je	L$ChaCha20_128
	ja	L$ChaCha20_4x

L$do_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	L$sigma(%rip),%xmm0	# xmm0..xmm3 = state rows 0..3
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$rot16(%rip),%xmm6	# pshufb masks for rot-16/rot-8
	movdqa	L$rot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)		# keep input state for feed-forward
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8			# 10 double-rounds
	jmp	L$oop_ssse3

.p2align 5
L$oop_outer_ssse3:
	movdqa	L$one(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3		# counter += 1
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	L$oop_ssse3

.p2align 5
L$oop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3 (rot 16)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1		# rot 12
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3 (rot 8)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1		# rot 7
	pshufd	$78,%xmm2,%xmm2		# re-diagonalize
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3 (rot 16)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3 (rot 8)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2		# undo diagonalization
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	L$oop_ssse3
	paddd	0(%rsp),%xmm0		# feed-forward
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	L$tail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	L$oop_outer_ssse3

	jmp	L$done_ssse3

.p2align 4
L$tail_ssse3:
	movdqa	%xmm0,0(%rsp)		# dump keystream, XOR bytewise
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

L$oop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	L$oop_tail_ssse3

L$done_ssse3:
	leaq	(%r9),%rsp
L$ssse3_epilogue:
	.byte	0xf3,0xc3		# repz ret

#-----------------------------------------------------------------------
# Fixed 128-byte path: two interleaved 64-byte blocks in registers
# (xmm8/9/2/3 = block 0, xmm10/11/0/1 = block 1, counter+1 in xmm1).
#-----------------------------------------------------------------------
.p2align 5
ChaCha20_128:

L$ChaCha20_128:
	movq	%rsp,%r9
	subq	$64+8,%rsp
	movdqa	L$sigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$one(%rip),%xmm1
	movdqa	L$rot16(%rip),%xmm6
	movdqa	L$rot24(%rip),%xmm7

	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1		# second block: counter + 1
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	L$oop_128

.p2align 5
L$oop_128:
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3 (rot 16)
.byte	102,15,56,0,206			# pshufb %xmm6,%xmm1
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3 (rot 8)
.byte	102,15,56,0,207			# pshufb %xmm7,%xmm1
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2		# diagonalize both blocks
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3 (rot 16)
.byte	102,15,56,0,206			# pshufb %xmm6,%xmm1
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3 (rot 8)
.byte	102,15,56,0,207			# pshufb %xmm7,%xmm1
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2		# undo diagonalization
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	L$oop_128
	paddd	0(%rsp),%xmm8		# feed-forward block 0
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	L$one(%rip),%xmm1	# block 1 counter offset
	paddd	0(%rsp),%xmm10		# feed-forward block 1
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	movdqu	0(%rsi),%xmm4		# XOR 128 bytes of input
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp
L$128_epilogue:
	.byte	0xf3,0xc3		# repz ret

#-----------------------------------------------------------------------
# 4x-interleaved SSSE3 path: four blocks (256 bytes) per outer round,
# state held column-wise (one register = same word of 4 blocks).
# Pre-splatted state lives at 64..112(%rsp) and 128..304-256(%rcx).
#-----------------------------------------------------------------------
.p2align 5
ChaCha20_4x:

L$ChaCha20_4x:
	movq	%rsp,%r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10		# AVX2 present -> 8x path elsewhere
	jnz	L$ChaCha20_8x
	cmpq	$192,%rdx
	ja	L$proceed4x

	# For <=192 bytes on certain CPU models, plain SSSE3 is faster.
	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	L$do_sse3_after_all

L$proceed4x:
	subq	$0x140+8,%rsp
	movdqa	L$sigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx		# upper half of the state area
	leaq	L$rot16(%rip),%r10
	leaq	L$rot24(%rip),%r11

	# Splat each state word across a register of its own.
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	L$inc(%rip),%xmm0	# counters n,n+1,n+2,n+3
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	L$oop_enter4x

.p2align 5
L$oop_outer4x:
	movdqa	64(%rsp),%xmm8		# reload splatted state
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	L$four(%rip),%xmm0	# counters += 4

L$oop_enter4x:
	movdqa	%xmm6,32(%rsp)		# spill two lanes; only 16 regs
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7		# rot16 pshufb mask
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)	# remember current counters
	jmp	L$oop4x

.p2align 5
L$oop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199			# pshufb %xmm7,%xmm0 (rot 16)
.byte	102,15,56,0,207			# pshufb %xmm7,%xmm1
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6		# rot24 pshufb mask
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198			# pshufb %xmm6,%xmm0 (rot 8)
.byte	102,15,56,0,206			# pshufb %xmm6,%xmm1
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)		# rotate spilled lanes
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215			# pshufb %xmm7,%xmm2 (rot 16)
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214			# pshufb %xmm6,%xmm2 (rot 8)
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	# Diagonal rounds.
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3 (rot 16)
.byte	102,15,56,0,199			# pshufb %xmm7,%xmm0
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3 (rot 8)
.byte	102,15,56,0,198			# pshufb %xmm6,%xmm0
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207			# pshufb %xmm7,%xmm1 (rot 16)
.byte	102,15,56,0,215			# pshufb %xmm7,%xmm2
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206			# pshufb %xmm6,%xmm1 (rot 8)
.byte	102,15,56,0,214			# pshufb %xmm6,%xmm2
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	L$oop4x

	# Feed-forward, then transpose column-wise data back to blocks.
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	L$tail4x

	# XOR a full 256-byte chunk, interleaving loads and stores.
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	L$oop_outer4x

	jmp	L$done4x

# Tail handling for <256 bytes: emit as many whole 64-byte blocks as
# fit, then XOR the remainder bytewise from a stacked keystream block.
L$tail4x:
	cmpq	$192,%rdx
	jae	L$192_or_more4x
	cmpq	$128,%rdx
	jae	L$128_or_more4x
	cmpq	$64,%rdx
	jae	L$64_or_more4x

	xorq	%r10,%r10
	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	L$oop_tail4x

.p2align 5
L$64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	L$oop_tail4x

.p2align 5
L$128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	L$done4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	L$oop_tail4x

.p2align 5
L$192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

L$oop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	L$oop_tail4x

L$done4x:
	leaq	(%r9),%rsp
L$4x_epilogue:
	.byte	0xf3,0xc3		# repz ret

#-----------------------------------------------------------------------
# 4x XOP path (AMD Bulldozer-class): same block interleaving as above
# but rotations use the XOP vprotd instruction (.byte 143,232,120,194
# sequences below).
# NOTE(review): this chunk of the file ends mid-function; the remainder
# of ChaCha20_4xop (tail of L$192_or_more4xop, L$oop_tail4xop,
# L$done4xop and the epilogue) continues past the end of this chunk.
#-----------------------------------------------------------------------
.p2align 5
ChaCha20_4xop:

L$ChaCha20_4xop:
	movq	%rsp,%r9
	subq	$0x140+8,%rsp
	vzeroupper

	vmovdqa	L$sigma(%rip),%xmm11
	vmovdqu	(%rcx),%xmm3
	vmovdqu	16(%rcx),%xmm15
	vmovdqu	(%r8),%xmm7
	leaq	256(%rsp),%rcx

	# Splat the state words, same layout as the SSSE3 4x path.
	vpshufd	$0x00,%xmm11,%xmm8
	vpshufd	$0x55,%xmm11,%xmm9
	vmovdqa	%xmm8,64(%rsp)
	vpshufd	$0xaa,%xmm11,%xmm10
	vmovdqa	%xmm9,80(%rsp)
	vpshufd	$0xff,%xmm11,%xmm11
	vmovdqa	%xmm10,96(%rsp)
	vmovdqa	%xmm11,112(%rsp)

	vpshufd	$0x00,%xmm3,%xmm0
	vpshufd	$0x55,%xmm3,%xmm1
	vmovdqa	%xmm0,128-256(%rcx)
	vpshufd	$0xaa,%xmm3,%xmm2
	vmovdqa	%xmm1,144-256(%rcx)
	vpshufd	$0xff,%xmm3,%xmm3
	vmovdqa	%xmm2,160-256(%rcx)
	vmovdqa	%xmm3,176-256(%rcx)

	vpshufd	$0x00,%xmm15,%xmm12
	vpshufd	$0x55,%xmm15,%xmm13
	vmovdqa	%xmm12,192-256(%rcx)
	vpshufd	$0xaa,%xmm15,%xmm14
	vmovdqa	%xmm13,208-256(%rcx)
	vpshufd	$0xff,%xmm15,%xmm15
	vmovdqa	%xmm14,224-256(%rcx)
	vmovdqa	%xmm15,240-256(%rcx)

	vpshufd	$0x00,%xmm7,%xmm4
	vpshufd	$0x55,%xmm7,%xmm5
	vpaddd	L$inc(%rip),%xmm4,%xmm4	# counters n..n+3
	vpshufd	$0xaa,%xmm7,%xmm6
	vmovdqa	%xmm5,272-256(%rcx)
	vpshufd	$0xff,%xmm7,%xmm7
	vmovdqa	%xmm6,288-256(%rcx)
	vmovdqa	%xmm7,304-256(%rcx)

	jmp	L$oop_enter4xop

.p2align 5
L$oop_outer4xop:
	vmovdqa	64(%rsp),%xmm8
	vmovdqa	80(%rsp),%xmm9
	vmovdqa	96(%rsp),%xmm10
	vmovdqa	112(%rsp),%xmm11
	vmovdqa	128-256(%rcx),%xmm0
	vmovdqa	144-256(%rcx),%xmm1
	vmovdqa	160-256(%rcx),%xmm2
	vmovdqa	176-256(%rcx),%xmm3
	vmovdqa	192-256(%rcx),%xmm12
	vmovdqa	208-256(%rcx),%xmm13
	vmovdqa	224-256(%rcx),%xmm14
	vmovdqa	240-256(%rcx),%xmm15
	vmovdqa	256-256(%rcx),%xmm4
	vmovdqa	272-256(%rcx),%xmm5
	vmovdqa	288-256(%rcx),%xmm6
	vmovdqa	304-256(%rcx),%xmm7
	vpaddd	L$four(%rip),%xmm4,%xmm4	# counters += 4

L$oop_enter4xop:
	movl	$10,%eax
	vmovdqa	%xmm4,256-256(%rcx)
	jmp	L$oop4xop

.p2align 5
L$oop4xop:
	vpaddd	%xmm0,%xmm8,%xmm8
	vpaddd	%xmm1,%xmm9,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,16		# vprotd $16,%xmm4,%xmm4
.byte	143,232,120,194,237,16		# vprotd $16,%xmm5,%xmm5
.byte	143,232,120,194,246,16		# vprotd $16,%xmm6,%xmm6
.byte	143,232,120,194,255,16		# vprotd $16,%xmm7,%xmm7
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,12		# vprotd $12,%xmm0,%xmm0
.byte	143,232,120,194,201,12		# vprotd $12,%xmm1,%xmm1
.byte	143,232,120,194,210,12		# vprotd $12,%xmm2,%xmm2
.byte	143,232,120,194,219,12		# vprotd $12,%xmm3,%xmm3
	vpaddd	%xmm8,%xmm0,%xmm8
	vpaddd	%xmm9,%xmm1,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,8		# vprotd $8,%xmm4,%xmm4
.byte	143,232,120,194,237,8		# vprotd $8,%xmm5,%xmm5
.byte	143,232,120,194,246,8		# vprotd $8,%xmm6,%xmm6
.byte	143,232,120,194,255,8		# vprotd $8,%xmm7,%xmm7
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,7		# vprotd $7,%xmm0,%xmm0
.byte	143,232,120,194,201,7		# vprotd $7,%xmm1,%xmm1
.byte	143,232,120,194,210,7		# vprotd $7,%xmm2,%xmm2
.byte	143,232,120,194,219,7		# vprotd $7,%xmm3,%xmm3
	# Diagonal rounds.
	vpaddd	%xmm1,%xmm8,%xmm8
	vpaddd	%xmm2,%xmm9,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,16		# vprotd $16,%xmm7,%xmm7
.byte	143,232,120,194,228,16		# vprotd $16,%xmm4,%xmm4
.byte	143,232,120,194,237,16		# vprotd $16,%xmm5,%xmm5
.byte	143,232,120,194,246,16		# vprotd $16,%xmm6,%xmm6
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,12		# vprotd $12,%xmm1,%xmm1
.byte	143,232,120,194,210,12		# vprotd $12,%xmm2,%xmm2
.byte	143,232,120,194,219,12		# vprotd $12,%xmm3,%xmm3
.byte	143,232,120,194,192,12		# vprotd $12,%xmm0,%xmm0
	vpaddd	%xmm8,%xmm1,%xmm8
	vpaddd	%xmm9,%xmm2,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,8		# vprotd $8,%xmm7,%xmm7
.byte	143,232,120,194,228,8		# vprotd $8,%xmm4,%xmm4
.byte	143,232,120,194,237,8		# vprotd $8,%xmm5,%xmm5
.byte	143,232,120,194,246,8		# vprotd $8,%xmm6,%xmm6
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,7		# vprotd $7,%xmm1,%xmm1
.byte	143,232,120,194,210,7		# vprotd $7,%xmm2,%xmm2
.byte	143,232,120,194,219,7		# vprotd $7,%xmm3,%xmm3
.byte	143,232,120,194,192,7		# vprotd $7,%xmm0,%xmm0
	decl	%eax
	jnz	L$oop4xop

	# Feed-forward and transpose back to byte order.
	vpaddd	64(%rsp),%xmm8,%xmm8
	vpaddd	80(%rsp),%xmm9,%xmm9
	vpaddd	96(%rsp),%xmm10,%xmm10
	vpaddd	112(%rsp),%xmm11,%xmm11

	vmovdqa	%xmm14,32(%rsp)
	vmovdqa	%xmm15,48(%rsp)

	vpunpckldq	%xmm9,%xmm8,%xmm14
	vpunpckldq	%xmm11,%xmm10,%xmm15
	vpunpckhdq	%xmm9,%xmm8,%xmm8
	vpunpckhdq	%xmm11,%xmm10,%xmm10
	vpunpcklqdq	%xmm15,%xmm14,%xmm9
	vpunpckhqdq	%xmm15,%xmm14,%xmm14
	vpunpcklqdq	%xmm10,%xmm8,%xmm11
	vpunpckhqdq	%xmm10,%xmm8,%xmm8
	vpaddd	128-256(%rcx),%xmm0,%xmm0
	vpaddd	144-256(%rcx),%xmm1,%xmm1
	vpaddd	160-256(%rcx),%xmm2,%xmm2
	vpaddd	176-256(%rcx),%xmm3,%xmm3

	vmovdqa	%xmm9,0(%rsp)
	vmovdqa	%xmm14,16(%rsp)
	vmovdqa	32(%rsp),%xmm9
	vmovdqa	48(%rsp),%xmm14

	vpunpckldq	%xmm1,%xmm0,%xmm10
	vpunpckldq	%xmm3,%xmm2,%xmm15
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm15,%xmm10,%xmm1
	vpunpckhqdq	%xmm15,%xmm10,%xmm10
	vpunpcklqdq	%xmm2,%xmm0,%xmm3
	vpunpckhqdq	%xmm2,%xmm0,%xmm0
	vpaddd	192-256(%rcx),%xmm12,%xmm12
	vpaddd	208-256(%rcx),%xmm13,%xmm13
	vpaddd	224-256(%rcx),%xmm9,%xmm9
	vpaddd	240-256(%rcx),%xmm14,%xmm14

	vpunpckldq	%xmm13,%xmm12,%xmm2
	vpunpckldq	%xmm14,%xmm9,%xmm15
	vpunpckhdq	%xmm13,%xmm12,%xmm12
	vpunpckhdq	%xmm14,%xmm9,%xmm9
	vpunpcklqdq	%xmm15,%xmm2,%xmm13
	vpunpckhqdq	%xmm15,%xmm2,%xmm2
	vpunpcklqdq	%xmm9,%xmm12,%xmm14
	vpunpckhqdq	%xmm9,%xmm12,%xmm12
	vpaddd	256-256(%rcx),%xmm4,%xmm4
	vpaddd	272-256(%rcx),%xmm5,%xmm5
	vpaddd	288-256(%rcx),%xmm6,%xmm6
	vpaddd	304-256(%rcx),%xmm7,%xmm7

	vpunpckldq	%xmm5,%xmm4,%xmm9
	vpunpckldq	%xmm7,%xmm6,%xmm15
	vpunpckhdq	%xmm5,%xmm4,%xmm4
	vpunpckhdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm15,%xmm9,%xmm5
	vpunpckhqdq	%xmm15,%xmm9,%xmm9
	vpunpcklqdq	%xmm6,%xmm4,%xmm7
	vpunpckhqdq	%xmm6,%xmm4,%xmm4
	vmovdqa	0(%rsp),%xmm6
	vmovdqa	16(%rsp),%xmm15

	cmpq	$256,%rdx
	jb	L$tail4xop

	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7
	vpxor	64(%rsi),%xmm8,%xmm8
	vpxor	80(%rsi),%xmm0,%xmm0
	vpxor	96(%rsi),%xmm12,%xmm12
	vpxor	112(%rsi),%xmm4,%xmm4
	leaq	128(%rsi),%rsi

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	vmovdqu	%xmm8,64(%rdi)
	vmovdqu	%xmm0,80(%rdi)
	vmovdqu	%xmm12,96(%rdi)
	vmovdqu	%xmm4,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	L$oop_outer4xop

	jmp	L$done4xop

.p2align 5
L$tail4xop:
	cmpq	$192,%rdx
	jae	L$192_or_more4xop
	cmpq	$128,%rdx
	jae	L$128_or_more4xop
	cmpq	$64,%rdx
	jae	L$64_or_more4xop

	xorq	%r10,%r10
	vmovdqa	%xmm6,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm13,32(%rsp)
	vmovdqa	%xmm5,48(%rsp)
	jmp	L$oop_tail4xop

.p2align 5
L$64_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	je	L$done4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm15,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm10,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm2,32(%rsp)
	subq	$64,%rdx
	vmovdqa	%xmm9,48(%rsp)
	jmp	L$oop_tail4xop

.p2align 5
L$128_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	je	L$done4xop

	leaq	128(%rsi),%rsi
	vmovdqa	%xmm11,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm3,16(%rsp)
	leaq	128(%rdi),%rdi
	vmovdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	vmovdqa	%xmm7,48(%rsp)
	jmp	L$oop_tail4xop

.p2align 5
L$192_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	je	L$done4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm8,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm0,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm12,32(%rsp)
# NOTE(review): source chunk ends here, mid L$192_or_more4xop; the
# remainder of ChaCha20_4xop continues in the next chunk of the file.
	subq	$192,%rdx		# ...end of L$192_or_more4xop above
	vmovdqa	%xmm4,48(%rsp)

# Byte-at-a-time tail: XOR the final (<64) input bytes against the
# keystream block staged at (%rsp).  %r10 = byte index, %rdx = count.
L$oop_tail4xop:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	L$oop_tail4xop

L$done4xop:
	vzeroupper			# avoid AVX->SSE transition penalty
	leaq	(%r9),%rsp		# restore stack pointer saved at entry

L$4xop_epilogue:
	.byte	0xf3,0xc3		# repz ret

#----------------------------------------------------------------------
# ChaCha20_8x: AVX2 path, 8 blocks (512 bytes) of keystream per outer
# iteration.  Same contract as _ChaCha20_ctr32:
#   %rdi = out, %rsi = inp, %rdx = len, %rcx = key, %r8 = counter+nonce.
# Layout: each of the 16 ChaCha state words is splatted across one %ymm
# register (8 lanes = 8 blocks); the splats are parked in a scratch
# area so they can be re-added after the rounds.
#----------------------------------------------------------------------
.p2align 5
ChaCha20_8x:

L$ChaCha20_8x:
	movq	%rsp,%r9		# save caller %rsp for the epilogue

	subq	$0x280+8,%rsp		# scratch + splatted-state area
	andq	$-32,%rsp		# 32-byte align for vmovdqa on %ymm
	vzeroupper

# Broadcast the four 128-bit state rows into both lanes, then splat
# each 32-bit word 8 ways.
	vbroadcasti128	L$sigma(%rip),%ymm11	# "expand 32-byte k"
	vbroadcasti128	(%rcx),%ymm3		# key words 0-3
	vbroadcasti128	16(%rcx),%ymm15		# key words 4-7
	vbroadcasti128	(%r8),%ymm7		# counter + nonce
	leaq	256(%rsp),%rcx		# %rcx -> first splat bank
	leaq	512(%rsp),%rax		# %rax -> second splat bank
	leaq	L$rot16(%rip),%r10	# vpshufb mask: rotate-left-16
	leaq	L$rot24(%rip),%r11	# vpshufb mask: rotate-left-8

# Splat sigma constants (state words 0-3) and store.
	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

# Splat key words 0-3 (state words 4-7).
	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

# Splat key words 4-7 (state words 8-11).
	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

# Splat counter + nonce (state words 12-15).  L$incy = {0,2,4,6,1,3,5,7}
# gives each of the 8 lanes a distinct block counter.
	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	L$incy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	L$oop_enter8x

.p2align 5
L$oop_outer8x:
# Reload the splatted state for the next batch of 8 blocks.
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	L$eight(%rip),%ymm4,%ymm4	# advance all 8 block counters

L$oop_enter8x:
	vmovdqa	%ymm14,64(%rsp)		# spill 2 state words (16 regs for
	vmovdqa	%ymm15,96(%rsp)		# 16 words + 2 temps won't fit)
	vbroadcasti128	(%r10),%ymm15	# rot16 shuffle mask
	vmovdqa	%ymm4,512-512(%rax)	# keep counters for the final add
	movl	$10,%eax		# 10 x (column+diagonal) = 20 rounds
	jmp	L$oop8x

.p2align 5
L$oop8x:
# Column round, quarter-rounds interleaved two at a time.  Each QR is:
#   a+=b; d^=a; d<<<=16;  c+=d; b^=c; b<<<=12;
#   a+=b; d^=a; d<<<=8;   c+=d; b^=c; b<<<=7;
# <<<16 / <<<8 via vpshufb masks; <<<12 / <<<7 via vpslld/vpsrld/vpor.
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14	# switch temp to rot8 mask
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15	# back to rot16 mask
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)		# swap active/spilled word pairs
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
# Remaining two column quarter-rounds.
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
# Diagonal round, first two quarter-rounds.
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)		# swap word pairs back
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
# Remaining two diagonal quarter-rounds.
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	L$oop8x

# Rounds done: add the original (splatted) state back, then transpose
# the 8 lanes back into 8 sequential 64-byte blocks using
# vpunpck{l,h}{dq,qdq} + vperm2i128.
	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)		# park first results, reload spills
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx		# full 8-block batch remaining?
	jb	L$tail8x

# Fast path: XOR and store 512 bytes, 128 at a time.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	L$oop_outer8x

	jmp	L$done8x

# Partial batch: dispatch on the number of whole 64-byte units left.
L$tail8x:
	cmpq	$448,%rdx
	jae	L$448_or_more8x
	cmpq	$384,%rdx
	jae	L$384_or_more8x
	cmpq	$320,%rdx
	jae	L$320_or_more8x
	cmpq	$256,%rdx
	jae	L$256_or_more8x
	cmpq	$192,%rdx
	jae	L$192_or_more8x
	cmpq	$128,%rdx
	jae	L$128_or_more8x
	cmpq	$64,%rdx
	jae	L$64_or_more8x

# <64 bytes: stage one keystream block at (%rsp) for the byte loop.
	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	L$oop_tail8x

# Each L$NNN_or_more8x handler below XORs/stores NNN whole bytes, exits
# if the length was exactly NNN, otherwise stages the next 64-byte
# keystream block at (%rsp) and falls through to the byte-tail loop.
.p2align 5
L$64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	L$done8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	L$oop_tail8x

.p2align 5
L$128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	L$done8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	L$oop_tail8x

.p2align 5
L$192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	L$done8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	L$oop_tail8x

.p2align 5
L$256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	L$done8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	L$oop_tail8x

.p2align 5
L$320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	L$done8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	L$oop_tail8x

.p2align 5
L$384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	L$done8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	L$oop_tail8x

.p2align 5
L$448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	L$done8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

# Byte-at-a-time tail: XOR the final (<64) bytes against the keystream
# block staged at (%rsp).  %r10 = byte index, %rdx = remaining count.
L$oop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	L$oop_tail8x

L$done8x:
	vzeroall			# clear all %ymm (key material scrub
					# + AVX->SSE transition hygiene)
	leaq	(%r9),%rsp		# restore stack pointer saved at entry

L$8x_epilogue:
	.byte	0xf3,0xc3		# repz ret