#include "x86.inc"

/*
 * Poly1305 routines for 32-bit x86 using SSE2 (GNU as, AT&T operand order).
 * The file is run through the C preprocessor; SECTION_TEXT, GLOBAL_HIDDEN_FN
 * and FN_END are macros supplied by x86.inc.
 *
 * State layout, as read and written by the code in this file
 * (byte offsets from the state pointer):
 *     0.. 39  accumulator. While blocks are being absorbed it holds five
 *             limbs for each of two lanes; after the final blocks call
 *             (m == NULL) offsets 0..16 hold five reduced 26-bit limbs.
 *    40.. 56  five 32-bit limbs built from key bytes 0..15 (masked), "r"
 *    60.. 76  five limbs of r squared   (written only when byte hint > 16)
 *    80.. 96  five limbs of r^2 squared (written only when byte hint >= 96)
 *   100..112  key bytes 16..31, copied verbatim
 *   116       flags word:
 *               0x01  accumulator has been loaded with a first 32 bytes
 *               0x04  keep the 1<<24 top-limb bit for lane 0 only
 *               0x08  clear the 1<<24 top-limb bit for both lanes
 *               0x10  tail multipliers are (r^2, r) for (lane 0, lane 1)
 *               0x20  tail multipliers are (r, 1)   for (lane 0, lane 1)
 *
 * The *_local labels are register-argument entry points used by the
 * routines in this file:
 *   poly1305_init_ext_sse2_local  eax = state, edx = key, ecx = byte hint
 *   poly1305_blocks_sse2_local    eax = state, edx = m,   ecx = bytes
 * poly1305_finish_ext_sse2_local takes its arguments on the stack, exactly
 * like the public symbol.
 *
 * Alignment: poly1305_blocks_sse2 and poly1305_finish_ext_sse2 use
 * movdqa/movaps on %esp-relative slots, so %esp must be 16-byte aligned
 * after their prologues. Their frames are 4 + 12 + 544 and 4 + 16 + 60
 * bytes (return address, pushes, locals), i.e. the caller's %esp has to be
 * 16-byte aligned at the call instruction.
 */

SECTION_TEXT

/*
 * poly1305_block_size_sse2
 * Returns the constant 32 in %eax.
 */
GLOBAL_HIDDEN_FN poly1305_block_size_sse2
	movl $32, %eax
	ret
FN_END poly1305_block_size_sse2

/*
 * poly1305_auth_sse2
 * One-shot entry point. Stack arguments, addressed after the frame
 * pointer is set up:
 *    8(%ebp)  mac    16 output bytes are stored here by finish_ext
 *   12(%ebp)  m      input pointer
 *   16(%ebp)  bytes  input length
 *   20(%ebp)  key    32 bytes are read by init_ext
 * Builds a state at 64(%esp) on a 64-byte aligned frame, initialises it,
 * absorbs the largest multiple of 32 bytes, then lets finish_ext handle
 * the remaining bytes and produce the tag.
 * Saves and restores %ebp, %esi, %edi and %ebx.
 */
GLOBAL_HIDDEN_FN poly1305_auth_sse2
poly1305_auth_sse2_local:
	pushl %ebp
	movl %esp, %ebp
	andl $-64, %esp
	pushl %esi
	pushl %edi
	pushl %ebx
	/* 3 pushes + 244 = 256 bytes, so %esp stays 64-byte aligned */
	subl $244, %esp
	movl 16(%ebp), %esi
	lea 64(%esp), %eax
	movl %esi, %ecx
	movl 20(%ebp), %edx
	movl 12(%ebp), %edi
	/* eax = state, edx = key, ecx = bytes (used as size hint) */
	call poly1305_init_ext_sse2_local
poly1305_auth_sse2_2:
	/* ebx = bytes rounded down to a multiple of 32; skip if zero */
	movl %esi, %ebx
	andl $-32, %ebx
	je poly1305_auth_sse2_5
poly1305_auth_sse2_3:
	movl %edi, %edx
	lea 64(%esp), %eax
	movl %ebx, %ecx
	call poly1305_blocks_sse2_local
poly1305_auth_sse2_4:
	/* advance m and shrink bytes by the amount just consumed */
	addl %ebx, %edi
	subl %ebx, %esi
poly1305_auth_sse2_5:
	/* push finish_ext(state, m, leftover, mac) right to left */
	pushl 8(%ebp)
	pushl %esi
	pushl %edi
	/* state is at 64(%esp) + the 12 bytes pushed so far */
	lea 76(%esp), %eax
	pushl %eax
	call poly1305_finish_ext_sse2_local
poly1305_auth_sse2_6:
	/* 260 = 244 bytes of locals + 16 bytes of pushed arguments */
	addl $260, %esp
	popl %ebx
	popl %edi
	popl %esi
	movl %ebp, %esp
	popl %ebp
	ret
FN_END poly1305_auth_sse2

/*
 * poly1305_finish_ext_sse2
 * Stack arguments, addressed after the prologue (4 pushes + 60 bytes):
 *   80(%esp)  state
 *   84(%esp)  m         pointer to the leftover input bytes
 *   88(%esp)  leftover  number of leftover bytes
 *   92(%esp)  mac       receives 16 bytes
 * Steps:
 *   1. If leftover != 0, copy the leftover bytes into a zeroed 32-byte
 *      buffer at 16(%esp), append a 1 byte unless leftover == 16, set
 *      flag 0x04 (leftover >= 16) or 0x08 (leftover < 16) and run the
 *      buffer through blocks.
 *   2. If flag 0x01 is set, set flag 0x10 (leftover == 0 or > 16) or
 *      0x20 (1..16) and call blocks with m = NULL to fold and reduce.
 *   3. Pack limbs 0..16 of the state into four 32-bit words, add the
 *      four words at state+100 with carry, zero 128 bytes of state and
 *      store the 16-byte result to mac.
 * The copy only looks at bits 16, 8, 4, 2, 1 of leftover, so it moves at
 * most 31 bytes; presumably callers always pass leftover < 32 (the only
 * caller in this file does).
 * %ebp holds leftover across both blocks calls; this works because
 * poly1305_blocks_sse2 never references %ebp.
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_sse2
poly1305_finish_ext_sse2_local:
	pushl %esi
	pushl %edi
	pushl %ebx
	pushl %ebp
	subl $60, %esp
	movl 88(%esp), %ebp
	testl %ebp, %ebp
	movl 80(%esp), %ebx
	je poly1305_finish_ext_sse2_18
poly1305_finish_ext_sse2_2:
	/* zero the 32-byte staging buffer at 16(%esp) */
	pxor %xmm0, %xmm0
	movaps %xmm0, 16(%esp)
	movaps %xmm0, 32(%esp)
poly1305_finish_ext_sse2_3:
	/* edx = destination cursor, ecx = m - buffer, so (%edx,%ecx) is the
	   source byte that belongs at destination %edx */
	movl 84(%esp), %ecx
	lea 16(%esp), %edx
	subl %edx, %ecx
	testl $16, %ebp
	je poly1305_finish_ext_sse2_5
poly1305_finish_ext_sse2_4:
	/* copy 16 bytes; 16(%esp,%ecx) is m itself */
	lea 32(%esp), %edx
	movdqu 16(%esp,%ecx), %xmm0
	movdqa %xmm0, 16(%esp)
poly1305_finish_ext_sse2_5:
	testl $8, %ebp
	je poly1305_finish_ext_sse2_7
poly1305_finish_ext_sse2_6:
	/* copy 8 bytes */
	movl (%edx,%ecx), %esi
	movl 4(%edx,%ecx), %edi
	movl %esi, (%edx)
	movl %edi, 4(%edx)
	addl $8, %edx
poly1305_finish_ext_sse2_7:
	testl $4, %ebp
	je poly1305_finish_ext_sse2_9
poly1305_finish_ext_sse2_8:
	/* copy 4 bytes */
	movl (%edx,%ecx), %esi
	movl %esi, (%edx)
	addl $4, %edx
poly1305_finish_ext_sse2_9:
	testl $2, %ebp
	je poly1305_finish_ext_sse2_11
poly1305_finish_ext_sse2_10:
	/* copy 2 bytes */
	movzwl (%edx,%ecx), %esi
	movw %si, (%edx)
	addl $2, %edx
poly1305_finish_ext_sse2_11:
	testl $1, %ebp
	je poly1305_finish_ext_sse2_13
poly1305_finish_ext_sse2_12:
	/* copy the last byte */
	movzbl (%edx,%ecx), %ecx
	movb %cl, (%edx)
poly1305_finish_ext_sse2_13:
	/* exactly 16 leftover bytes: no marker byte is written */
	cmpl $16, %ebp
	je poly1305_finish_ext_sse2_16
poly1305_finish_ext_sse2_14:
	/* buffer[leftover] = 1; movb leaves the flags from cmpl intact */
	movb $1, 16(%esp,%ebp)
	jae poly1305_finish_ext_sse2_16
poly1305_finish_ext_sse2_15:
	/* leftover < 16 */
	movl $8, %edx
	jmp poly1305_finish_ext_sse2_17
poly1305_finish_ext_sse2_16:
	/* leftover >= 16 */
	movl $4, %edx
poly1305_finish_ext_sse2_17:
	orl %edx, 116(%ebx)
	/* blocks(state, buffer, 32) */
	movl %ebx, %eax
	movl $32, %ecx
	lea 16(%esp), %edx
	call poly1305_blocks_sse2_local
poly1305_finish_ext_sse2_18:
	/* nothing was ever absorbed: skip the final blocks call */
	movl 116(%ebx), %edx
	testb $1, %dl
	je poly1305_finish_ext_sse2_24
poly1305_finish_ext_sse2_19:
	testl %ebp, %ebp
	je poly1305_finish_ext_sse2_21
poly1305_finish_ext_sse2_20:
	cmpl $16, %ebp
	jbe poly1305_finish_ext_sse2_22
poly1305_finish_ext_sse2_21:
	/* leftover == 0 or leftover > 16 */
	orl $16, %edx
	movl %edx, 116(%ebx)
	jmp poly1305_finish_ext_sse2_23
poly1305_finish_ext_sse2_22:
	/* 1 <= leftover <= 16 */
	orl $32, %edx
	movl %edx, 116(%ebx)
poly1305_finish_ext_sse2_23:
	/* blocks(state, NULL, 32): final multiply, lane fold and reduction */
	movl %ebx, %eax
	xorl %edx, %edx
	movl $32, %ecx
	call poly1305_blocks_sse2_local
poly1305_finish_ext_sse2_24:
	/* Pack limbs h0..h4 (26 bits each) into four 32-bit words:
	     esi = h0 | h1 << 26          ecx = h1 >> 6  | h2 << 20
	     edx = h2 >> 12 | h3 << 14    ebp = h3 >> 18 | h4 << 8   */
	movl 8(%ebx), %edx
	movl %edx, %eax
	movl 4(%ebx), %ecx
	movl %ecx, %esi
	shrl $6, %ecx
	shll $20, %eax
	pxor %xmm0, %xmm0
	movl 12(%ebx), %ebp
	orl %eax, %ecx
	movl %ebp, %eax
	shrl $12, %edx
	shll $14, %eax
	orl %eax, %edx
	movl 16(%ebx), %eax
	shll $26, %esi
	shrl $18, %ebp
	shll $8, %eax
	movl 92(%esp), %edi
	orl %eax, %ebp
	orl (%ebx), %esi
	/* add the four words copied from key bytes 16..31, with carry */
	addl 100(%ebx), %esi
	adcl 104(%ebx), %ecx
	adcl 108(%ebx), %edx
	adcl 112(%ebx), %ebp
	/* wipe 128 bytes of state before returning */
	movdqu %xmm0, (%ebx)
	movdqu %xmm0, 16(%ebx)
	movdqu %xmm0, 32(%ebx)
	movdqu %xmm0, 48(%ebx)
	movdqu %xmm0, 64(%ebx)
	movdqu %xmm0, 80(%ebx)
	movdqu %xmm0, 96(%ebx)
	movdqu %xmm0, 112(%ebx)
	/* store the 16-byte result */
	movl %esi, (%edi)
	movl %ecx, 4(%edi)
	movl %edx, 8(%edi)
	movl %ebp, 12(%edi)
	addl $60, %esp
	popl %ebp
	popl %ebx
	popl %edi
	popl %esi
	ret
FN_END poly1305_finish_ext_sse2


/*
 * poly1305_blocks_sse2
 * Public entry: stack arguments 4(%esp) = state, 8(%esp) = m,
 * 12(%esp) = bytes, loaded into eax / edx / ecx before falling into the
 * register-argument entry poly1305_blocks_sse2_local.
 *
 * Two 16-byte blocks are processed side by side, one per 64-bit lane
 * (lane 0 = block at m, lane 1 = block at m + 16). Limbs are 26 bits wide
 * (mask 0x3ffffff); a carry out of limb 4 is multiplied by 5 and added
 * back into limb 0.
 *
 * Outline:
 *   - flag 0x01 clear: the first 32 bytes of m are split into limbs and
 *     become the accumulator, then flag 0x01 is set in the state.
 *     flag 0x01 set: the accumulator is reloaded from state 0..39.
 *   - while bytes >= 64: acc = acc * (value at state+80)
 *                              + next 32 bytes * tail multipliers
 *                              + following 32 bytes
 *   - if bytes >= 32: acc = acc * tail multipliers, plus 32 more input
 *     bytes when m != NULL.
 *   - m != NULL: write both lanes back to state 0..39.
 *     m == NULL: add the two lanes, carry, conditionally subtract the
 *     modulus and write five 32-bit limbs to state 0..16.
 *
 * Frame slots (offsets from %esp after the prologue):
 *   160            constant 5 in both lanes
 *   256            mask 0x3ffffff in both lanes
 *   272            1<<24 top-limb bit per lane (adjusted by flags 4 / 8)
 *   176 224 208 192 240   tail multiplier limbs 0 1 2 3 4
 *   288 304 320 336       5 * tail multiplier limbs 1 2 3 4
 *   0 128 112 144 96      limbs 0 1 2 3 4 of the value at state+80
 *   80 64 48 16           5 * limbs 1 2 3 4 of that value
 *   352 368 32 384 400    accumulator limbs 0 1 2 3 4
 *   416 432 448           limbs 0 1 2 of the current 32 input bytes
 *   464 480               interleaved dwords of the following 32 bytes
 *   496 512 528           partial sums for limbs 0 1 2
 * The m == NULL tail reuses 0..28(%esp) as scalar scratch.
 *
 * Saves and restores %esi, %edi, %ebx; does not reference %ebp.
 * Compare results are deliberately carried across long runs of SSE and
 * mov instructions, none of which modify EFLAGS.
 */
GLOBAL_HIDDEN_FN poly1305_blocks_sse2
	movl 4(%esp), %eax
	movl 8(%esp), %edx
	movl 12(%esp), %ecx
poly1305_blocks_sse2_local:
	pushl %esi
	pushl %edi
	pushl %ebx
	subl $544, %esp
	/* broadcast the three constants: 1<<24, 0x3ffffff and 5 */
	movl $16777216, %ebx
	movl $67108863, %esi
	movl $5, %edi
	movd %ebx, %xmm0
	movd %esi, %xmm2
	movd %edi, %xmm4
	movl 116(%eax), %ebx
	testb $4, %bl
	pshufd $68, %xmm0, %xmm1
	pshufd $68, %xmm2, %xmm3
	pshufd $68, %xmm4, %xmm5
	movdqa %xmm1, 272(%esp)
	movdqa %xmm3, 256(%esp)
	movdqa %xmm5, 160(%esp)
	je poly1305_blocks_sse2_3
poly1305_blocks_sse2_2:
	/* flag 0x04: drop the top-limb bit from lane 1 */
	movdqa 272(%esp), %xmm0
	psrldq $8, %xmm0
	movdqa %xmm0, 272(%esp)
poly1305_blocks_sse2_3:
	testb $8, %bl
	je poly1305_blocks_sse2_5
poly1305_blocks_sse2_4:
	/* flag 0x08: drop the top-limb bit from both lanes */
	pxor %xmm0, %xmm0
	movdqa %xmm0, 272(%esp)
poly1305_blocks_sse2_5:
	testb $1, %bl
	jne poly1305_blocks_sse2_7
poly1305_blocks_sse2_6:
	/* first call: accumulator = first 32 bytes of m, split into limbs
	   xmm7 = limb 0, xmm6 = limb 1, 32(%esp) = limb 2, xmm1 = limb 3,
	   xmm0 = limb 4 */
	movq 8(%edx), %xmm0
	orl $1, %ebx
	movq (%edx), %xmm1
	addl $-32, %ecx
	movhpd 24(%edx), %xmm0
	movdqa 256(%esp), %xmm4
	movaps %xmm0, %xmm2
	movhpd 16(%edx), %xmm1
	movdqa %xmm4, %xmm7
	pand %xmm1, %xmm7
	movaps %xmm1, %xmm6
	psrlq $52, %xmm1
	psllq $12, %xmm2
	por %xmm2, %xmm1
	movdqa %xmm4, %xmm3
	psrlq $26, %xmm6
	pand %xmm1, %xmm3
	psrlq $26, %xmm1
	psrlq $40, %xmm0
	movdqa %xmm3, 32(%esp)
	pand %xmm4, %xmm6
	por 272(%esp), %xmm0
	pand %xmm4, %xmm1
	movl %ebx, 116(%eax)
	addl $32, %edx
	jmp poly1305_blocks_sse2_8
poly1305_blocks_sse2_7:
	/* later calls: reload both accumulator lanes from state 0..39 */
	movdqu 16(%eax), %xmm0
	movdqu (%eax), %xmm6
	movdqu 32(%eax), %xmm2
	pshufd $80, %xmm0, %xmm1
	pshufd $80, %xmm6, %xmm7
	pshufd $250, %xmm6, %xmm6
	movdqa %xmm1, 32(%esp)
	pshufd $250, %xmm0, %xmm1
	pshufd $80, %xmm2, %xmm0
poly1305_blocks_sse2_8:
	/* choose the tail multipliers according to flags 0x10 / 0x20 */
	testb $48, %bl
	je poly1305_blocks_sse2_13
poly1305_blocks_sse2_9:
	movdqu 40(%eax), %xmm2
	movl 56(%eax), %esi
	testb $16, %bl
	je poly1305_blocks_sse2_11
poly1305_blocks_sse2_10:
	/* flag 0x10: lane 0 uses the limbs at state+60, lane 1 those at
	   state+40 */
	movdqu 60(%eax), %xmm4
	movdqa %xmm4, %xmm5
	movd 76(%eax), %xmm3
	punpckldq %xmm2, %xmm5
	punpckhdq %xmm2, %xmm4
	movd %esi, %xmm2
	punpcklqdq %xmm2, %xmm3
	movdqa %xmm3, 240(%esp)
	jmp poly1305_blocks_sse2_12
poly1305_blocks_sse2_11:
	/* flag 0x20 only: lane 0 uses the limbs at state+40, lane 1 the
	   constant 1 */
	movl $1, %ebx
	movdqa %xmm2, %xmm5
	movdqa %xmm2, %xmm4
	movd %esi, %xmm2
	movdqa %xmm2, 240(%esp)
	movd %ebx, %xmm3
	punpckldq %xmm3, %xmm5
	punpckhdq %xmm3, %xmm4
poly1305_blocks_sse2_12:
	pshufd $80, %xmm5, %xmm2
	pshufd $250, %xmm5, %xmm3
	pshufd $80, %xmm4, %xmm5
	pshufd $250, %xmm4, %xmm4
	movdqa %xmm2, 176(%esp)
	movdqa %xmm3, 224(%esp)
	movdqa %xmm5, 208(%esp)
	movdqa %xmm4, 192(%esp)
	jmp poly1305_blocks_sse2_14
poly1305_blocks_sse2_13:
	/* no final flag: both lanes use the limbs at state+60 */
	movdqu 60(%eax), %xmm3
	movd 76(%eax), %xmm2
	pshufd $0, %xmm3, %xmm4
	movdqa %xmm4, 176(%esp)
	pshufd $85, %xmm3, %xmm5
	pshufd $170, %xmm3, %xmm4
	pshufd $255, %xmm3, %xmm3
	pshufd $0, %xmm2, %xmm2
	movdqa %xmm5, 224(%esp)
	movdqa %xmm4, 208(%esp)
	movdqa %xmm3, 192(%esp)
	movdqa %xmm2, 240(%esp)
poly1305_blocks_sse2_14:
	/* precompute 5 * multiplier limbs 1..4:
	   xmm4 = 5*limb1, xmm3 = 5*limb2, xmm2 = 5*limb3, xmm5 = 5*limb4.
	   The cmpl result is consumed by the jb below. */
	movdqa 160(%esp), %xmm2
	cmpl $64, %ecx
	movdqa 192(%esp), %xmm5
	pmuludq %xmm2, %xmm5
	movdqa %xmm5, 320(%esp)
	movdqa 224(%esp), %xmm4
	movdqa 208(%esp), %xmm3
	movdqa 240(%esp), %xmm5
	pmuludq %xmm2, %xmm4
	pmuludq %xmm2, %xmm3
	pmuludq %xmm2, %xmm5
	movdqa 320(%esp), %xmm2
	jb poly1305_blocks_sse2_18
poly1305_blocks_sse2_15:
	/* at least 64 bytes: spill the tail multipliers and set up the
	   limbs of the value at state+80 (and 5x of limbs 1..4) */
	movdqa %xmm3, 304(%esp)
	movdqu 80(%eax), %xmm3
	movaps %xmm0, 400(%esp)
	movd 96(%eax), %xmm0
	movdqa %xmm2, 320(%esp)
	pshufd $85, %xmm3, %xmm2
	movdqa %xmm2, 128(%esp)
	pshufd $0, %xmm0, %xmm2
	movdqa 160(%esp), %xmm0
	movdqa %xmm5, 336(%esp)
	pshufd $0, %xmm3, %xmm5
	movdqa %xmm5, (%esp)
	movdqa %xmm0, %xmm5
	pmuludq 128(%esp), %xmm5
	movdqa %xmm4, 288(%esp)
	pshufd $170, %xmm3, %xmm4
	movdqa %xmm5, 80(%esp)
	movdqa %xmm0, %xmm5
	movdqa %xmm4, 112(%esp)
	pshufd $255, %xmm3, %xmm3
	pmuludq %xmm4, %xmm5
	movdqa %xmm0, %xmm4
	pmuludq %xmm3, %xmm4
	pmuludq %xmm2, %xmm0
	movdqa %xmm3, 144(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm5, 64(%esp)
	movdqa %xmm4, 48(%esp)
	movdqa %xmm0, 16(%esp)
	movaps 400(%esp), %xmm0
	movaps %xmm1, 384(%esp)
	movdqa %xmm6, 368(%esp)
	movdqa %xmm7, 352(%esp)
poly1305_blocks_sse2_16:
	/* main loop, 64 input bytes per iteration.
	   First split bytes 0..31 into limbs (xmm4/416 = limb 0, xmm3 = 1,
	   xmm2 = 2, xmm5 = 3, xmm6 = 4) and stage bytes 32..63. */
	movq 8(%edx), %xmm6
	addl $-64, %ecx
	movq (%edx), %xmm2
	movhpd 24(%edx), %xmm6
	movdqa 256(%esp), %xmm1
	movaps %xmm6, %xmm7
	movhpd 16(%edx), %xmm2
	movdqa %xmm1, %xmm4
	pand %xmm2, %xmm4
	movaps %xmm2, %xmm3
	movaps %xmm6, %xmm5
	psrlq $52, %xmm2
	psllq $12, %xmm7
	psrlq $26, %xmm3
	psrlq $14, %xmm5
	por %xmm7, %xmm2
	pand %xmm1, %xmm3
	pand %xmm1, %xmm5
	pand %xmm1, %xmm2
	psrlq $40, %xmm6
	movdqu 32(%edx), %xmm1
	movdqu 48(%edx), %xmm7
	movdqa %xmm4, 416(%esp)
	movdqa %xmm1, %xmm4
	punpckldq %xmm7, %xmm4
	addl $64, %edx
	punpckhdq %xmm7, %xmm1
	/* loop condition; flags are consumed by the jae at the loop end */
	cmpl $64, %ecx
	/* limb 0 of the new accumulator -> 496(%esp) */
	movaps 384(%esp), %xmm7
	movaps %xmm0, 400(%esp)
	pmuludq 80(%esp), %xmm0
	pmuludq 64(%esp), %xmm7
	movdqa %xmm1, 480(%esp)
	movdqa 32(%esp), %xmm1
	pmuludq 48(%esp), %xmm1
	paddq %xmm7, %xmm0
	movdqa 368(%esp), %xmm7
	pmuludq 16(%esp), %xmm7
	paddq %xmm1, %xmm0
	movdqa 352(%esp), %xmm1
	paddq %xmm7, %xmm0
	movdqa (%esp), %xmm7
	pmuludq %xmm7, %xmm1
	/* add the top-limb bit to limb 4 of the first 32 bytes */
	por 272(%esp), %xmm6
	paddq %xmm1, %xmm0
	movdqa 288(%esp), %xmm1
	pmuludq %xmm6, %xmm1
	paddq %xmm1, %xmm0
	movdqa 304(%esp), %xmm1
	pmuludq %xmm5, %xmm1
	paddq %xmm1, %xmm0
	movdqa 320(%esp), %xmm1
	pmuludq %xmm2, %xmm1
	movaps %xmm2, 448(%esp)
	movdqa 336(%esp), %xmm2
	paddq %xmm1, %xmm0
	movdqa %xmm2, %xmm1
	pmuludq %xmm3, %xmm1
	paddq %xmm1, %xmm0
	movdqa 176(%esp), %xmm1
	movdqa %xmm3, 432(%esp)
	movdqa %xmm1, %xmm3
	pmuludq 416(%esp), %xmm3
	pmuludq %xmm5, %xmm1
	paddq %xmm3, %xmm0
	pxor %xmm3, %xmm3
	movdqa %xmm4, 464(%esp)
	punpckldq %xmm3, %xmm4
	paddq %xmm4, %xmm0
	movdqa %xmm0, 496(%esp)
	/* limb 3 of the new accumulator -> 512(%esp) */
	movaps 400(%esp), %xmm4
	movaps 384(%esp), %xmm0
	pmuludq 16(%esp), %xmm4
	pmuludq %xmm7, %xmm0
	movdqa 32(%esp), %xmm7
	pmuludq 128(%esp), %xmm7
	paddq %xmm0, %xmm4
	movdqa 368(%esp), %xmm0
	pmuludq 112(%esp), %xmm0
	paddq %xmm7, %xmm4
	movdqa 352(%esp), %xmm7
	pmuludq 144(%esp), %xmm7
	paddq %xmm0, %xmm4
	movdqa %xmm2, %xmm0
	pmuludq %xmm6, %xmm0
	paddq %xmm7, %xmm4
	movdqa 224(%esp), %xmm7
	paddq %xmm0, %xmm4
	movaps 448(%esp), %xmm0
	pmuludq %xmm0, %xmm7
	pmuludq %xmm0, %xmm2
	paddq %xmm1, %xmm4
	movdqa 208(%esp), %xmm1
	pmuludq 432(%esp), %xmm1
	paddq %xmm7, %xmm4
	movdqa 192(%esp), %xmm7
	paddq %xmm1, %xmm4
	movdqa 416(%esp), %xmm1
	pmuludq %xmm1, %xmm7
	paddq %xmm7, %xmm4
	movdqa 480(%esp), %xmm7
	punpckhdq %xmm3, %xmm7
	psllq $18, %xmm7
	paddq %xmm7, %xmm4
	movdqa %xmm4, 512(%esp)
	/* limb 1 of the new accumulator (plus carry from limb 0)
	   -> 528(%esp) */
	movaps 400(%esp), %xmm4
	movaps 384(%esp), %xmm3
	pmuludq 64(%esp), %xmm4
	pmuludq 48(%esp), %xmm3
	movdqa 32(%esp), %xmm7
	pmuludq 16(%esp), %xmm7
	paddq %xmm3, %xmm4
	movdqa 368(%esp), %xmm3
	pmuludq (%esp), %xmm3
	paddq %xmm7, %xmm4
	movdqa 352(%esp), %xmm7
	paddq %xmm3, %xmm4
	movdqa 128(%esp), %xmm3
	pmuludq %xmm3, %xmm7
	paddq %xmm7, %xmm4
	movdqa 304(%esp), %xmm7
	pmuludq %xmm6, %xmm7
	paddq %xmm7, %xmm4
	movdqa 320(%esp), %xmm7
	pmuludq %xmm5, %xmm7
	paddq %xmm7, %xmm4
	movdqa 176(%esp), %xmm7
	movdqa %xmm7, %xmm0
	pmuludq 432(%esp), %xmm0
	paddq %xmm2, %xmm4
	movdqa 224(%esp), %xmm2
	paddq %xmm0, %xmm4
	movdqa %xmm2, %xmm0
	pmuludq %xmm1, %xmm0
	pmuludq %xmm5, %xmm2
	pmuludq 336(%esp), %xmm5
	paddq %xmm0, %xmm4
	movdqa 464(%esp), %xmm0
	pxor %xmm1, %xmm1
	punpckhdq %xmm1, %xmm0
	psllq $6, %xmm0
	movdqa 496(%esp), %xmm1
	paddq %xmm0, %xmm4
	psrlq $26, %xmm1
	paddq %xmm1, %xmm4
	movdqa %xmm4, 528(%esp)
	/* limb 4 (xmm0) and limb 2 (xmm4) of the new accumulator */
	movaps 400(%esp), %xmm4
	movaps 384(%esp), %xmm1
	movaps %xmm4, %xmm0
	pmuludq (%esp), %xmm0
	pmuludq %xmm3, %xmm1
	pmuludq 48(%esp), %xmm4
	paddq %xmm1, %xmm0
	movdqa 32(%esp), %xmm3
	pmuludq 112(%esp), %xmm3
	paddq %xmm3, %xmm0
	movdqa 368(%esp), %xmm3
	movdqa %xmm3, %xmm1
	pmuludq 144(%esp), %xmm1
	pmuludq 128(%esp), %xmm3
	paddq %xmm1, %xmm0
	movdqa 352(%esp), %xmm1
	pmuludq 96(%esp), %xmm1
	paddq %xmm1, %xmm0
	movdqa %xmm7, %xmm1
	pmuludq %xmm6, %xmm1
	pmuludq 320(%esp), %xmm6
	paddq %xmm1, %xmm0
	movdqa 208(%esp), %xmm1
	pmuludq 448(%esp), %xmm1
	paddq %xmm2, %xmm0
	movdqa 192(%esp), %xmm2
	pmuludq 432(%esp), %xmm2
	paddq %xmm1, %xmm0
	movdqa 416(%esp), %xmm1
	paddq %xmm2, %xmm0
	movdqa 240(%esp), %xmm2
	pmuludq %xmm1, %xmm2
	pmuludq 208(%esp), %xmm1
	paddq %xmm2, %xmm0
	movdqa 512(%esp), %xmm2
	/* top-limb bit for input bytes 32..63 */
	paddq 272(%esp), %xmm0
	psrlq $26, %xmm2
	paddq %xmm2, %xmm0
	movaps 384(%esp), %xmm2
	pmuludq 16(%esp), %xmm2
	paddq %xmm2, %xmm4
	movdqa 32(%esp), %xmm2
	pmuludq (%esp), %xmm2
	paddq %xmm2, %xmm4
	paddq %xmm3, %xmm4
	movdqa 352(%esp), %xmm3
	pmuludq 112(%esp), %xmm3
	paddq %xmm3, %xmm4
	paddq %xmm6, %xmm4
	movaps 448(%esp), %xmm6
	movdqa %xmm0, %xmm3
	pmuludq %xmm7, %xmm6
	psrlq $26, %xmm3
	/* carry out of limb 4, times 5, goes back into limb 0 */
	pmuludq 160(%esp), %xmm3
	paddq %xmm5, %xmm4
	movdqa 432(%esp), %xmm5
	pmuludq 224(%esp), %xmm5
	paddq %xmm6, %xmm4
	paddq %xmm5, %xmm4
	movdqa 480(%esp), %xmm7
	pxor %xmm5, %xmm5
	punpckldq %xmm5, %xmm7
	paddq %xmm1, %xmm4
	movdqa 528(%esp), %xmm2
	psllq $12, %xmm7
	movdqa %xmm2, %xmm1
	paddq %xmm7, %xmm4
	psrlq $26, %xmm1
	paddq %xmm1, %xmm4
	/* mask every limb to 26 bits, propagate the remaining carries and
	   spill the accumulator for the next iteration */
	movdqa 256(%esp), %xmm5
	movaps %xmm4, %xmm1
	movdqa 496(%esp), %xmm6
	psrlq $26, %xmm1
	movdqa 512(%esp), %xmm7
	pand %xmm5, %xmm6
	pand %xmm5, %xmm7
	pand %xmm5, %xmm2
	paddq %xmm3, %xmm6
	paddq %xmm1, %xmm7
	movdqa %xmm5, %xmm3
	movdqa %xmm5, %xmm1
	pand %xmm6, %xmm3
	psrlq $26, %xmm6
	pand %xmm4, %xmm1
	movdqa %xmm5, %xmm4
	paddq %xmm6, %xmm2
	pand %xmm7, %xmm4
	pand %xmm5, %xmm0
	psrlq $26, %xmm7
	movdqa %xmm3, 352(%esp)
	movdqa %xmm2, 368(%esp)
	movdqa %xmm1, 32(%esp)
	movaps %xmm4, 384(%esp)
	paddq %xmm7, %xmm0
	/* still >= 64 bytes left (flags from the cmpl above) */
	jae poly1305_blocks_sse2_16
poly1305_blocks_sse2_17:
	/* reload the register assignment expected by the tail:
	   xmm4 xmm3 xmm2 xmm5 = 5 * multiplier limbs 1..4,
	   xmm7 xmm6 32(%esp) xmm1 xmm0 = accumulator limbs 0..4 */
	movdqa 336(%esp), %xmm5
	movdqa 320(%esp), %xmm2
	movdqa 304(%esp), %xmm3
	movdqa 288(%esp), %xmm4
	movaps 384(%esp), %xmm1
	movdqa 368(%esp), %xmm6
	movdqa 352(%esp), %xmm7
poly1305_blocks_sse2_18:
	cmpl $32, %ecx
	jb poly1305_blocks_sse2_22
poly1305_blocks_sse2_19:
	/* tail: accumulator *= tail multipliers. The testl result (m ==
	   NULL?) is consumed by the je at the end of this block. */
	movaps %xmm1, 384(%esp)
	testl %edx, %edx
	pmuludq %xmm0, %xmm4
	pmuludq %xmm3, %xmm1
	pmuludq %xmm0, %xmm3
	paddq %xmm1, %xmm4
	movdqa 32(%esp), %xmm1
	pmuludq %xmm2, %xmm1
	paddq %xmm1, %xmm4
	movdqa %xmm6, %xmm1
	pmuludq %xmm5, %xmm1
	paddq %xmm1, %xmm4
	movdqa 176(%esp), %xmm1
	movdqa %xmm7, 352(%esp)
	pmuludq %xmm1, %xmm7
	paddq %xmm7, %xmm4
	movdqa %xmm4, 288(%esp)
	movaps 384(%esp), %xmm4
	movaps %xmm4, %xmm7
	pmuludq %xmm2, %xmm7
	pmuludq %xmm0, %xmm2
	paddq %xmm7, %xmm3
	movdqa 32(%esp), %xmm7
	pmuludq %xmm5, %xmm7
	movdqa %xmm6, 368(%esp)
	pmuludq %xmm1, %xmm6
	paddq %xmm7, %xmm3
	paddq %xmm6, %xmm3
	movdqa 352(%esp), %xmm7
	movdqa 224(%esp), %xmm6
	pmuludq %xmm6, %xmm7
	paddq %xmm7, %xmm3
	movdqa %xmm3, 304(%esp)
	movaps %xmm4, %xmm3
	pmuludq %xmm5, %xmm3
	pmuludq %xmm0, %xmm5
	pmuludq %xmm1, %xmm0
	paddq %xmm3, %xmm2
	movdqa 32(%esp), %xmm3
	movdqa %xmm3, %xmm7
	pmuludq %xmm1, %xmm7
	paddq %xmm7, %xmm2
	movdqa 368(%esp), %xmm7
	pmuludq %xmm6, %xmm7
	movdqa 352(%esp), %xmm6
	paddq %xmm7, %xmm2
	movdqa 208(%esp), %xmm7
	pmuludq %xmm7, %xmm6
	paddq %xmm6, %xmm2
	movdqa %xmm2, 320(%esp)
	movaps %xmm4, %xmm2
	pmuludq %xmm1, %xmm2
	paddq %xmm2, %xmm5
	movdqa 224(%esp), %xmm6
	movdqa %xmm3, %xmm2
	pmuludq %xmm6, %xmm2
	pmuludq %xmm6, %xmm4
	pmuludq 208(%esp), %xmm3
	paddq %xmm2, %xmm5
	paddq %xmm4, %xmm0
	movdqa 368(%esp), %xmm2
	pmuludq %xmm7, %xmm2
	paddq %xmm3, %xmm0
	paddq %xmm2, %xmm5
	movdqa 192(%esp), %xmm2
	movdqa 368(%esp), %xmm4
	pmuludq %xmm2, %xmm4
	movdqa 352(%esp), %xmm7
	movdqa 352(%esp), %xmm1
	pmuludq 240(%esp), %xmm1
	pmuludq %xmm2, %xmm7
	paddq %xmm4, %xmm0
	paddq %xmm7, %xmm5
	paddq %xmm1, %xmm0
	/* products: xmm4 xmm3 xmm2 xmm5 xmm0 = limbs 0 1 2 3 4 */
	movdqa 288(%esp), %xmm4
	movdqa 304(%esp), %xmm3
	movdqa 320(%esp), %xmm2
	je poly1305_blocks_sse2_21
poly1305_blocks_sse2_20:
	/* m != NULL: add the next 32 input bytes, limb by limb */
	movdqu (%edx), %xmm1
	movdqu 16(%edx), %xmm7
	movdqa %xmm1, %xmm6
	movaps %xmm0, 400(%esp)
	punpckldq %xmm7, %xmm6
	pxor %xmm0, %xmm0
	punpckhdq %xmm7, %xmm1
	movdqa %xmm6, %xmm7
	punpckhdq %xmm0, %xmm6
	psllq $6, %xmm6
	paddq %xmm6, %xmm3
	movdqa %xmm1, %xmm6
	punpckldq %xmm0, %xmm6
	punpckhdq %xmm0, %xmm1
	psllq $12, %xmm6
	punpckldq %xmm0, %xmm7
	psllq $18, %xmm1
	movaps 400(%esp), %xmm0
	paddq 272(%esp), %xmm0
	paddq %xmm7, %xmm4
	paddq %xmm6, %xmm2
	paddq %xmm1, %xmm5
poly1305_blocks_sse2_21:
	/* carry propagation; leaves xmm7 xmm6 32(%esp) xmm1 xmm0 =
	   accumulator limbs 0 1 2 3 4 */
	movdqa %xmm5, %xmm6
	movdqa %xmm4, %xmm7
	psrlq $26, %xmm6
	psrlq $26, %xmm7
	paddq %xmm6, %xmm0
	paddq %xmm7, %xmm3
	movaps %xmm0, %xmm7
	movdqa %xmm3, %xmm1
	psrlq $26, %xmm7
	psrlq $26, %xmm1
	pmuludq 160(%esp), %xmm7
	paddq %xmm1, %xmm2
	movdqa 256(%esp), %xmm1
	movdqa %xmm2, %xmm6
	pand 256(%esp), %xmm5
	psrlq $26, %xmm6
	pand %xmm1, %xmm3
	pand 256(%esp), %xmm4
	paddq %xmm6, %xmm5
	paddq %xmm7, %xmm4
	movdqa %xmm3, %xmm6
	movdqa %xmm1, %xmm3
	movdqa %xmm1, %xmm7
	pand %xmm2, %xmm3
	movdqa %xmm1, %xmm2
	pand %xmm4, %xmm7
	psrlq $26, %xmm4
	pand %xmm5, %xmm1
	pand %xmm2, %xmm0
	psrlq $26, %xmm5
	paddq %xmm4, %xmm6
	paddq %xmm5, %xmm0
	movdqa %xmm3, 32(%esp)
poly1305_blocks_sse2_22:
	testl %edx, %edx
	je poly1305_blocks_sse2_24
poly1305_blocks_sse2_23:
	/* m != NULL: store the low dword of every lane back to state 0..39
	   and return */
	pshufd $8, %xmm7, %xmm2
	pshufd $8, %xmm6, %xmm6
	pshufd $8, 32(%esp), %xmm3
	pshufd $8, %xmm1, %xmm1
	punpcklqdq %xmm6, %xmm2
	punpcklqdq %xmm1, %xmm3
	pshufd $8, %xmm0, %xmm0
	movdqu %xmm2, (%eax)
	movdqu %xmm3, 16(%eax)
	movq %xmm0, 32(%eax)
	addl $544, %esp
	popl %ebx
	popl %edi
	popl %esi
	ret
poly1305_blocks_sse2_24:
	/* m == NULL: add lane 1 to lane 0 for every limb and continue in
	   scalar registers. The state pointer is parked at (%esp). */
	movdqa %xmm7, %xmm2
	movdqa %xmm6, %xmm3
	psrldq $8, %xmm2
	paddq %xmm2, %xmm7
	psrldq $8, %xmm3
	movd %xmm7, %ecx
	paddq %xmm3, %xmm6
	movdqa 32(%esp), %xmm5
	movl %ecx, %esi
	movdqa %xmm5, %xmm4
	andl $67108863, %ecx
	movd %xmm6, %ebx
	movaps %xmm1, %xmm6
	psrldq $8, %xmm4
	paddq %xmm4, %xmm5
	shrl $26, %esi
	addl %esi, %ebx
	psrldq $8, %xmm6
	movd %xmm5, %edi
	paddq %xmm6, %xmm1
	movl %eax, (%esp)
	movl %ebx, %eax
	shrl $26, %eax
	andl $67108863, %ebx
	addl %eax, %edi
	movd %xmm1, %eax
	movaps %xmm0, %xmm1
	psrldq $8, %xmm1
	paddq %xmm1, %xmm0
	movl %edi, %edx
	andl $67108863, %edi
	shrl $26, %edx
	addl %edx, %eax
	movd %xmm0, %edx
	movl %eax, %esi
	shrl $26, %esi
	andl $67108863, %eax
	addl %esi, %edx
	movl %edx, %esi
	andl $67108863, %edx
	shrl $26, %esi
	/* fold the carry out of limb 4 back into limb 0, times 5 */
	lea (%esi,%esi,4), %esi
	addl %esi, %ecx
	movl %ecx, %esi
	andl $67108863, %ecx
	shrl $26, %esi
	addl %esi, %ebx
	movl %ebx, %esi
	andl $67108863, %ebx
	shrl $26, %esi
	addl %esi, %edi
	movl %edi, %esi
	shrl $26, %edi
	andl $67108863, %esi
	addl %edi, %eax
	movl %eax, %edi
	shrl $26, %eax
	andl $67108863, %edi
	addl %eax, %edx
	movl %edx, %eax
	shrl $26, %edx
	andl $67108863, %eax
	movl %eax, 8(%esp)
	movl %edi, 4(%esp)
	lea (%edx,%edx,4), %edx
	addl %edx, %ecx
	movl %ecx, %edx
	andl $67108863, %edx
	shrl $26, %ecx
	addl %ecx, %ebx
	/* h limbs now: edx ebx esi 4(%esp) 8(%esp).
	   Compute g = h + 5 - 2^130 limb by limb into 12..28(%esp). */
	lea 5(%edx), %ecx
	movl %ecx, 12(%esp)
	shrl $26, %ecx
	addl %ebx, %ecx
	movl %ecx, 16(%esp)
	shrl $26, %ecx
	addl %esi, %ecx
	movl %ecx, 20(%esp)
	shrl $26, %ecx
	addl %edi, %ecx
	movl %ecx, 24(%esp)
	shrl $26, %ecx
	movl 12(%esp), %edi
	andl $67108863, %edi
	lea -67108864(%ecx,%eax), %eax
	movl %eax, 28(%esp)
	/* eax = all ones when g did not go negative (select g),
	   zero otherwise (keep h); ecx is the complement */
	shrl $31, %eax
	decl %eax
	movl %eax, %ecx
	andl %eax, %edi
	notl %ecx
	andl %ecx, %edx
	andl %ecx, %ebx
	orl %edi, %edx
	andl %ecx, %esi
	/* branch-free select, then store five limbs to state 0..16 */
	movl (%esp), %edi
	movl %edx, (%edi)
	movl 16(%esp), %edx
	andl $67108863, %edx
	andl %eax, %edx
	orl %edx, %ebx
	movl %ebx, 4(%edi)
	movl 20(%esp), %ebx
	andl $67108863, %ebx
	andl %eax, %ebx
	movl 24(%esp), %edx
	orl %ebx, %esi
	andl $67108863, %edx
	movl %esi, 8(%edi)
	andl %eax, %edx
	movl 4(%esp), %esi
	andl %ecx, %esi
	orl %edx, %esi
	movl 28(%esp), %edx
	andl 8(%esp), %ecx
	andl %eax, %edx
	orl %edx, %ecx
	movl %esi, 12(%edi)
	movl %ecx, 16(%edi)
poly1305_blocks_sse2_25:
	addl $544, %esp
	popl %ebx
	popl %edi
	popl %esi
	ret
FN_END poly1305_blocks_sse2

/*
 * poly1305_init_ext_sse2
 * Public entry: stack arguments 4(%esp) = state, 8(%esp) = key,
 * 12(%esp) = byte hint, loaded into eax / edx / ecx before falling into
 * the register-argument entry poly1305_init_ext_sse2_local.
 *
 * - A hint of 0 is replaced by 0xffffffff.
 * - State bytes 0..47 are zeroed.
 * - Key bytes 0..15 are split into five 26-bit limbs, masked with
 *   0x3ffffff, 0x3ffff03, 0x3ffc0ff, 0x3f03fff, 0x00fffff and stored at
 *   state+40..56.
 * - Key bytes 16..31 are copied to state+100..112.
 * - hint <= 16: nothing more is computed.
 *   hint  > 16: the limbs are squared (carry out of limb 4 folded back
 *               times 5) and stored at state+60..76.
 *   hint >= 96: that result is squared again and stored at state+80..96.
 * - The flags word at state+116 is cleared.
 *
 * Stack slots (offsets from %esp after the prologue):
 *     0  pointer to state+80        4  state pointer
 *     8  destination of the next result
 *    12  byte hint                 16  pointer to state+60
 *    20 24 36 40 32   limbs 0 1 2 3 4 of the value being squared
 *    28  number of squarings done so far
 *    44..72  doubled / times-5 operands and partial sums
 *
 * Saves and restores %esi, %edi, %ebx, %ebp.
 */
GLOBAL_HIDDEN_FN poly1305_init_ext_sse2
	movl 4(%esp), %eax
	movl 8(%esp), %edx
	movl 12(%esp), %ecx
poly1305_init_ext_sse2_local:
	pushl %esi
	pushl %edi
	pushl %ebx
	pushl %ebp
	subl $76, %esp
	movl %edx, %ebx
	movl $-1, %edx
	/* flags from testl are consumed by the cmove below */
	testl %ecx, %ecx
	pxor %xmm0, %xmm0
	movdqu %xmm0, (%eax)
	movdqu %xmm0, 16(%eax)
	movdqu %xmm0, 32(%eax)
	/* hint == 0 means "no limit" */
	cmove %edx, %ecx
	/* split and mask key bytes 0..15 into limbs esi edi edx ecx ebx */
	movl 4(%ebx), %edx
	movl %edx, %ebp
	movl (%ebx), %edi
	movl %edi, %esi
	shrl $26, %edi
	andl $67108863, %esi
	shll $6, %ebp
	movl %ecx, 12(%esp)
	orl %ebp, %edi
	movl 8(%ebx), %ecx
	movl %ecx, %ebp
	shrl $20, %edx
	andl $67108611, %edi
	shll $12, %ebp
	movl %ebx, (%esp)
	orl %ebp, %edx
	movl 12(%ebx), %ebx
	movl %ebx, %ebp
	shrl $14, %ecx
	andl $67092735, %edx
	shll $18, %ebp
	orl %ebp, %ecx
	movl (%esp), %ebp
	andl $66076671, %ecx
	shrl $8, %ebx
	andl $1048575, %ebx
	movl %esi, 40(%eax)
	movl %edi, 44(%eax)
	movl %edx, 48(%eax)
	movl %ecx, 52(%eax)
	movl %ebx, 56(%eax)
	/* copy key bytes 16..31 to state+100..112 */
	movl %esi, 20(%esp)
	movl 16(%ebp), %esi
	movl %esi, 100(%eax)
	movl %edi, 24(%esp)
	movl 20(%ebp), %edi
	movl %edi, 104(%eax)
	movl 24(%ebp), %esi
	movl %esi, 108(%eax)
	lea 80(%eax), %esi
	movl 28(%ebp), %ebp
	movl 12(%esp), %edi
	/* the cmpl result survives the moves and lea up to the jbe */
	cmpl $16, %edi
	movl %ebp, 112(%eax)
	lea 60(%eax), %ebp
	movl $0, 28(%esp)
	movl %ebp, 16(%esp)
	jbe poly1305_init_ext_sse2_9
poly1305_init_ext_sse2_2:
	/* spill the working set; first destination is state+60 */
	movl %ebp, 8(%esp)
	movl %esi, (%esp)
	movl %ebx, 32(%esp)
	movl %ecx, 40(%esp)
	movl %edx, 36(%esp)
	movl %edi, 12(%esp)
	movl %eax, 4(%esp)
poly1305_init_ext_sse2_3:
	/* square the five limbs held in 20/24/36/40/32(%esp) using 64-bit
	   partial sums in register pairs, 26-bit carries between limbs */
	movl 40(%esp), %ebp
	movl 20(%esp), %eax
	mull %eax
	movl 32(%esp), %ebx
	lea (%ebp,%ebp,4), %esi
	movl 24(%esp), %ebp
	movl %eax, %ecx
	movl %esi, 48(%esp)
	lea (%ebx,%ebx), %edi
	movl %edx, %ebx
	movl %edi, 44(%esp)
	lea (%ebp,%ebp,4), %eax
	mull %edi
	movl 36(%esp), %edi
	addl %eax, %ecx
	movl %edi, 60(%esp)
	adcl %edx, %ebx
	lea (%edi,%edi), %eax
	mull %esi
	addl %eax, %ecx
	lea (%ebp,%ebp), %esi
	movl 20(%esp), %eax
	adcl %edx, %ebx
	movl 40(%esp), %ebp
	movl %esi, 56(%esp)
	lea (%eax,%eax), %edx
	movl %edx, 68(%esp)
	mull %esi
	movl %edx, %esi
	addl %ebp, %ebp
	movl %ebp, 64(%esp)
	movl %eax, %ebp
	movl %ecx, 52(%esp)
	lea (%edi,%edi,4), %eax
	mull 44(%esp)
	addl %eax, %ebp
	movl 40(%esp), %eax
	movl 48(%esp), %edi
	adcl %edx, %esi
	mull %edi
	shll $6, %ebx
	shrl $26, %ecx
	orl %ecx, %ebx
	addl %ebx, %eax
	adcl $0, %edx
	addl %eax, %ebp
	movl 24(%esp), %eax
	adcl %edx, %esi
	mull %eax
	movl %eax, %ecx
	movl %edx, %ebx
	movl 60(%esp), %eax
	mull 68(%esp)
	addl %eax, %ecx
	movl 44(%esp), %eax
	adcl %edx, %ebx
	mull %edi
	movl %ebp, 72(%esp)
	shll $6, %esi
	shrl $26, %ebp
	orl %ebp, %esi
	addl %esi, %eax
	adcl $0, %edx
	addl %eax, %ecx
	movl %ecx, %edi
	adcl %edx, %ebx
	andl $67108863, %edi
	shll $6, %ebx
	shrl $26, %ecx
	movl 56(%esp), %eax
	orl %ecx, %ebx
	movl 60(%esp), %ecx
	mull %ecx
	movl %edi, 36(%esp)
	movl %eax, %esi
	movl 20(%esp), %eax
	movl %edx, %ebp
	movl 64(%esp), %edi
	mull %edi
	addl %eax, %esi
	movl 32(%esp), %eax
	adcl %edx, %ebp
	lea (%eax,%eax,4), %edx
	mull %edx
	addl %eax, %esi
	movl %ecx, %eax
	adcl %edx, %ebp
	addl %esi, %ebx
	movl %ebx, %esi
	adcl $0, %ebp
	andl $67108863, %esi
	mull %ecx
	shll $6, %ebp
	movl %eax, %ecx
	shrl $26, %ebx
	movl %edi, %eax
	orl %ebx, %ebp
	movl %edx, %ebx
	mull 24(%esp)
	addl %eax, %ecx
	movl 68(%esp), %eax
	adcl %edx, %ebx
	mull 32(%esp)
	addl %eax, %ecx
	movl 52(%esp), %edi
	adcl %edx, %ebx
	addl %ecx, %ebp
	movl %ebp, %ecx
	adcl $0, %ebx
	andl $67108863, %edi
	shll $6, %ebx
	andl $67108863, %ecx
	shrl $26, %ebp
	orl %ebp, %ebx
	movl 72(%esp), %eax
	andl $67108863, %eax
	movl %ecx, 32(%esp)
	movl 28(%esp), %ecx
	/* carry out of limb 4, times 5, goes back into limb 0 */
	lea (%ebx,%ebx,4), %ebx
	addl %ebx, %edi
	/* count this squaring */
	incl %ecx
	movl %edi, %ebp
	shrl $26, %edi
	andl $67108863, %ebp
	movl %esi, 40(%esp)
	/* flags are consumed by the jae below */
	cmpl $2, %ecx
	movl %ebp, 20(%esp)
	movl %ecx, 28(%esp)
	lea (%eax,%edi), %edx
	movl %edx, 24(%esp)
	jae poly1305_init_ext_sse2_8
poly1305_init_ext_sse2_4:
	cmpl $0, 28(%esp)
	jne poly1305_init_ext_sse2_6
poly1305_init_ext_sse2_5:
	/* Not reachable as written: the counter starts at 0 and has
	   already been incremented before the test above. */
	movl 8(%esp), %esi
	movl 32(%esp), %eax
	movl 40(%esp), %edx
	movl 36(%esp), %ecx
	movl 24(%esp), %ebx
	movl 16(%esp), %edi
	movl %eax, 16(%esi)
	movl %edx, 12(%esi)
	movl %ecx, 8(%esi)
	movl %ebx, 4(%esi)
	movl %ebp, (%esi)
	movl %edi, 8(%esp)
	jmp poly1305_init_ext_sse2_3
poly1305_init_ext_sse2_6:
	cmpl $1, 28(%esp)
	jne poly1305_init_ext_sse2_3
poly1305_init_ext_sse2_7:
	/* first squaring done: store at state+60, retarget to state+80 and
	   square again only when the hint is at least 96 bytes */
	movl 8(%esp), %esi
	movl 32(%esp), %eax
	movl 40(%esp), %edx
	movl 36(%esp), %ecx
	movl 24(%esp), %ebx
	movl (%esp), %edi
	movl %eax, 16(%esi)
	movl %edx, 12(%esi)
	movl %ecx, 8(%esi)
	movl %ebx, 4(%esi)
	movl %ebp, (%esi)
	movl %edi, 8(%esp)
	cmpl $96, 12(%esp)
	jae poly1305_init_ext_sse2_3
	jmp poly1305_init_ext_sse2_10
poly1305_init_ext_sse2_8:
	/* second squaring done: store at the current destination
	   (state+80) and reload the state pointer */
	movl 8(%esp), %ebp
	movl %esi, %ecx
	movl 36(%esp), %edx
	movl %ecx, 12(%ebp)
	movl %edx, 8(%ebp)
	movl 32(%esp), %ebx
	movl 24(%esp), %edx
	movl 20(%esp), %ecx
	movl 4(%esp), %eax
	movl %ebx, 16(%ebp)
	movl %edx, 4(%ebp)
	movl %ecx, (%ebp)
poly1305_init_ext_sse2_9:
	/* clear the flags word and return */
	movl $0, 116(%eax)
	addl $76, %esp
	popl %ebp
	popl %ebx
	popl %edi
	popl %esi
	ret
poly1305_init_ext_sse2_10:
	/* early exit after one squaring: reload the state pointer first */
	movl 4(%esp), %eax
	jmp poly1305_init_ext_sse2_9
FN_END poly1305_init_ext_sse2
