1include ksamd64.inc 2EXTERNDEF s_sosemanukMulTables:FAR 3.CODE 4 5ALIGN 8 6Salsa20_OperateKeystream PROC FRAME 7mov r10, [rsp + 5*8] 8alloc_stack(10*16 + 32*16 + 8) 9save_xmm128 xmm6, 0200h 10save_xmm128 xmm7, 0210h 11save_xmm128 xmm8, 0220h 12save_xmm128 xmm9, 0230h 13save_xmm128 xmm10, 0240h 14save_xmm128 xmm11, 0250h 15save_xmm128 xmm12, 0260h 16save_xmm128 xmm13, 0270h 17save_xmm128 xmm14, 0280h 18save_xmm128 xmm15, 0290h 19.endprolog 20cmp r8, 4 21jl label5 22movdqa xmm0, [r10 + 0*16] 23movdqa xmm1, [r10 + 1*16] 24movdqa xmm2, [r10 + 2*16] 25movdqa xmm3, [r10 + 3*16] 26pshufd xmm4, xmm0, 0*64+0*16+0*4+0 27movdqa [rsp + (0*4+0)*16 + 256], xmm4 28pshufd xmm4, xmm0, 1*64+1*16+1*4+1 29movdqa [rsp + (0*4+1)*16 + 256], xmm4 30pshufd xmm4, xmm0, 2*64+2*16+2*4+2 31movdqa [rsp + (0*4+2)*16 + 256], xmm4 32pshufd xmm4, xmm0, 3*64+3*16+3*4+3 33movdqa [rsp + (0*4+3)*16 + 256], xmm4 34pshufd xmm4, xmm1, 0*64+0*16+0*4+0 35movdqa [rsp + (1*4+0)*16 + 256], xmm4 36pshufd xmm4, xmm1, 2*64+2*16+2*4+2 37movdqa [rsp + (1*4+2)*16 + 256], xmm4 38pshufd xmm4, xmm1, 3*64+3*16+3*4+3 39movdqa [rsp + (1*4+3)*16 + 256], xmm4 40pshufd xmm4, xmm2, 1*64+1*16+1*4+1 41movdqa [rsp + (2*4+1)*16 + 256], xmm4 42pshufd xmm4, xmm2, 2*64+2*16+2*4+2 43movdqa [rsp + (2*4+2)*16 + 256], xmm4 44pshufd xmm4, xmm2, 3*64+3*16+3*4+3 45movdqa [rsp + (2*4+3)*16 + 256], xmm4 46pshufd xmm4, xmm3, 0*64+0*16+0*4+0 47movdqa [rsp + (3*4+0)*16 + 256], xmm4 48pshufd xmm4, xmm3, 1*64+1*16+1*4+1 49movdqa [rsp + (3*4+1)*16 + 256], xmm4 50pshufd xmm4, xmm3, 2*64+2*16+2*4+2 51movdqa [rsp + (3*4+2)*16 + 256], xmm4 52pshufd xmm4, xmm3, 3*64+3*16+3*4+3 53movdqa [rsp + (3*4+3)*16 + 256], xmm4 54label1: 55mov eax, dword ptr [r10 + 8*4] 56mov r11d, dword ptr [r10 + 5*4] 57mov dword ptr [rsp + 8*16 + 0*4 + 256], eax 58mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d 59add eax, 1 60adc r11d, 0 61mov dword ptr [rsp + 8*16 + 1*4 + 256], eax 62mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d 63add eax, 1 64adc r11d, 0 65mov dword ptr [rsp + 
8*16 + 2*4 + 256], eax 66mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d 67add eax, 1 68adc r11d, 0 69mov dword ptr [rsp + 8*16 + 3*4 + 256], eax 70mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d 71add eax, 1 72adc r11d, 0 73mov dword ptr [r10 + 8*4], eax 74mov dword ptr [r10 + 5*4], r11d 75movdqa xmm0, [rsp + 12*16 + 1*256] 76movdqa xmm4, [rsp + 13*16 + 1*256] 77movdqa xmm8, [rsp + 14*16 + 1*256] 78movdqa xmm12, [rsp + 15*16 + 1*256] 79movdqa xmm2, [rsp + 0*16 + 1*256] 80movdqa xmm6, [rsp + 1*16 + 1*256] 81movdqa xmm10, [rsp + 2*16 + 1*256] 82movdqa xmm14, [rsp + 3*16 + 1*256] 83paddd xmm0, xmm2 84paddd xmm4, xmm6 85paddd xmm8, xmm10 86paddd xmm12, xmm14 87movdqa xmm1, xmm0 88movdqa xmm5, xmm4 89movdqa xmm9, xmm8 90movdqa xmm13, xmm12 91pslld xmm0, 7 92pslld xmm4, 7 93pslld xmm8, 7 94pslld xmm12, 7 95psrld xmm1, 32-7 96psrld xmm5, 32-7 97psrld xmm9, 32-7 98psrld xmm13, 32-7 99pxor xmm0, [rsp + 4*16 + 1*256] 100pxor xmm4, [rsp + 5*16 + 1*256] 101pxor xmm8, [rsp + 6*16 + 1*256] 102pxor xmm12, [rsp + 7*16 + 1*256] 103pxor xmm0, xmm1 104pxor xmm4, xmm5 105pxor xmm8, xmm9 106pxor xmm12, xmm13 107movdqa [rsp + 4*16], xmm0 108movdqa [rsp + 5*16], xmm4 109movdqa [rsp + 6*16], xmm8 110movdqa [rsp + 7*16], xmm12 111movdqa xmm1, xmm0 112movdqa xmm5, xmm4 113movdqa xmm9, xmm8 114movdqa xmm13, xmm12 115paddd xmm0, xmm2 116paddd xmm4, xmm6 117paddd xmm8, xmm10 118paddd xmm12, xmm14 119movdqa xmm3, xmm0 120movdqa xmm7, xmm4 121movdqa xmm11, xmm8 122movdqa xmm15, xmm12 123pslld xmm0, 9 124pslld xmm4, 9 125pslld xmm8, 9 126pslld xmm12, 9 127psrld xmm3, 32-9 128psrld xmm7, 32-9 129psrld xmm11, 32-9 130psrld xmm15, 32-9 131pxor xmm0, [rsp + 8*16 + 1*256] 132pxor xmm4, [rsp + 9*16 + 1*256] 133pxor xmm8, [rsp + 10*16 + 1*256] 134pxor xmm12, [rsp + 11*16 + 1*256] 135pxor xmm0, xmm3 136pxor xmm4, xmm7 137pxor xmm8, xmm11 138pxor xmm12, xmm15 139movdqa [rsp + 8*16], xmm0 140movdqa [rsp + 9*16], xmm4 141movdqa [rsp + 10*16], xmm8 142movdqa [rsp + 11*16], xmm12 143movdqa xmm3, xmm0 144movdqa 
xmm7, xmm4 145movdqa xmm11, xmm8 146movdqa xmm15, xmm12 147paddd xmm0, xmm1 148paddd xmm4, xmm5 149paddd xmm8, xmm9 150paddd xmm12, xmm13 151movdqa xmm1, xmm0 152movdqa xmm5, xmm4 153movdqa xmm9, xmm8 154movdqa xmm13, xmm12 155pslld xmm0, 13 156pslld xmm4, 13 157pslld xmm8, 13 158pslld xmm12, 13 159psrld xmm1, 32-13 160psrld xmm5, 32-13 161psrld xmm9, 32-13 162psrld xmm13, 32-13 163pxor xmm0, [rsp + 12*16 + 1*256] 164pxor xmm4, [rsp + 13*16 + 1*256] 165pxor xmm8, [rsp + 14*16 + 1*256] 166pxor xmm12, [rsp + 15*16 + 1*256] 167pxor xmm0, xmm1 168pxor xmm4, xmm5 169pxor xmm8, xmm9 170pxor xmm12, xmm13 171movdqa [rsp + 12*16], xmm0 172movdqa [rsp + 13*16], xmm4 173movdqa [rsp + 14*16], xmm8 174movdqa [rsp + 15*16], xmm12 175paddd xmm0, xmm3 176paddd xmm4, xmm7 177paddd xmm8, xmm11 178paddd xmm12, xmm15 179movdqa xmm3, xmm0 180movdqa xmm7, xmm4 181movdqa xmm11, xmm8 182movdqa xmm15, xmm12 183pslld xmm0, 18 184pslld xmm4, 18 185pslld xmm8, 18 186pslld xmm12, 18 187psrld xmm3, 32-18 188psrld xmm7, 32-18 189psrld xmm11, 32-18 190psrld xmm15, 32-18 191pxor xmm0, xmm2 192pxor xmm4, xmm6 193pxor xmm8, xmm10 194pxor xmm12, xmm14 195pxor xmm0, xmm3 196pxor xmm4, xmm7 197pxor xmm8, xmm11 198pxor xmm12, xmm15 199movdqa [rsp + 0*16], xmm0 200movdqa [rsp + 1*16], xmm4 201movdqa [rsp + 2*16], xmm8 202movdqa [rsp + 3*16], xmm12 203mov rax, r9 204jmp label2 205labelSSE2_Salsa_Output: 206movdqa xmm0, xmm4 207punpckldq xmm4, xmm5 208movdqa xmm1, xmm6 209punpckldq xmm6, xmm7 210movdqa xmm2, xmm4 211punpcklqdq xmm4, xmm6 212punpckhqdq xmm2, xmm6 213punpckhdq xmm0, xmm5 214punpckhdq xmm1, xmm7 215movdqa xmm6, xmm0 216punpcklqdq xmm0, xmm1 217punpckhqdq xmm6, xmm1 218test rdx, rdx 219jz labelSSE2_Salsa_Output_A3 220test rdx, 15 221jnz labelSSE2_Salsa_Output_A7 222pxor xmm4, [rdx+0*16] 223pxor xmm2, [rdx+4*16] 224pxor xmm0, [rdx+8*16] 225pxor xmm6, [rdx+12*16] 226add rdx, 1*16 227jmp labelSSE2_Salsa_Output_A3 228labelSSE2_Salsa_Output_A7: 229movdqu xmm1, [rdx+0*16] 230pxor xmm4, xmm1 
231movdqu xmm1, [rdx+4*16] 232pxor xmm2, xmm1 233movdqu xmm1, [rdx+8*16] 234pxor xmm0, xmm1 235movdqu xmm1, [rdx+12*16] 236pxor xmm6, xmm1 237add rdx, 1*16 238labelSSE2_Salsa_Output_A3: 239test rcx, 15 240jnz labelSSE2_Salsa_Output_A8 241movdqa [rcx+0*16], xmm4 242movdqa [rcx+4*16], xmm2 243movdqa [rcx+8*16], xmm0 244movdqa [rcx+12*16], xmm6 245jmp labelSSE2_Salsa_Output_A9 246labelSSE2_Salsa_Output_A8: 247movdqu [rcx+0*16], xmm4 248movdqu [rcx+4*16], xmm2 249movdqu [rcx+8*16], xmm0 250movdqu [rcx+12*16], xmm6 251labelSSE2_Salsa_Output_A9: 252add rcx, 1*16 253ret 254label6: 255movdqa xmm0, [rsp + 12*16 + 0*256] 256movdqa xmm4, [rsp + 13*16 + 0*256] 257movdqa xmm8, [rsp + 14*16 + 0*256] 258movdqa xmm12, [rsp + 15*16 + 0*256] 259movdqa xmm2, [rsp + 0*16 + 0*256] 260movdqa xmm6, [rsp + 1*16 + 0*256] 261movdqa xmm10, [rsp + 2*16 + 0*256] 262movdqa xmm14, [rsp + 3*16 + 0*256] 263paddd xmm0, xmm2 264paddd xmm4, xmm6 265paddd xmm8, xmm10 266paddd xmm12, xmm14 267movdqa xmm1, xmm0 268movdqa xmm5, xmm4 269movdqa xmm9, xmm8 270movdqa xmm13, xmm12 271pslld xmm0, 7 272pslld xmm4, 7 273pslld xmm8, 7 274pslld xmm12, 7 275psrld xmm1, 32-7 276psrld xmm5, 32-7 277psrld xmm9, 32-7 278psrld xmm13, 32-7 279pxor xmm0, [rsp + 4*16 + 0*256] 280pxor xmm4, [rsp + 5*16 + 0*256] 281pxor xmm8, [rsp + 6*16 + 0*256] 282pxor xmm12, [rsp + 7*16 + 0*256] 283pxor xmm0, xmm1 284pxor xmm4, xmm5 285pxor xmm8, xmm9 286pxor xmm12, xmm13 287movdqa [rsp + 4*16], xmm0 288movdqa [rsp + 5*16], xmm4 289movdqa [rsp + 6*16], xmm8 290movdqa [rsp + 7*16], xmm12 291movdqa xmm1, xmm0 292movdqa xmm5, xmm4 293movdqa xmm9, xmm8 294movdqa xmm13, xmm12 295paddd xmm0, xmm2 296paddd xmm4, xmm6 297paddd xmm8, xmm10 298paddd xmm12, xmm14 299movdqa xmm3, xmm0 300movdqa xmm7, xmm4 301movdqa xmm11, xmm8 302movdqa xmm15, xmm12 303pslld xmm0, 9 304pslld xmm4, 9 305pslld xmm8, 9 306pslld xmm12, 9 307psrld xmm3, 32-9 308psrld xmm7, 32-9 309psrld xmm11, 32-9 310psrld xmm15, 32-9 311pxor xmm0, [rsp + 8*16 + 0*256] 312pxor xmm4, [rsp 
+ 9*16 + 0*256] 313pxor xmm8, [rsp + 10*16 + 0*256] 314pxor xmm12, [rsp + 11*16 + 0*256] 315pxor xmm0, xmm3 316pxor xmm4, xmm7 317pxor xmm8, xmm11 318pxor xmm12, xmm15 319movdqa [rsp + 8*16], xmm0 320movdqa [rsp + 9*16], xmm4 321movdqa [rsp + 10*16], xmm8 322movdqa [rsp + 11*16], xmm12 323movdqa xmm3, xmm0 324movdqa xmm7, xmm4 325movdqa xmm11, xmm8 326movdqa xmm15, xmm12 327paddd xmm0, xmm1 328paddd xmm4, xmm5 329paddd xmm8, xmm9 330paddd xmm12, xmm13 331movdqa xmm1, xmm0 332movdqa xmm5, xmm4 333movdqa xmm9, xmm8 334movdqa xmm13, xmm12 335pslld xmm0, 13 336pslld xmm4, 13 337pslld xmm8, 13 338pslld xmm12, 13 339psrld xmm1, 32-13 340psrld xmm5, 32-13 341psrld xmm9, 32-13 342psrld xmm13, 32-13 343pxor xmm0, [rsp + 12*16 + 0*256] 344pxor xmm4, [rsp + 13*16 + 0*256] 345pxor xmm8, [rsp + 14*16 + 0*256] 346pxor xmm12, [rsp + 15*16 + 0*256] 347pxor xmm0, xmm1 348pxor xmm4, xmm5 349pxor xmm8, xmm9 350pxor xmm12, xmm13 351movdqa [rsp + 12*16], xmm0 352movdqa [rsp + 13*16], xmm4 353movdqa [rsp + 14*16], xmm8 354movdqa [rsp + 15*16], xmm12 355paddd xmm0, xmm3 356paddd xmm4, xmm7 357paddd xmm8, xmm11 358paddd xmm12, xmm15 359movdqa xmm3, xmm0 360movdqa xmm7, xmm4 361movdqa xmm11, xmm8 362movdqa xmm15, xmm12 363pslld xmm0, 18 364pslld xmm4, 18 365pslld xmm8, 18 366pslld xmm12, 18 367psrld xmm3, 32-18 368psrld xmm7, 32-18 369psrld xmm11, 32-18 370psrld xmm15, 32-18 371pxor xmm0, xmm2 372pxor xmm4, xmm6 373pxor xmm8, xmm10 374pxor xmm12, xmm14 375pxor xmm0, xmm3 376pxor xmm4, xmm7 377pxor xmm8, xmm11 378pxor xmm12, xmm15 379movdqa [rsp + 0*16], xmm0 380movdqa [rsp + 1*16], xmm4 381movdqa [rsp + 2*16], xmm8 382movdqa [rsp + 3*16], xmm12 383label2: 384movdqa xmm0, [rsp + 7*16 + 0*256] 385movdqa xmm4, [rsp + 4*16 + 0*256] 386movdqa xmm8, [rsp + 5*16 + 0*256] 387movdqa xmm12, [rsp + 6*16 + 0*256] 388movdqa xmm2, [rsp + 0*16 + 0*256] 389movdqa xmm6, [rsp + 1*16 + 0*256] 390movdqa xmm10, [rsp + 2*16 + 0*256] 391movdqa xmm14, [rsp + 3*16 + 0*256] 392paddd xmm0, xmm2 393paddd xmm4, xmm6 
394paddd xmm8, xmm10 395paddd xmm12, xmm14 396movdqa xmm1, xmm0 397movdqa xmm5, xmm4 398movdqa xmm9, xmm8 399movdqa xmm13, xmm12 400pslld xmm0, 7 401pslld xmm4, 7 402pslld xmm8, 7 403pslld xmm12, 7 404psrld xmm1, 32-7 405psrld xmm5, 32-7 406psrld xmm9, 32-7 407psrld xmm13, 32-7 408pxor xmm0, [rsp + 13*16 + 0*256] 409pxor xmm4, [rsp + 14*16 + 0*256] 410pxor xmm8, [rsp + 15*16 + 0*256] 411pxor xmm12, [rsp + 12*16 + 0*256] 412pxor xmm0, xmm1 413pxor xmm4, xmm5 414pxor xmm8, xmm9 415pxor xmm12, xmm13 416movdqa [rsp + 13*16], xmm0 417movdqa [rsp + 14*16], xmm4 418movdqa [rsp + 15*16], xmm8 419movdqa [rsp + 12*16], xmm12 420movdqa xmm1, xmm0 421movdqa xmm5, xmm4 422movdqa xmm9, xmm8 423movdqa xmm13, xmm12 424paddd xmm0, xmm2 425paddd xmm4, xmm6 426paddd xmm8, xmm10 427paddd xmm12, xmm14 428movdqa xmm3, xmm0 429movdqa xmm7, xmm4 430movdqa xmm11, xmm8 431movdqa xmm15, xmm12 432pslld xmm0, 9 433pslld xmm4, 9 434pslld xmm8, 9 435pslld xmm12, 9 436psrld xmm3, 32-9 437psrld xmm7, 32-9 438psrld xmm11, 32-9 439psrld xmm15, 32-9 440pxor xmm0, [rsp + 10*16 + 0*256] 441pxor xmm4, [rsp + 11*16 + 0*256] 442pxor xmm8, [rsp + 8*16 + 0*256] 443pxor xmm12, [rsp + 9*16 + 0*256] 444pxor xmm0, xmm3 445pxor xmm4, xmm7 446pxor xmm8, xmm11 447pxor xmm12, xmm15 448movdqa [rsp + 10*16], xmm0 449movdqa [rsp + 11*16], xmm4 450movdqa [rsp + 8*16], xmm8 451movdqa [rsp + 9*16], xmm12 452movdqa xmm3, xmm0 453movdqa xmm7, xmm4 454movdqa xmm11, xmm8 455movdqa xmm15, xmm12 456paddd xmm0, xmm1 457paddd xmm4, xmm5 458paddd xmm8, xmm9 459paddd xmm12, xmm13 460movdqa xmm1, xmm0 461movdqa xmm5, xmm4 462movdqa xmm9, xmm8 463movdqa xmm13, xmm12 464pslld xmm0, 13 465pslld xmm4, 13 466pslld xmm8, 13 467pslld xmm12, 13 468psrld xmm1, 32-13 469psrld xmm5, 32-13 470psrld xmm9, 32-13 471psrld xmm13, 32-13 472pxor xmm0, [rsp + 7*16 + 0*256] 473pxor xmm4, [rsp + 4*16 + 0*256] 474pxor xmm8, [rsp + 5*16 + 0*256] 475pxor xmm12, [rsp + 6*16 + 0*256] 476pxor xmm0, xmm1 477pxor xmm4, xmm5 478pxor xmm8, xmm9 479pxor xmm12, 
xmm13 480movdqa [rsp + 7*16], xmm0 481movdqa [rsp + 4*16], xmm4 482movdqa [rsp + 5*16], xmm8 483movdqa [rsp + 6*16], xmm12 484paddd xmm0, xmm3 485paddd xmm4, xmm7 486paddd xmm8, xmm11 487paddd xmm12, xmm15 488movdqa xmm3, xmm0 489movdqa xmm7, xmm4 490movdqa xmm11, xmm8 491movdqa xmm15, xmm12 492pslld xmm0, 18 493pslld xmm4, 18 494pslld xmm8, 18 495pslld xmm12, 18 496psrld xmm3, 32-18 497psrld xmm7, 32-18 498psrld xmm11, 32-18 499psrld xmm15, 32-18 500pxor xmm0, xmm2 501pxor xmm4, xmm6 502pxor xmm8, xmm10 503pxor xmm12, xmm14 504pxor xmm0, xmm3 505pxor xmm4, xmm7 506pxor xmm8, xmm11 507pxor xmm12, xmm15 508movdqa [rsp + 0*16], xmm0 509movdqa [rsp + 1*16], xmm4 510movdqa [rsp + 2*16], xmm8 511movdqa [rsp + 3*16], xmm12 512sub eax, 2 513jnz label6 514movdqa xmm4, [rsp + 0*16 + 256] 515paddd xmm4, [rsp + 0*16] 516movdqa xmm5, [rsp + 13*16 + 256] 517paddd xmm5, [rsp + 13*16] 518movdqa xmm6, [rsp + 10*16 + 256] 519paddd xmm6, [rsp + 10*16] 520movdqa xmm7, [rsp + 7*16 + 256] 521paddd xmm7, [rsp + 7*16] 522call labelSSE2_Salsa_Output 523movdqa xmm4, [rsp + 4*16 + 256] 524paddd xmm4, [rsp + 4*16] 525movdqa xmm5, [rsp + 1*16 + 256] 526paddd xmm5, [rsp + 1*16] 527movdqa xmm6, [rsp + 14*16 + 256] 528paddd xmm6, [rsp + 14*16] 529movdqa xmm7, [rsp + 11*16 + 256] 530paddd xmm7, [rsp + 11*16] 531call labelSSE2_Salsa_Output 532movdqa xmm4, [rsp + 8*16 + 256] 533paddd xmm4, [rsp + 8*16] 534movdqa xmm5, [rsp + 5*16 + 256] 535paddd xmm5, [rsp + 5*16] 536movdqa xmm6, [rsp + 2*16 + 256] 537paddd xmm6, [rsp + 2*16] 538movdqa xmm7, [rsp + 15*16 + 256] 539paddd xmm7, [rsp + 15*16] 540call labelSSE2_Salsa_Output 541movdqa xmm4, [rsp + 12*16 + 256] 542paddd xmm4, [rsp + 12*16] 543movdqa xmm5, [rsp + 9*16 + 256] 544paddd xmm5, [rsp + 9*16] 545movdqa xmm6, [rsp + 6*16 + 256] 546paddd xmm6, [rsp + 6*16] 547movdqa xmm7, [rsp + 3*16 + 256] 548paddd xmm7, [rsp + 3*16] 549call labelSSE2_Salsa_Output 550test rdx, rdx 551jz label9 552add rdx, 12*16 553label9: 554add rcx, 12*16 555sub r8, 4 556cmp r8, 
4 557jge label1 558label5: 559sub r8, 1 560jl label4 561movdqa xmm0, [r10 + 0*16] 562movdqa xmm1, [r10 + 1*16] 563movdqa xmm2, [r10 + 2*16] 564movdqa xmm3, [r10 + 3*16] 565mov rax, r9 566label0: 567movdqa xmm4, xmm3 568paddd xmm4, xmm0 569movdqa xmm5, xmm4 570pslld xmm4, 7 571psrld xmm5, 32-7 572pxor xmm1, xmm4 573pxor xmm1, xmm5 574movdqa xmm4, xmm0 575paddd xmm4, xmm1 576movdqa xmm5, xmm4 577pslld xmm4, 9 578psrld xmm5, 32-9 579pxor xmm2, xmm4 580pxor xmm2, xmm5 581movdqa xmm4, xmm1 582paddd xmm4, xmm2 583movdqa xmm5, xmm4 584pslld xmm4, 13 585psrld xmm5, 32-13 586pxor xmm3, xmm4 587pxor xmm3, xmm5 588movdqa xmm4, xmm2 589paddd xmm4, xmm3 590movdqa xmm5, xmm4 591pslld xmm4, 18 592psrld xmm5, 32-18 593pxor xmm0, xmm4 594pxor xmm0, xmm5 595pshufd xmm1, xmm1, 2*64+1*16+0*4+3 596pshufd xmm2, xmm2, 1*64+0*16+3*4+2 597pshufd xmm3, xmm3, 0*64+3*16+2*4+1 598movdqa xmm4, xmm1 599paddd xmm4, xmm0 600movdqa xmm5, xmm4 601pslld xmm4, 7 602psrld xmm5, 32-7 603pxor xmm3, xmm4 604pxor xmm3, xmm5 605movdqa xmm4, xmm0 606paddd xmm4, xmm3 607movdqa xmm5, xmm4 608pslld xmm4, 9 609psrld xmm5, 32-9 610pxor xmm2, xmm4 611pxor xmm2, xmm5 612movdqa xmm4, xmm3 613paddd xmm4, xmm2 614movdqa xmm5, xmm4 615pslld xmm4, 13 616psrld xmm5, 32-13 617pxor xmm1, xmm4 618pxor xmm1, xmm5 619movdqa xmm4, xmm2 620paddd xmm4, xmm1 621movdqa xmm5, xmm4 622pslld xmm4, 18 623psrld xmm5, 32-18 624pxor xmm0, xmm4 625pxor xmm0, xmm5 626pshufd xmm1, xmm1, 0*64+3*16+2*4+1 627pshufd xmm2, xmm2, 1*64+0*16+3*4+2 628pshufd xmm3, xmm3, 2*64+1*16+0*4+3 629sub eax, 2 630jnz label0 631paddd xmm0, [r10 + 0*16] 632paddd xmm1, [r10 + 1*16] 633paddd xmm2, [r10 + 2*16] 634paddd xmm3, [r10 + 3*16] 635add dword ptr [r10 + 8*4], 1 636adc dword ptr [r10 + 5*4], 0 637pcmpeqb xmm6, xmm6 638psrlq xmm6, 32 639pshufd xmm7, xmm6, 0*64+1*16+2*4+3 640movdqa xmm4, xmm0 641movdqa xmm5, xmm3 642pand xmm0, xmm7 643pand xmm4, xmm6 644pand xmm3, xmm6 645pand xmm5, xmm7 646por xmm4, xmm5 647movdqa xmm5, xmm1 648pand xmm1, xmm7 649pand xmm5, 
xmm6 650por xmm0, xmm5 651pand xmm6, xmm2 652pand xmm2, xmm7 653por xmm1, xmm6 654por xmm2, xmm3 655movdqa xmm5, xmm4 656movdqa xmm6, xmm0 657shufpd xmm4, xmm1, 2 658shufpd xmm0, xmm2, 2 659shufpd xmm1, xmm5, 2 660shufpd xmm2, xmm6, 2 661test rdx, rdx 662jz labelSSE2_Salsa_Output_B3 663test rdx, 15 664jnz labelSSE2_Salsa_Output_B7 665pxor xmm4, [rdx+0*16] 666pxor xmm0, [rdx+1*16] 667pxor xmm1, [rdx+2*16] 668pxor xmm2, [rdx+3*16] 669add rdx, 4*16 670jmp labelSSE2_Salsa_Output_B3 671labelSSE2_Salsa_Output_B7: 672movdqu xmm3, [rdx+0*16] 673pxor xmm4, xmm3 674movdqu xmm3, [rdx+1*16] 675pxor xmm0, xmm3 676movdqu xmm3, [rdx+2*16] 677pxor xmm1, xmm3 678movdqu xmm3, [rdx+3*16] 679pxor xmm2, xmm3 680add rdx, 4*16 681labelSSE2_Salsa_Output_B3: 682test rcx, 15 683jnz labelSSE2_Salsa_Output_B8 684movdqa [rcx+0*16], xmm4 685movdqa [rcx+1*16], xmm0 686movdqa [rcx+2*16], xmm1 687movdqa [rcx+3*16], xmm2 688jmp labelSSE2_Salsa_Output_B9 689labelSSE2_Salsa_Output_B8: 690movdqu [rcx+0*16], xmm4 691movdqu [rcx+1*16], xmm0 692movdqu [rcx+2*16], xmm1 693movdqu [rcx+3*16], xmm2 694labelSSE2_Salsa_Output_B9: 695add rcx, 4*16 696jmp label5 697label4: 698movdqa xmm6, [rsp + 0200h] 699movdqa xmm7, [rsp + 0210h] 700movdqa xmm8, [rsp + 0220h] 701movdqa xmm9, [rsp + 0230h] 702movdqa xmm10, [rsp + 0240h] 703movdqa xmm11, [rsp + 0250h] 704movdqa xmm12, [rsp + 0260h] 705movdqa xmm13, [rsp + 0270h] 706movdqa xmm14, [rsp + 0280h] 707movdqa xmm15, [rsp + 0290h] 708add rsp, 10*16 + 32*16 + 8 709ret 710Salsa20_OperateKeystream ENDP 711 712ALIGN 8 713Sosemanuk_OperateKeystream PROC FRAME 714rex_push_reg rsi 715push_reg rdi 716alloc_stack(80*4*2+12*4+8*8 + 2*16+8) 717save_xmm128 xmm6, 02f0h 718save_xmm128 xmm7, 0300h 719.endprolog 720mov rdi, r8 721mov rax, r9 722mov QWORD PTR [rsp+1*8], rdi 723mov QWORD PTR [rsp+2*8], rdx 724mov QWORD PTR [rsp+6*8], rax 725lea rcx, [4*rcx+rcx] 726lea rsi, [4*rcx] 727mov QWORD PTR [rsp+3*8], rsi 728movdqa xmm0, [rax+0*16] 729movdqa [rsp + 8*8+0*16], xmm0 730movdqa xmm0, 
[rax+1*16] 731movdqa [rsp + 8*8+1*16], xmm0 732movq xmm0, QWORD PTR [rax+2*16] 733movq QWORD PTR [rsp + 8*8+2*16], xmm0 734psrlq xmm0, 32 735movd r10d, xmm0 736mov ecx, [rax+10*4] 737mov edx, [rax+11*4] 738pcmpeqb xmm7, xmm7 739label2: 740lea rdi, [rsp + 8*8 + 12*4] 741mov rax, 80 742cmp rsi, 80 743cmovg rsi, rax 744mov QWORD PTR [rsp+7*8], rsi 745lea rsi, [rdi+rsi] 746mov QWORD PTR [rsp+4*8], rsi 747lea rsi, s_sosemanukMulTables 748label0: 749mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4] 750mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax 751rol eax, 8 752lea r11d, [r10d + edx] 753xor r11d, ecx 754mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d 755mov r11d, 1 756and r11d, edx 757neg r11d 758and r11d, r10d 759xor r10d, eax 760movzx eax, al 761xor r10d, [rsi+rax*4] 762mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4] 763xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4] 764add ecx, r11d 765movzx r11d, al 766shr eax, 8 767xor r10d, [rsi+1024+r11*4] 768xor r10d, eax 769imul edx, 54655307h 770rol edx, 7 771mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d 772mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4] 773mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax 774rol eax, 8 775lea r11d, [r10d + ecx] 776xor r11d, edx 777mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d 778mov r11d, 1 779and r11d, ecx 780neg r11d 781and r11d, r10d 782xor r10d, eax 783movzx eax, al 784xor r10d, [rsi+rax*4] 785mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4] 786xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4] 787add edx, r11d 788movzx r11d, al 789shr eax, 8 790xor r10d, [rsi+1024+r11*4] 791xor r10d, eax 792imul ecx, 54655307h 793rol ecx, 7 794mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d 795mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4] 796mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax 797rol eax, 8 798lea r11d, [r10d + edx] 799xor r11d, ecx 800mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d 801mov r11d, 1 
802and r11d, edx 803neg r11d 804and r11d, r10d 805xor r10d, eax 806movzx eax, al 807xor r10d, [rsi+rax*4] 808mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4] 809xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4] 810add ecx, r11d 811movzx r11d, al 812shr eax, 8 813xor r10d, [rsi+1024+r11*4] 814xor r10d, eax 815imul edx, 54655307h 816rol edx, 7 817mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d 818mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4] 819mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax 820rol eax, 8 821lea r11d, [r10d + ecx] 822xor r11d, edx 823mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d 824mov r11d, 1 825and r11d, ecx 826neg r11d 827and r11d, r10d 828xor r10d, eax 829movzx eax, al 830xor r10d, [rsi+rax*4] 831mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4] 832xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4] 833add edx, r11d 834movzx r11d, al 835shr eax, 8 836xor r10d, [rsi+1024+r11*4] 837xor r10d, eax 838imul ecx, 54655307h 839rol ecx, 7 840mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d 841mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4] 842mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax 843rol eax, 8 844lea r11d, [r10d + edx] 845xor r11d, ecx 846mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d 847mov r11d, 1 848and r11d, edx 849neg r11d 850and r11d, r10d 851xor r10d, eax 852movzx eax, al 853xor r10d, [rsi+rax*4] 854mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4] 855xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4] 856add ecx, r11d 857movzx r11d, al 858shr eax, 8 859xor r10d, [rsi+1024+r11*4] 860xor r10d, eax 861imul edx, 54655307h 862rol edx, 7 863mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d 864mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4] 865mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax 866rol eax, 8 867lea r11d, [r10d + ecx] 868xor r11d, edx 869mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d 870mov r11d, 1 871and r11d, ecx 872neg r11d 873and r11d, r10d 
874xor r10d, eax 875movzx eax, al 876xor r10d, [rsi+rax*4] 877mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4] 878xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4] 879add edx, r11d 880movzx r11d, al 881shr eax, 8 882xor r10d, [rsi+1024+r11*4] 883xor r10d, eax 884imul ecx, 54655307h 885rol ecx, 7 886mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d 887mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4] 888mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax 889rol eax, 8 890lea r11d, [r10d + edx] 891xor r11d, ecx 892mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d 893mov r11d, 1 894and r11d, edx 895neg r11d 896and r11d, r10d 897xor r10d, eax 898movzx eax, al 899xor r10d, [rsi+rax*4] 900mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4] 901xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4] 902add ecx, r11d 903movzx r11d, al 904shr eax, 8 905xor r10d, [rsi+1024+r11*4] 906xor r10d, eax 907imul edx, 54655307h 908rol edx, 7 909mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d 910mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4] 911mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax 912rol eax, 8 913lea r11d, [r10d + ecx] 914xor r11d, edx 915mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d 916mov r11d, 1 917and r11d, ecx 918neg r11d 919and r11d, r10d 920xor r10d, eax 921movzx eax, al 922xor r10d, [rsi+rax*4] 923mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4] 924xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4] 925add edx, r11d 926movzx r11d, al 927shr eax, 8 928xor r10d, [rsi+1024+r11*4] 929xor r10d, eax 930imul ecx, 54655307h 931rol ecx, 7 932mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d 933mov eax, [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4] 934mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax 935rol eax, 8 936lea r11d, [r10d + edx] 937xor r11d, ecx 938mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d 939mov r11d, 1 940and r11d, edx 941neg r11d 942and r11d, r10d 943xor r10d, eax 944movzx eax, al 945xor r10d, 
[rsi+rax*4] 946mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4] 947xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4] 948add ecx, r11d 949movzx r11d, al 950shr eax, 8 951xor r10d, [rsi+1024+r11*4] 952xor r10d, eax 953imul edx, 54655307h 954rol edx, 7 955mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d 956mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4] 957mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax 958rol eax, 8 959lea r11d, [r10d + ecx] 960xor r11d, edx 961mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d 962mov r11d, 1 963and r11d, ecx 964neg r11d 965and r11d, r10d 966xor r10d, eax 967movzx eax, al 968xor r10d, [rsi+rax*4] 969mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4] 970xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4] 971add edx, r11d 972movzx r11d, al 973shr eax, 8 974xor r10d, [rsi+1024+r11*4] 975xor r10d, eax 976imul ecx, 54655307h 977rol ecx, 7 978mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d 979mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4] 980mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax 981rol eax, 8 982lea r11d, [r10d + edx] 983xor r11d, ecx 984mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d 985mov r11d, 1 986and r11d, edx 987neg r11d 988and r11d, r10d 989xor r10d, eax 990movzx eax, al 991xor r10d, [rsi+rax*4] 992mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4] 993xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4] 994add ecx, r11d 995movzx r11d, al 996shr eax, 8 997xor r10d, [rsi+1024+r11*4] 998xor r10d, eax 999imul edx, 54655307h 1000rol edx, 7 1001mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d 1002mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4] 1003mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax 1004rol eax, 8 1005lea r11d, [r10d + ecx] 1006xor r11d, edx 1007mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4], r11d 1008mov r11d, 1 1009and r11d, ecx 1010neg r11d 1011and r11d, r10d 1012xor r10d, eax 1013movzx eax, al 1014xor r10d, 
[rsi+rax*4] 1015mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4] 1016xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4] 1017add edx, r11d 1018movzx r11d, al 1019shr eax, 8 1020xor r10d, [rsi+1024+r11*4] 1021xor r10d, eax 1022imul ecx, 54655307h 1023rol ecx, 7 1024mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d 1025mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4] 1026mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax 1027rol eax, 8 1028lea r11d, [r10d + edx] 1029xor r11d, ecx 1030mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d 1031mov r11d, 1 1032and r11d, edx 1033neg r11d 1034and r11d, r10d 1035xor r10d, eax 1036movzx eax, al 1037xor r10d, [rsi+rax*4] 1038mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4] 1039xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4] 1040add ecx, r11d 1041movzx r11d, al 1042shr eax, 8 1043xor r10d, [rsi+1024+r11*4] 1044xor r10d, eax 1045imul edx, 54655307h 1046rol edx, 7 1047mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d 1048mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4] 1049mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax 1050rol eax, 8 1051lea r11d, [r10d + ecx] 1052xor r11d, edx 1053mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d 1054mov r11d, 1 1055and r11d, ecx 1056neg r11d 1057and r11d, r10d 1058xor r10d, eax 1059movzx eax, al 1060xor r10d, [rsi+rax*4] 1061mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4] 1062xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4] 1063add edx, r11d 1064movzx r11d, al 1065shr eax, 8 1066xor r10d, [rsi+1024+r11*4] 1067xor r10d, eax 1068imul ecx, 54655307h 1069rol ecx, 7 1070mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d 1071mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4] 1072mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax 1073rol eax, 8 1074lea r11d, [r10d + edx] 1075xor r11d, ecx 1076mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d 1077mov r11d, 1 1078and r11d, edx 1079neg r11d 
1080and r11d, r10d 1081xor r10d, eax 1082movzx eax, al 1083xor r10d, [rsi+rax*4] 1084mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4] 1085xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4] 1086add ecx, r11d 1087movzx r11d, al 1088shr eax, 8 1089xor r10d, [rsi+1024+r11*4] 1090xor r10d, eax 1091imul edx, 54655307h 1092rol edx, 7 1093mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d 1094mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4] 1095mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax 1096rol eax, 8 1097lea r11d, [r10d + ecx] 1098xor r11d, edx 1099mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d 1100mov r11d, 1 1101and r11d, ecx 1102neg r11d 1103and r11d, r10d 1104xor r10d, eax 1105movzx eax, al 1106xor r10d, [rsi+rax*4] 1107mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4] 1108xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4] 1109add edx, r11d 1110movzx r11d, al 1111shr eax, 8 1112xor r10d, [rsi+1024+r11*4] 1113xor r10d, eax 1114imul ecx, 54655307h 1115rol ecx, 7 1116mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d 1117mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4] 1118mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax 1119rol eax, 8 1120lea r11d, [r10d + edx] 1121xor r11d, ecx 1122mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d 1123mov r11d, 1 1124and r11d, edx 1125neg r11d 1126and r11d, r10d 1127xor r10d, eax 1128movzx eax, al 1129xor r10d, [rsi+rax*4] 1130mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4] 1131xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4] 1132add ecx, r11d 1133movzx r11d, al 1134shr eax, 8 1135xor r10d, [rsi+1024+r11*4] 1136xor r10d, eax 1137imul edx, 54655307h 1138rol edx, 7 1139mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d 1140mov eax, [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4] 1141mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax 1142rol eax, 8 1143lea r11d, [r10d + ecx] 1144xor r11d, edx 1145mov [rdi + (((17)-((17)/(4))*(4))*20 + 
(17/4)) * 4], r11d 1146mov r11d, 1 1147and r11d, ecx 1148neg r11d 1149and r11d, r10d 1150xor r10d, eax 1151movzx eax, al 1152xor r10d, [rsi+rax*4] 1153mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4] 1154xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4] 1155add edx, r11d 1156movzx r11d, al 1157shr eax, 8 1158xor r10d, [rsi+1024+r11*4] 1159xor r10d, eax 1160imul ecx, 54655307h 1161rol ecx, 7 1162mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d 1163mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4] 1164mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax 1165rol eax, 8 1166lea r11d, [r10d + edx] 1167xor r11d, ecx 1168mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d 1169mov r11d, 1 1170and r11d, edx 1171neg r11d 1172and r11d, r10d 1173xor r10d, eax 1174movzx eax, al 1175xor r10d, [rsi+rax*4] 1176mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4] 1177xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4] 1178add ecx, r11d 1179movzx r11d, al 1180shr eax, 8 1181xor r10d, [rsi+1024+r11*4] 1182xor r10d, eax 1183imul edx, 54655307h 1184rol edx, 7 1185mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d 1186mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4] 1187mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax 1188rol eax, 8 1189lea r11d, [r10d + ecx] 1190xor r11d, edx 1191mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d 1192mov r11d, 1 1193and r11d, ecx 1194neg r11d 1195and r11d, r10d 1196xor r10d, eax 1197movzx eax, al 1198xor r10d, [rsi+rax*4] 1199mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4] 1200xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4] 1201add edx, r11d 1202movzx r11d, al 1203shr eax, 8 1204xor r10d, [rsi+1024+r11*4] 1205xor r10d, eax 1206imul ecx, 54655307h 1207rol ecx, 7 1208mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d 1209add rdi, 5*4 1210cmp rdi, QWORD PTR [rsp+4*8] 1211jne label0 1212mov rax, QWORD PTR [rsp+2*8] 1213mov r11, QWORD PTR [rsp+1*8] 1214lea rdi, [rsp + 8*8 + 12*4] 
1215mov rsi, QWORD PTR [rsp+7*8] 1216label1: 1217movdqa xmm0, [rdi+0*20*4] 1218movdqa xmm2, [rdi+2*20*4] 1219movdqa xmm3, [rdi+3*20*4] 1220movdqa xmm1, [rdi+1*20*4] 1221movdqa xmm4, xmm0 1222pand xmm0, xmm2 1223pxor xmm0, xmm3 1224pxor xmm2, xmm1 1225pxor xmm2, xmm0 1226por xmm3, xmm4 1227pxor xmm3, xmm1 1228pxor xmm4, xmm2 1229movdqa xmm1, xmm3 1230por xmm3, xmm4 1231pxor xmm3, xmm0 1232pand xmm0, xmm1 1233pxor xmm4, xmm0 1234pxor xmm1, xmm3 1235pxor xmm1, xmm4 1236pxor xmm4, xmm7 1237pxor xmm2, [rdi+80*4] 1238pxor xmm3, [rdi+80*5] 1239pxor xmm1, [rdi+80*6] 1240pxor xmm4, [rdi+80*7] 1241cmp rsi, 16 1242jl label4 1243movdqa xmm6, xmm2 1244punpckldq xmm2, xmm3 1245movdqa xmm5, xmm1 1246punpckldq xmm1, xmm4 1247movdqa xmm0, xmm2 1248punpcklqdq xmm2, xmm1 1249punpckhqdq xmm0, xmm1 1250punpckhdq xmm6, xmm3 1251punpckhdq xmm5, xmm4 1252movdqa xmm3, xmm6 1253punpcklqdq xmm6, xmm5 1254punpckhqdq xmm3, xmm5 1255test rax, rax 1256jz labelSSE2_Sosemanuk_Output3 1257test rax, 15 1258jnz labelSSE2_Sosemanuk_Output7 1259pxor xmm2, [rax+0*16] 1260pxor xmm0, [rax+1*16] 1261pxor xmm6, [rax+2*16] 1262pxor xmm3, [rax+3*16] 1263add rax, 4*16 1264jmp labelSSE2_Sosemanuk_Output3 1265labelSSE2_Sosemanuk_Output7: 1266movdqu xmm1, [rax+0*16] 1267pxor xmm2, xmm1 1268movdqu xmm1, [rax+1*16] 1269pxor xmm0, xmm1 1270movdqu xmm1, [rax+2*16] 1271pxor xmm6, xmm1 1272movdqu xmm1, [rax+3*16] 1273pxor xmm3, xmm1 1274add rax, 4*16 1275labelSSE2_Sosemanuk_Output3: 1276test r11, 15 1277jnz labelSSE2_Sosemanuk_Output8 1278movdqa [r11+0*16], xmm2 1279movdqa [r11+1*16], xmm0 1280movdqa [r11+2*16], xmm6 1281movdqa [r11+3*16], xmm3 1282jmp labelSSE2_Sosemanuk_Output9 1283labelSSE2_Sosemanuk_Output8: 1284movdqu [r11+0*16], xmm2 1285movdqu [r11+1*16], xmm0 1286movdqu [r11+2*16], xmm6 1287movdqu [r11+3*16], xmm3 1288labelSSE2_Sosemanuk_Output9: 1289add r11, 4*16 1290add rdi, 4*4 1291sub rsi, 16 1292jnz label1 1293mov rsi, QWORD PTR [rsp+3*8] 1294sub rsi, 80 1295jz label6 1296mov QWORD PTR [rsp+3*8], rsi 
; ---------------------------------------------------------------------
; Tail of Sosemanuk_OperateKeystream (the PROC line and the labels
; label0/label1/label2 referenced here are defined earlier in the file).
; ---------------------------------------------------------------------
    mov     QWORD PTR [rsp+2*8], rax    ; spill rax / r11 to their stack slots
    mov     QWORD PTR [rsp+1*8], r11
    jmp     label2

label4:
    ; Each pass moves the low dword of xmm2, xmm3, xmm1, xmm4 to [r11];
    ; when rax is non-zero the dwords at [rax] are XORed in first.
    test    rax, rax
    jz      label5
    movd    xmm0, dword ptr [rax+0*4]
    pxor    xmm2, xmm0
    movd    xmm0, dword ptr [rax+1*4]
    pxor    xmm3, xmm0
    movd    xmm0, dword ptr [rax+2*4]
    pxor    xmm1, xmm0
    movd    xmm0, dword ptr [rax+3*4]
    pxor    xmm4, xmm0
    add     rax, 16
label5:
    movd    dword ptr [r11+0*4], xmm2
    movd    dword ptr [r11+1*4], xmm3
    movd    dword ptr [r11+2*4], xmm1
    movd    dword ptr [r11+3*4], xmm4
    sub     rsi, 4                      ; rsi counts remaining dwords
    jz      label6
    add     r11, 16
    psrldq  xmm2, 4                     ; expose the next dword of each register
    psrldq  xmm3, 4
    psrldq  xmm1, 4
    psrldq  xmm4, 4
    jmp     label4

label6:
    ; Copy 40 bytes from the stack area at rsp+8*8 to [r10], then store
    ; ecx / edx into dwords 10 and 11 of the same destination.
    mov     r10, QWORD PTR [rsp+6*8]
    movdqa  xmm0, [rsp + 8*8+0*16]
    movdqa  [r10+0*16], xmm0
    movdqa  xmm0, [rsp + 8*8+1*16]
    movdqa  [r10+1*16], xmm0
    movq    xmm0, QWORD PTR [rsp + 8*8+2*16]
    movq    QWORD PTR [r10+2*16], xmm0
    mov     [r10+10*4], ecx
    mov     [r10+11*4], edx
    ; Epilogue: restore the saved xmm6/xmm7, release the frame, pop rdi/rsi.
    movdqa  xmm6, [rsp + 02f0h]
    movdqa  xmm7, [rsp + 0300h]
    add     rsp, 80*4*2+12*4+8*8 + 2*16+8
    pop     rdi
    pop     rsi
    ret
Sosemanuk_OperateKeystream ENDP

;-----------------------------------------------------------------------
; Panama_SSE2_Pull
;
; In:    rcx = iteration count; the loop runs once per 32 bytes of output
;        rdx = state block, accessed with movdqa (must be 16-byte aligned):
;                dwords 0..15  working words, held in xmm0..xmm3
;                dword  16     seventeenth working word, held in eax
;                dword  17     running position (read here, not written back)
;                +20*4         32 rows of 32 bytes, row offset taken
;                              modulo 32 rows (mask 31*32)
;        r8  = output pointer, or 0 to skip output; advanced by 32 per pass
;        r9  = optional input XORed into the output, or 0; advanced by 32
;              per pass when used
; Saved: rdi, xmm6, xmm7 (pushed/spilled in the prologue, restored on exit)
; Clobb: rax, rcx, r8, r9, r10, r11, xmm0-xmm5, flags
;-----------------------------------------------------------------------
Panama_SSE2_Pull PROC FRAME
    rex_push_reg rdi
    alloc_stack(2*16)
    save_xmm128 xmm6, 0h
    save_xmm128 xmm7, 10h
    .endprolog

    shl     rcx, 5                              ; rcx = count * 32
    jz      pull_done                           ; shifted count is zero: nothing to do
    mov     r10d, [rdx+4*17]                    ; r10 = position (zero-extended)
    add     rcx, r10
    mov     rdi, rcx                            ; rdi = position at which the loop ends

    ; Bring the 17 working words into registers.
    mov     eax,  dword ptr [rdx+4*16]
    movdqa  xmm3, xmmword ptr [rdx+3*16]
    movdqa  xmm2, xmmword ptr [rdx+2*16]
    movdqa  xmm1, xmmword ptr [rdx+1*16]
    movdqa  xmm0, xmmword ptr [rdx+0*16]

pull_iterate:
    ; xmm5 = xmm2 moved down one dword with xmm3's low dword on top;
    ; xmm6 = xmm3 moved down one dword with eax on top.
    movdqa  xmm6, xmm2
    movss   xmm6, xmm3
    pshufd  xmm5, xmm6, 0*64+3*16+2*4+1
    movd    xmm6, eax
    movdqa  xmm7, xmm3
    movss   xmm7, xmm6
    pshufd  xmm6, xmm7, 0*64+3*16+2*4+1

    ; eax ^= (~low dword of xmm2) | (low dword of xmm3)
    movd    ecx, xmm2
    not     ecx
    movd    r11d, xmm3
    or      ecx, r11d
    xor     eax, ecx

    ; Four groups follow. Each builds xmm7 = (~a | b) ^ c (pcmpeqb yields
    ; all ones, so the first pxor is a NOT), then peels off its four dwords
    ; in turn (pshuflw swaps the two low dwords, punpckhqdq brings the high
    ; qword down), rotates each one and stores it into the state block.

    ; group 1: a = xmm1, b = xmm2, c = xmm3
    pcmpeqb xmm7, xmm7
    pxor    xmm7, xmm1
    por     xmm7, xmm2
    pxor    xmm7, xmm3
    movd    ecx, xmm7
    rol     ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32))     ; rol 15
    mov     [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 13
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32))     ; rol 4
    mov     [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 1
    punpckhqdq xmm7, xmm7
    movd    ecx, xmm7
    rol     ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32))     ; rol 2
    mov     [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 6
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32))   ; rol 9
    mov     [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx         ; word 11

    ; group 2: a = xmm0, b = xmm1, c = xmm2
    pcmpeqb xmm7, xmm7
    pxor    xmm7, xmm0
    por     xmm7, xmm1
    pxor    xmm7, xmm2
    movd    ecx, xmm7
    rol     ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32))     ; rol 23
    mov     [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 10
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32))     ; rol 27
    mov     [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 15
    punpckhqdq xmm7, xmm7
    movd    ecx, xmm7
    rol     ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32))   ; rol 8
    mov     [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx         ; word 3
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32))   ; rol 3
    mov     [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx         ; word 8

    ; group 3: a = xmm6, b = xmm0, c = xmm1
    pcmpeqb xmm7, xmm7
    pxor    xmm7, xmm6
    por     xmm7, xmm0
    pxor    xmm7, xmm1
    movd    ecx, xmm7
    rol     ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32))     ; rol 24
    mov     [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 7
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32))     ; rol 1
    mov     [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 12
    punpckhqdq xmm7, xmm7
    movd    ecx, xmm7
    rol     ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32))   ; rol 10
    mov     [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx         ; word 0
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32))   ; rol 28
    mov     [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx         ; word 5

    ; group 4: a = xmm5, b = xmm6, c = xmm0
    pcmpeqb xmm7, xmm7
    pxor    xmm7, xmm5
    por     xmm7, xmm6
    pxor    xmm7, xmm0
    movd    ecx, xmm7
    rol     ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32))     ; rol 6
    mov     [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 4
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32))     ; rol 21
    mov     [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx          ; word 9
    punpckhqdq xmm7, xmm7
    movd    ecx, xmm7
    rol     ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32))   ; rol 13
    mov     [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx         ; word 14
    pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
    movd    ecx, xmm7
    rol     ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32))   ; rol 14
    mov     [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx         ; word 2

    ; Regroup the pre-update register words. xmm3 / xmm1 feed the row update
    ; below; xmm4 / xmm2 feed the output. The second triple overwrites xmm2,
    ; so it must follow the first.
    movdqa  xmm4, xmm3
    punpcklqdq xmm3, xmm2
    punpckhdq xmm4, xmm2
    movdqa  xmm2, xmm1
    punpcklqdq xmm1, xmm0
    punpckhdq xmm2, xmm0

    ; ---- optional 32-byte output ------------------------------------
    test    r8, r8
    jz      pull_update_rows                    ; no output buffer supplied
    movdqa  xmm6, xmm4
    punpcklqdq xmm4, xmm2
    punpckhqdq xmm6, xmm2                       ; xmm4:xmm6 = 32 bytes to emit
    test    r9, 15
    jnz     pull_xor_unaligned
    test    r9, r9
    jz      pull_emit                           ; no input to mix in
    pxor    xmm4, xmmword ptr [r9]              ; aligned input
    pxor    xmm6, xmmword ptr [r9+16]
    add     r9, 32
    jmp     pull_emit
pull_xor_unaligned:
    movdqu  xmm0, xmmword ptr [r9]              ; xmm0/xmm2 are free here; they
    movdqu  xmm2, xmmword ptr [r9+16]           ; are reloaded from the state below
    pxor    xmm4, xmm0
    pxor    xmm6, xmm2
    add     r9, 32
pull_emit:
    test    r8, 15
    jnz     pull_emit_unaligned
    movdqa  xmmword ptr [r8], xmm4
    movdqa  xmmword ptr [r8+16], xmm6
    add     r8, 32
    jmp     pull_update_rows
pull_emit_unaligned:
    movdqu  xmmword ptr [r8], xmm4
    movdqu  xmmword ptr [r8+16], xmm6
    add     r8, 32

pull_update_rows:
    ; rcx = row offset one row ahead of the position, r11 = eight rows ahead.
    lea     rcx, [r10 + 32]
    and     rcx, 31*32
    lea     r11, [r10 + 8*32]
    and     r11, 31*32

    movdqa  xmm0, xmmword ptr [rdx + rcx + 20*4]
    pxor    xmm3, xmm0
    pshufd  xmm0, xmm0, 2*64+3*16+0*4+1         ; swap the dwords inside each qword
    movdqa  xmmword ptr [rdx + rcx + 20*4], xmm3
    pxor    xmm0, xmmword ptr [rdx + r11 + 20*4 + 16]
    movdqa  xmmword ptr [rdx + r11 + 20*4 + 16], xmm0
    movdqa  xmm4, xmmword ptr [rdx + rcx + 20*4 + 16]
    pxor    xmm1, xmm4
    movdqa  xmmword ptr [rdx + rcx + 20*4 + 16], xmm1
    pxor    xmm4, xmmword ptr [rdx + r11 + 20*4]
    movdqa  xmmword ptr [rdx + r11 + 20*4], xmm4

    ; Reload the sixteen words that the four groups above stored.
    movdqa  xmm0, xmmword ptr [rdx+0*16]
    movdqa  xmm1, xmmword ptr [rdx+1*16]
    movdqa  xmm2, xmmword ptr [rdx+2*16]
    movdqa  xmm3, xmmword ptr [rdx+3*16]

    ; xmm4..xmm7 = xmm0..xmm3 each moved down one dword, with the low dword
    ; of the next register (eax for xmm7) filling the top lane.
    movd    xmm6, eax
    movdqa  xmm7, xmm3
    movss   xmm7, xmm6
    movdqa  xmm6, xmm2
    movss   xmm6, xmm3
    movdqa  xmm5, xmm1
    movss   xmm5, xmm2
    movdqa  xmm4, xmm0
    movss   xmm4, xmm1
    pshufd  xmm7, xmm7, 0*64+3*16+2*4+1
    pshufd  xmm6, xmm6, 0*64+3*16+2*4+1
    pshufd  xmm5, xmm5, 0*64+3*16+2*4+1
    pshufd  xmm4, xmm4, 0*64+3*16+2*4+1

    ; eax ^= 1 ^ (low dword of xmm0) ^ (low dword of xmm3)
    xor     eax, 1
    movd    ecx, xmm0
    xor     eax, ecx
    movd    ecx, xmm3
    xor     eax, ecx

    ; Fold neighbouring registers together; the order matters because each
    ; pxor reads a register that a later one overwrites.
    pxor    xmm3, xmm2
    pxor    xmm2, xmm1
    pxor    xmm1, xmm0
    pxor    xmm0, xmm7
    pxor    xmm3, xmm7
    pxor    xmm2, xmm6
    pxor    xmm1, xmm5
    pxor    xmm0, xmm4

    ; XOR in two rows: rcx = 28 rows ahead of the position, r11 = 16 rows
    ; ahead, with their qwords interleaved across xmm3..xmm0.
    lea     rcx, [r10 + 28*32]
    and     rcx, 31*32
    lea     r11, [r10 + 16*32]
    and     r11, 31*32
    movdqa  xmm4, xmmword ptr [rdx + rcx + 20*4]
    movdqa  xmm5, xmmword ptr [rdx + r11 + 20*4]
    movdqa  xmm6, xmm4
    punpcklqdq xmm4, xmm5
    punpckhqdq xmm6, xmm5
    pxor    xmm3, xmm4
    pxor    xmm2, xmm6
    movdqa  xmm4, xmmword ptr [rdx + rcx + 20*4 + 16]
    movdqa  xmm5, xmmword ptr [rdx + r11 + 20*4 + 16]
    movdqa  xmm6, xmm4
    punpcklqdq xmm4, xmm5
    punpckhqdq xmm6, xmm5
    pxor    xmm1, xmm4
    pxor    xmm0, xmm6

    add     r10, 32                             ; advance the position by one row
    cmp     r10, rdi
    jne     pull_iterate

    ; Write the 17 working words back to the state block.
    movdqa  xmmword ptr [rdx+0*16], xmm0
    movdqa  xmmword ptr [rdx+1*16], xmm1
    movdqa  xmmword ptr [rdx+2*16], xmm2
    movdqa  xmmword ptr [rdx+3*16], xmm3
    mov     [rdx+4*16], eax

pull_done:
    ; Epilogue: restore the non-volatile xmm registers, then the frame.
    movdqa  xmm7, xmmword ptr [rsp + 10h]
    movdqa  xmm6, xmmword ptr [rsp]
    add     rsp, 2*16
    pop     rdi
    ret
Panama_SSE2_Pull ENDP

_TEXT ENDS
END