; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http:
; //mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com

; AES-NI bulk cipher + key schedule for NSS, MASM syntax, Microsoft x64 ABI.
; All xmm6-xmm15 registers touched here are callee-saved on Win64 and are
; saved/restored around use.

.DATA
ALIGN 16
; Lmask/Lmask192/Lmask256: pshufb byte-rotation masks used by the key
; expansion to position the word fed to AESENCLAST (the RotWord/SubWord step).
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; Lcon1/Lcon2: AES round-constant (Rcon) seeds; Lcon1 is doubled each round
; with pslld, Lcon2 = 1bh covers the Rcon values past 80h (128-bit schedule).
Lcon1 dd 1,1,1,1
Lcon2 dd 1bh,1bh,1bh,1bh

.CODE

; Win64 argument registers for the bulk cipher entry points.
; Args 5 and 6 (input pointer, input length) arrive on the stack; the
; original r8/r9 values (presumably outputLen/maxOutputLen - TODO confirm
; against the NSS C prototype) are not used and are overwritten below.
ctx textequ <rcx>
output textequ <rdx>
input textequ <r8>
inputLen textequ <r9d>


; aes_rnd i: one AES encryption round over 8 parallel blocks in xmm0-xmm7,
; using round key i loaded through xmm8. Clobbers xmm8.
aes_rnd MACRO i
        movdqu  xmm8, [i*16 + ctx]
        aesenc  xmm0, xmm8
        aesenc  xmm1, xmm8
        aesenc  xmm2, xmm8
        aesenc  xmm3, xmm8
        aesenc  xmm4, xmm8
        aesenc  xmm5, xmm8
        aesenc  xmm6, xmm8
        aesenc  xmm7, xmm8
        ENDM

; aes_last_rnd i: final AES encryption round (no MixColumns) over xmm0-xmm7.
; Clobbers xmm8.
aes_last_rnd MACRO i
        movdqu  xmm8, [i*16 + ctx]
        aesenclast xmm0, xmm8
        aesenclast xmm1, xmm8
        aesenclast xmm2, xmm8
        aesenclast xmm3, xmm8
        aesenclast xmm4, xmm8
        aesenclast xmm5, xmm8
        aesenclast xmm6, xmm8
        aesenclast xmm7, xmm8
        ENDM

; aes_dec_rnd i: one AES decryption round over 8 parallel blocks in
; xmm0-xmm7 (round keys must already be InvMixColumns-transformed, see the
; decrypt_init procedures). Clobbers xmm8.
aes_dec_rnd MACRO i
        movdqu  xmm8, [i*16 + ctx]
        aesdec  xmm0, xmm8
        aesdec  xmm1, xmm8
        aesdec  xmm2, xmm8
        aesdec  xmm3, xmm8
        aesdec  xmm4, xmm8
        aesdec  xmm5, xmm8
        aesdec  xmm6, xmm8
        aesdec  xmm7, xmm8
        ENDM

; aes_dec_last_rnd i: final AES decryption round over xmm0-xmm7.
; Clobbers xmm8.
aes_dec_last_rnd MACRO i
        movdqu  xmm8, [i*16 + ctx]
        aesdeclast xmm0, xmm8
        aesdeclast xmm1, xmm8
        aesdeclast xmm2, xmm8
        aesdeclast xmm3, xmm8
        aesdeclast xmm4, xmm8
        aesdeclast xmm5, xmm8
        aesdeclast xmm6, xmm8
        aesdeclast xmm7, xmm8
        ENDM


;-----------------------------------------------------------------------------
; gen_aes_ecb_func enc, rnds
; Expands to the body of one ECB entry point (encrypt when enc == 1, decrypt
; otherwise) for a cipher with `rnds` rounds (10/12/14).
; In:  rcx = cipher context (round keys apparently start at offset 48 -
;            consistent with the lea below; TODO confirm against AESContext),
;      rdx = output, stack args 5/6 = input pointer / byte length.
; Out: rax = 0. Processes 8 blocks at a time, then single blocks; any
;      tail < 16 bytes is ignored.
;-----------------------------------------------------------------------------
gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   bail

        ; NOTE(review): the xor is redundant - a 32-bit mov into r9d already
        ; zero-extends. Kept as-is.
        xor     inputLen, inputLen
        ; Stack args: [rsp + 8 (ret addr) + 32 (shadow space)] = arg 5.
        mov     input, [rsp + 1*8 + 8*4]
        mov     inputLen, [rsp + 1*8 + 8*5]

        sub     rsp, 3*16               ; spill area for callee-saved xmm6-8

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]           ; ctx -> first round key

loop8:
        cmp     inputLen, 8*16          ; at least 8 blocks left?
        jb      loop1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        ; Whitening: XOR all 8 blocks with round key 0.
        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

; Select encrypt or decrypt flavors of the round macros/instructions at
; macro-expansion time.
IF enc eq 1
        rnd     textequ <aes_rnd>
        lastrnd textequ <aes_last_rnd>
        aesinst textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd     textequ <aes_dec_rnd>
        lastrnd textequ <aes_dec_last_rnd>
        aesinst textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        ; Rounds 1 .. rnds-1, then the final round, unrolled at assembly time.
        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
        ENDM
        lastrnd rnds

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea     input, [8*16 + input]
        lea     output, [8*16 + output]
        sub     inputLen, 8*16
        jmp     loop8

loop1:
        ; Single-block tail loop.
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]      ; round key 0 (whitening)
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        xor     rax, rax                ; return 0 (success)

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM

; ECB entry points: one PROC per key size / direction, each expanding the
; shared macro with the matching round count (AES-128/192/256 = 10/12/14).
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


; Win64 argument registers for the key-schedule procedures.
KEY textequ <rcx>
KS textequ <rdx>
ITR textequ <r8>

;-----------------------------------------------------------------------------
; intel_aes_encrypt_init_128(KEY, KS)
; Expands a 128-bit key at [KEY] into 11 round keys at [KS] using
; AESENCLAST as the SubWord primitive (pshufb pre-rotates so that
; AESENCLAST's ShiftRows+SubBytes+XOR-Rcon yields the key-schedule core).
; Clobbers: r8, xmm0-xmm4, flags.
;-----------------------------------------------------------------------------
intel_aes_encrypt_init_128 PROC

        movdqu  xmm1, [KEY]             ; round key 0 = raw key
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        lea     ITR, Lcon1
        movdqa  xmm0, [ITR]             ; xmm0 = Rcon (doubled each round)
        lea     ITR, Lmask
        movdqa  xmm4, [ITR]             ; xmm4 = RotWord/broadcast mask

        mov     ITR, 8                  ; 8 rounds with Rcon 01h..80h

Lenc_128_ks_loop:
        lea     KS, [16 + KS]
        dec     ITR                     ; flags survive: SSE ops below don't
                                        ; touch EFLAGS, jne is at loop end

        pshufb  xmm2, xmm4              ; rotate + broadcast last word
        aesenclast xmm2, xmm0           ; SubWord + XOR Rcon
        pslld   xmm0, 1                 ; next Rcon = 2*Rcon
        ; xmm1 ^= (xmm1 << 32) three times: propagate across all 4 words.
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2              ; mix in the transformed word
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        jne     Lenc_128_ks_loop

        ; Rounds 9 and 10 need Rcon = 1bh, 36h (past 80h), taken from Lcon2.
        lea     ITR, Lcon2
        movdqa  xmm0, [ITR]

        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1                 ; 1bh -> 36h for the last round
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [16 + KS], xmm1         ; round key 9
        movdqa  xmm2, xmm1

        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [32 + KS], xmm1         ; round key 10
        movdqa  xmm2, xmm1

        ret
intel_aes_encrypt_init_128 ENDP


;-----------------------------------------------------------------------------
; intel_aes_decrypt_init_128(KEY, KS)
; Builds the decryption schedule: runs the encryption expansion, then
; reverses the round-key order and applies AESIMC (InvMixColumns) to the
; inner keys 1..9, as required by the AESDEC "equivalent inverse cipher".
; NOTE(review): the two pushes leave rsp 8 mod 16 at the call and allocate
; no Win64 shadow space; this works only because the callee uses no aligned
; stack access and makes no further calls - confirm before reusing pattern.
;-----------------------------------------------------------------------------
intel_aes_decrypt_init_128 PROC

        push    KS
        push    KEY

        call    intel_aes_encrypt_init_128

        pop     KEY
        pop     KS

        ; Swap round keys 0 and 10 (no AESIMC on the outermost keys).
        movdqu  xmm0, [0*16 + KS]
        movdqu  xmm1, [10*16 + KS]
        movdqu  [10*16 + KS], xmm0
        movdqu  [0*16 + KS], xmm1

        ; Swap i <-> 10-i for i = 1..4, applying AESIMC to both.
        i = 1
        WHILE i LT 5
            movdqu  xmm0, [i*16 + KS]
            movdqu  xmm1, [(10-i)*16 + KS]

            aesimc  xmm0, xmm0
            aesimc  xmm1, xmm1

            movdqu  [(10-i)*16 + KS], xmm0
            movdqu  [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; Middle key (5) stays in place but still needs AESIMC.
        movdqu  xmm0, [5*16 + KS]
        aesimc  xmm0, xmm0
        movdqu  [5*16 + KS], xmm0
        ret
intel_aes_decrypt_init_128 ENDP


;-----------------------------------------------------------------------------
; intel_aes_encrypt_init_192(KEY, KS)
; Expands a 192-bit key into 13 round keys. Each loop iteration performs two
; key-schedule steps, producing 3 round keys (192 bits advance 6 words at a
; time; shufpd stitches the 24-byte schedule rows into 16-byte round keys).
; Clobbers: r8, xmm0-xmm7 (xmm6/xmm7 saved per Win64), flags.
;-----------------------------------------------------------------------------
intel_aes_encrypt_init_192 PROC

        sub     rsp, 16*2               ; save callee-saved xmm6/xmm7
        movdqu  [16*0 + rsp], xmm6
        movdqu  [16*1 + rsp], xmm7

        movdqu  xmm1, [KEY]             ; key words 0-3
        mov     ITR, [16 + KEY]         ; key words 4-5 (8 bytes)
        movd    xmm3, ITR               ; MASM movd with a 64-bit reg moves 8
                                        ; bytes (movq encoding)

        movdqu  [KS], xmm1              ; round key 0
        movdqa  xmm5, xmm3

        lea     ITR, Lcon1
        movdqu  xmm0, [ITR]             ; Rcon
        lea     ITR, Lmask192
        movdqu  xmm4, [ITR]             ; 192-bit variant of the rot mask

        mov     ITR, 4                  ; 4 iterations x 2 steps = 8 Rcons

Lenc_192_ks_loop:
        ; --- step A: advance the 6-word schedule ---
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0           ; SubWord + Rcon
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh        ; broadcast last word of xmm1
        pxor    xmm3, xmm2

        ; Repack: previous tail + new words into two 16-byte round keys.
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ; --- step B: same advance again ---
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3

        lea     KS, [48 + KS]           ; 3 round keys emitted per iteration

        dec     ITR
        jnz     Lenc_192_ks_loop

        movdqu  [16 + KS], xmm5         ; final (13th) round key

        movdqu  xmm7, [16*1 + rsp]
        movdqu  xmm6, [16*0 + rsp]
        add     rsp, 16*2
        ret
intel_aes_encrypt_init_192 ENDP

;-----------------------------------------------------------------------------
; intel_aes_decrypt_init_192(KEY, KS)
; Decryption schedule for AES-192: encryption expansion, then reverse the 13
; round keys and AESIMC the inner ones (1..11), mirroring the 128-bit case.
; NOTE(review): same shadow-space/alignment caveat as decrypt_init_128.
;-----------------------------------------------------------------------------
intel_aes_decrypt_init_192 PROC
        push    KS
        push    KEY

        call    intel_aes_encrypt_init_192

        pop     KEY
        pop     KS

        ; Swap round keys 0 and 12 (no AESIMC on the outermost keys).
        movdqu  xmm0, [0*16 + KS]
        movdqu  xmm1, [12*16 + KS]
        movdqu  [12*16 + KS], xmm0
        movdqu  [0*16 + KS], xmm1

        ; Swap i <-> 12-i for i = 1..5, applying AESIMC to both.
        i = 1
        WHILE i LT 6
            movdqu  xmm0, [i*16 + KS]
            movdqu  xmm1, [(12-i)*16 + KS]

            aesimc  xmm0, xmm0
            aesimc  xmm1, xmm1

            movdqu  [(12-i)*16 + KS], xmm0
            movdqu  [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; Middle key (6) stays in place but still needs AESIMC.
        movdqu  xmm0, [6*16 + KS]
        aesimc  xmm0, xmm0
        movdqu  [6*16 + KS], xmm0
        ret
intel_aes_decrypt_init_192 ENDP


;-----------------------------------------------------------------------------
; intel_aes_encrypt_init_256(KEY, KS)
; Expands a 256-bit key into 15 round keys. Even round keys come from the
; Rcon/SubWord step on xmm1; odd ones from a SubWord-only step on xmm3
; (AESENCLAST with an all-zero xmm6 supplies SubWord without Rcon).
; Clobbers: r8, xmm0-xmm7 (xmm6/xmm7 saved per Win64), flags.
;-----------------------------------------------------------------------------
intel_aes_encrypt_init_256 PROC
        sub     rsp, 16*2               ; save callee-saved xmm6/xmm7
        movdqu  [16*0 + rsp], xmm6
        movdqu  [16*1 + rsp], xmm7

        movdqu  xmm1, [16*0 + KEY]      ; key halves -> round keys 0 and 1
        movdqu  xmm3, [16*1 + KEY]

        movdqu  [16*0 + KS], xmm1
        movdqu  [16*1 + KS], xmm3

        lea     ITR, Lcon1
        movdqu  xmm0, [ITR]             ; Rcon
        lea     ITR, Lmask256
        movdqu  xmm5, [ITR]             ; rot/broadcast mask

        pxor    xmm6, xmm6              ; zero key for SubWord-only step

        mov     ITR, 6                  ; 6 iterations x 2 round keys

Lenc_256_ks_loop:

        ; Even round key: SubWord(RotWord(w)) ^ Rcon folded into xmm1.
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        ; Odd round key: SubWord (no rotation, no Rcon) folded into xmm3.
        pshufd  xmm2, xmm1, 0ffh        ; broadcast last word
        aesenclast xmm2, xmm6           ; SubWord only (xmm6 == 0)
        movdqa  xmm4, xmm3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea     KS, [32 + KS]
        dec     ITR
        jnz     Lenc_256_ks_loop

        ; Final (14th) round key: one more even step, no odd counterpart.
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast xmm2, xmm0
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        movdqu  xmm7, [16*1 + rsp]
        movdqu  xmm6, [16*0 + rsp]
        add     rsp, 16*2
        ret

intel_aes_encrypt_init_256 ENDP


;-----------------------------------------------------------------------------
; intel_aes_decrypt_init_256(KEY, KS)
; Decryption schedule for AES-256: encryption expansion, then reverse the 15
; round keys and AESIMC the inner ones (1..13).
; NOTE(review): same shadow-space/alignment caveat as decrypt_init_128.
;-----------------------------------------------------------------------------
intel_aes_decrypt_init_256 PROC
        push    KS
        push    KEY

        call    intel_aes_encrypt_init_256

        pop     KEY
        pop     KS

        ; Swap round keys 0 and 14 (no AESIMC on the outermost keys).
        movdqu  xmm0, [0*16 + KS]
        movdqu  xmm1, [14*16 + KS]
        movdqu  [14*16 + KS], xmm0
        movdqu  [0*16 + KS], xmm1

        ; Swap i <-> 14-i for i = 1..6, applying AESIMC to both.
        i = 1
        WHILE i LT 7
            movdqu  xmm0, [i*16 + KS]
            movdqu  xmm1, [(14-i)*16 + KS]

            aesimc  xmm0, xmm0
            aesimc  xmm1, xmm1

            movdqu  [(14-i)*16 + KS], xmm0
            movdqu  [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; Middle key (7) stays in place but still needs AESIMC.
        movdqu  xmm0, [7*16 + KS]
        aesimc  xmm0, xmm0
        movdqu  [7*16 + KS], xmm0
        ret
intel_aes_decrypt_init_256 ENDP



;-----------------------------------------------------------------------------
; gen_aes_cbc_enc_func rnds
; CBC encryption body. Inherently serial (each block chains into the next),
; so rounds keys 0-5 are preloaded into xmm2-xmm7 to shorten the per-block
; dependency chain. The chaining value (IV / last ciphertext) lives at
; ctx-32, i.e. 16 bytes into the raw context - TODO confirm offset against
; the NSS AESContext layout.
;-----------------------------------------------------------------------------
gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

        ; Stack args 5/6 = input pointer / byte length (after ret addr +
        ; 32-byte shadow space).
        mov     input, [rsp + 1*8 + 8*4]
        mov     inputLen, [rsp + 1*8 + 8*5]

        sub     rsp, 3*16               ; spill area for callee-saved xmm6-8

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]           ; ctx -> first round key

        movdqu  xmm0, [-32+ctx]         ; xmm0 = chaining value (IV)

        ; Preload round keys 0-5; keys 6+ are fetched per block below.
        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]
        movdqu  xmm7, [5*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm1, [input]
        pxor    xmm1, xmm2              ; plaintext ^ round key 0
        pxor    xmm0, xmm1              ; ... ^ chaining value

        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6
        aesenc  xmm0, xmm7

        ; Remaining rounds 6 .. rnds-1, then the final round.
        i = 6
        WHILE i LT rnds
            movdqu  xmm8, [i*16 + ctx]
            aesenc  xmm0, xmm8
            i = i+1
        ENDM
        movdqu  xmm8, [rnds*16 + ctx]
        aesenclast xmm0, xmm8

        movdqu  [output], xmm0          ; ciphertext doubles as next chain

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        movdqu  [-32+ctx], xmm0         ; persist chaining value

        xor     rax, rax                ; return 0 (success)

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret

ENDM

;-----------------------------------------------------------------------------
; gen_aes_cbc_dec_func rnds
; CBC decryption body. Decryption has no serial chain, so 8 blocks are
; decrypted in parallel, then XORed with the preceding ciphertext blocks
; (IV from ctx-32 for the first). A single-block tail loop follows.
;-----------------------------------------------------------------------------
gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        mov     input, [rsp + 1*8 + 8*4]
        mov     inputLen, [rsp + 1*8 + 8*5]

        sub     rsp, 3*16               ; spill area for callee-saved xmm6-8

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]           ; ctx -> first round key

loop8:
        cmp     inputLen, 8*16
        jb      dec1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        ; Whitening with (inverse-schedule) round key 0.
        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
        ENDM
        aes_dec_last_rnd rnds

        ; CBC unchain: block j ^= ciphertext[j-1]; block 0 uses the saved IV.
        ; Input is re-read here because xmm0-7 were consumed by the rounds.
        movdqu  xmm8, [-32 + ctx]
        pxor    xmm0, xmm8
        movdqu  xmm8, [0*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm7, xmm8
        movdqu  xmm8, [7*16 + input]    ; last ciphertext = next IV

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7
        movdqu  [-32 + ctx], xmm8       ; persist chaining value

        lea     input, [8*16 + input]
        lea     output, [8*16 + output]
        sub     inputLen, 8*16
        jmp     loop8
dec1:

        movdqu  xmm3, [-32 + ctx]       ; xmm3 = chaining value for tail loop

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0              ; keep ciphertext: it is the next IV
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0              ; unchain with previous ciphertext/IV

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4              ; advance chaining value

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        movdqu  [-32 + ctx], xmm3       ; persist chaining value
        xor     rax, rax                ; return 0 (success)

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM

; CBC entry points.
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP

intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP

intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP

intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP

intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP

intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP



; CTR-mode register roles.
ctrCtx textequ <r10>                    ; CTR wrapper context (rcx on entry)
CTR textequ <r11d>                      ; scratch for one counter word
CTRSave textequ <eax>                   ; running 32-bit counter (host order)

;-----------------------------------------------------------------------------
; gen_aes_ctr_func rnds
; CTR encryption body. The wrapper context (rcx) apparently holds a pointer
; to the cipher context at offset 8 and the 16-byte counter block at offset
; 16 - TODO confirm against the NSS CTRContext layout. The counter block is
; pre-XORed with round key 0 and 8 copies are kept in a 16-byte-aligned
; stack buffer; only the last (big-endian) 32-bit word differs per block.
; Incrementing therefore XORs the new bswapped counter word with word 3 of
; round key 0 before storing it back into a buffer slot.
;-----------------------------------------------------------------------------
gen_aes_ctr_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   enc1                            ; NOTE(review): declared but unused
LOCAL   bail

        mov     input, [rsp + 8*1 + 4*8]
        mov     inputLen, [rsp + 8*1 + 5*8]

        mov     ctrCtx, ctx
        mov     ctx, [8+ctrCtx]         ; fetch the inner cipher context
        lea     ctx, [48+ctx]           ; ctx -> first round key

        sub     rsp, 3*16               ; spill area for callee-saved xmm6-8
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8


        ; Frame pointer so the and-align below can be undone exactly.
        push    rbp
        mov     rbp, rsp
        sub     rsp, 8*16               ; 8 counter-block slots
        and     rsp, -16                ; align for movdqa

        movdqu  xmm0, [16+ctrCtx]       ; current counter block
        mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
        bswap   CTRSave                 ; big-endian counter -> host order
        movdqu  xmm1, [ctx + 0*16]

        pxor    xmm0, xmm1              ; pre-whiten with round key 0

        movdqa  [rsp + 0*16], xmm0
        movdqa  [rsp + 1*16], xmm0
        movdqa  [rsp + 2*16], xmm0
        movdqa  [rsp + 3*16], xmm0
        movdqa  [rsp + 4*16], xmm0
        movdqa  [rsp + 5*16], xmm0
        movdqa  [rsp + 6*16], xmm0
        movdqa  [rsp + 7*16], xmm0

        ; Patch slots 1-7 with successive counter values (slot 0 already
        ; holds the current counter). Each store is (bswap(ctr) ^ key0.w3)
        ; so the slot stays pre-whitened.
        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 1*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 2*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 3*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 4*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 5*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 6*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 7*16 + 3*4], CTR


loop8:
        cmp     inputLen, 8*16
        jb      loop1

        ; Load 8 pre-whitened counter blocks (round key 0 already applied).
        movdqu  xmm0, [0*16 + rsp]
        movdqu  xmm1, [1*16 + rsp]
        movdqu  xmm2, [2*16 + rsp]
        movdqu  xmm3, [3*16 + rsp]
        movdqu  xmm4, [4*16 + rsp]
        movdqu  xmm5, [5*16 + rsp]
        movdqu  xmm6, [6*16 + rsp]
        movdqu  xmm7, [7*16 + rsp]

        ; Rounds 1-8: interleave the counter refresh for the NEXT batch with
        ; the AES rounds to hide the scalar work.
        i = 1
        WHILE i LE 8
            aes_rnd i

            inc     CTRSave
            mov     CTR, CTRSave
            bswap   CTR
            xor     CTR, DWORD PTR [ctx + 3*4]
            mov     DWORD PTR [rsp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        ; Remaining rounds (none for AES-128), then the final round.
        WHILE i LT rnds
            aes_rnd i
            i = i+1
        ENDM
        aes_last_rnd rnds

        ; XOR keystream with input -> output.
        movdqu  xmm8, [0*16 + input]
        pxor    xmm0, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [7*16 + input]
        pxor    xmm7, xmm8

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea     input, [8*16 + input]
        lea     output, [8*16 + output]
        sub     inputLen, 8*16
        jmp     loop8


loop1:
        cmp     inputLen, 1*16
        jb      bail

        ; Consume one prepared counter block from the stack buffer; rsp
        ; advances past it (undone at bail via rbp).
        movdqu  xmm0, [rsp]
        add     rsp, 16

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:

        ; Write back the next unused counter block, un-whitened (XOR with
        ; round key 0 again), so the caller's context stays consistent.
        movdqu  xmm0, [rsp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [16+ctrCtx], xmm0


        xor     rax, rax                ; return 0 (success)
        mov     rsp, rbp                ; drop counter buffer + realignment
        pop     rbp

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16

        ret
ENDM


; CTR entry points (CTR mode only ever encrypts).
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP

intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP

intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP


END