#!/usr/bin/env perl

# ====================================================================
# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. October 2012. All rights reserved.
# ====================================================================

######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
#	|0 |1 |2 |3 |4
#	|01|01|01|
#	   |23|23|23|
#	            |01|01|...
#	               |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
#		128-bit key	192-		256-
# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
#			 (*) numbers after slash are for
#			     misaligned data.
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
#
#		128-bit key	192-		256-
# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
#			 (*) numbers after slash are for
#			     misaligned data.
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.

# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.

# Locate this script's directory so the shared perlasm helpers can be
# pulled in relative to it.  sparcv9_modes.pl supplies asm_init(),
# emit_assembler() and the alg_*_implement() mode templates used below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

&asm_init(@ARGV);

$::evp=1;	# if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.

######################################################################
# single-round subroutines
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));

# 64-bit ABI: declare %g2/%g3 as scratch so the assembler does not
# warn about their use.
$code.=<<___ if ($::abibits==64);
.register	%g2,#scratch
.register	%g3,#scratch

___
# aes_t4_encrypt: one-block encrypt.  Misaligned input is handled by
# loading three aligned doublewords and funnel-shifting (sllx/srlx/or);
# misaligned output via alignaddrl/faligndata plus stda partial stores.
$code.=<<___;
.text

.globl	aes_t4_encrypt
.align	32
aes_t4_encrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key

.Lenc:
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_eround01	%f16, %f4, %f2, %f0
	aes_eround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Lenc
	add		$key, 32, $key

	andcc		$out, 7, $tmp		! is output aligned?
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	aes_eround01_l	%f16, %f4, %f2, %f0
	aes_eround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	aes_t4_encrypt,#function
.size	aes_t4_encrypt,.-aes_t4_encrypt

.globl	aes_t4_decrypt
.align	32
aes_t4_decrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key

.Ldec:
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_dround01	%f16, %f4, %f2, %f0
	aes_dround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Ldec
	add		$key, 32, $key

	andcc		$out, 7, $tmp		! is output aligned?
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	aes_dround01_l	%f16, %f4, %f2, %f0
	aes_dround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	aes_t4_decrypt,#function
.size	aes_t4_decrypt,.-aes_t4_decrypt
___
}

######################################################################
# key setup subroutines
#
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
# aes_t4_set_encrypt_key dispatches on $bits (128/192/256), aligns a
# misaligned key with alignaddr/faligndata, then expands it with the
# aes_kexpand* instructions, storing the round count at offset 240
# (matching the AES_KEY layout read back via "ld [$key + 240]" above).
$code.=<<___;
.globl	aes_t4_set_encrypt_key
.align	32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
	and		$inp, 7, $tmp
	alignaddr	$inp, %g0, $inp
	cmp		$bits, 192
	ldd		[$inp + 0], %f0
	bl,pt		%icc,.L128
	ldd		[$inp + 8], %f2

	be,pt		%icc,.L192
	ldd		[$inp + 16], %f4
	brz,pt		$tmp, .L256aligned
	ldd		[$inp + 24], %f6

	ldd		[$inp + 32], %f8
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f8, %f6
.L256aligned:
___
# 256-bit expansion: 6 full kexpand groups emitted by the loop, then a
# final partial group below (no kexpand0/kexpand2 tail), 14 rounds total.
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	aes_kexpand0	%f4, %f2, %f4
	std		%f6, [$out + `32*$i+24`]
	aes_kexpand2	%f6, %f4, %f6
___
}
$code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	std		%f6, [$out + `32*$i+24`]
	std		%f0, [$out + `32*$i+32`]
	std		%f2, [$out + `32*$i+40`]

	mov		14, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L192:
	brz,pt		$tmp, .L192aligned
	nop

	ldd		[$inp + 24], %f6
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
.L192aligned:
___
# 192-bit expansion: 7 groups of three doublewords, 12 rounds total.
for ($i=0; $i<7; $i++) {
    $code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	aes_kexpand2	%f4, %f2, %f4
___
}
$code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	std		%f0, [$out + `24*$i+24`]
	std		%f2, [$out + `24*$i+32`]

	mov		12, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L128:
	brz,pt		$tmp, .L128aligned
	nop

	ldd		[$inp + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
.L128aligned:
___
# 128-bit expansion: 10 two-doubleword groups, 10 rounds total.
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	aes_kexpand1	%f0, %f2, $i, %f0
	std		%f2, [$out + `16*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
___
}
# Decrypt key setup reuses the encrypt expansion (via .Lset_encrypt_key,
# preserving the return address in %o5 around the call), then swaps the
# key schedule end-for-end in 32-byte chunks so decryption can walk it
# forward.  $tmp holds the round count left behind by the expansion.
$code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	std		%f2, [$out + `16*$i+8`]

	mov		10, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_encrypt_key,#function
.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key

.globl	aes_t4_set_decrypt_key
.align	32
aes_t4_set_decrypt_key:
	mov		%o7, %o5
	call		.Lset_encrypt_key
	nop

	mov		%o5, %o7
	sll		$tmp, 4, $inp		! $tmp is number of rounds
	add		$tmp, 2, $tmp
	add		$out, $inp, $inp	! $inp=$out+16*rounds
	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4

.Lkey_flip:
	ldd		[$out + 0], %f0
	ldd		[$out + 8], %f2
	ldd		[$out + 16], %f4
	ldd		[$out + 24], %f6
	ldd		[$inp + 0], %f8
	ldd		[$inp + 8], %f10
	ldd		[$inp - 16], %f12
	ldd		[$inp - 8], %f14
	sub		$tmp, 1, $tmp
	std		%f0, [$inp + 0]
	std		%f2, [$inp + 8]
	std		%f4, [$inp - 16]
	std		%f6, [$inp - 8]
	std		%f8, [$out + 0]
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	brnz		$tmp, .Lkey_flip
	sub		$inp, 32, $inp

	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_decrypt_key,#function
.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}

# 1x/2x round helpers and key loaders consumed by the mode templates in
# sparcv9_modes.pl (alg_cbc_*_implement etc.): the loaders pre-load the
# whole key schedule into %f12..%f54 (128-bit) or %f12..%f62 (192/256),
# the _1x/_2x helpers then run all rounds register-resident.  The 2x
# variants interleave two blocks (%f0/%f2 and %f4/%f6) per the 2-way
# interleave analysis in the header comment.
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));

$code.=<<___;
.align	32
_aes128_encrypt_1x:
___
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f4
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01_l	%f52, %f4, %f2, %f0
	retl
	aes_eround23_l	%f54, %f4, %f2, %f2
.type	_aes128_encrypt_1x,#function
.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x

.align	32
_aes128_encrypt_2x:
___
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f8
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01	%f48, %f4, %f6, %f10
	aes_eround23	%f50, %f4, %f6, %f6
	aes_eround01_l	%f52, %f8, %f2, %f0
	aes_eround23_l	%f54, %f8, %f2, %f2
	aes_eround01_l	%f52, %f10, %f6, %f4
	retl
	aes_eround23_l	%f54, %f10, %f6, %f6
.type	_aes128_encrypt_2x,#function
.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x

.align	32
_aes128_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
for ($i=2; $i<22;$i++) {			# load key schedule
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_aes128_loadkey,#function
.size	_aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey

___

# Instantiate the 128-bit mode entry points from the shared templates;
# non-CBC modes only exist in the EVP build.
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
    &alg_ctr32_implement("aes",128);
    &alg_xts_implement("aes",128,"en");
    &alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);

$code.=<<___;
.align	32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f4
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01_l	%f52, %f4, %f2, %f0
	retl
	aes_dround23_l	%f54, %f4, %f2, %f2
.type	_aes128_decrypt_1x,#function
.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x

.align	32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f8
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01	%f48, %f4, %f6, %f10
	aes_dround23	%f50, %f4, %f6, %f6
	aes_dround01_l	%f52, %f8, %f2, %f0
	aes_dround23_l	%f54, %f8, %f2, %f2
	aes_dround01_l	%f52, %f10, %f6, %f4
	retl
	aes_dround23_l	%f54, %f10, %f6, %f6
.type	_aes128_decrypt_2x,#function
.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
___

$code.=<<___;
.align	32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f4
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01_l	%f60, %f4, %f2, %f0
	retl
	aes_eround23_l	%f62, %f4, %f2, %f2
.type	_aes192_encrypt_1x,#function
.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x

.align	32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f8
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01	%f56, %f4, %f6, %f10
	aes_eround23	%f58, %f4, %f6, %f6
	aes_eround01_l	%f60, %f8, %f2, %f0
	aes_eround23_l	%f62, %f8, %f2, %f2
	aes_eround01_l	%f60, %f10, %f6, %f4
	retl
	aes_eround23_l	%f62, %f10, %f6, %f6
.type	_aes192_encrypt_2x,#function
.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x

.align	32
_aes256_encrypt_1x:
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f4, %f2, %f0
	aes_eround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
# The 256-bit schedule does not fit the FP register file, so the first
# round pair frees %f16-%f22 and the last rounds reload them, restoring
# the original %f16-%f22 contents before returning.
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f4, %f2, %f0
	aes_eround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_1x,#function
.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x

.align	32
_aes256_encrypt_2x:
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f8, %f2, %f0
	aes_eround23	%f22, %f8, %f2, %f2
	aes_eround01	%f20, %f10, %f6, %f4
	aes_eround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f8, %f2, %f0
	aes_eround23_l	%f22, %f8, %f2, %f2
	aes_eround01_l	%f20, %f10, %f6, %f4
	aes_eround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_2x,#function
.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x

.align	32
_aes192_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
for ($i=2; $i<26;$i++) {			# load key schedule
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_aes192_loadkey,#function
.size	_aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___

# Instantiate the 192/256-bit mode entry points.
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
    &alg_ctr32_implement("aes",256);
    &alg_xts_implement("aes",256,"en");
    &alg_xts_implement("aes",256,"de");
    &alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);

$code.=<<___;
.align	32
_aes256_decrypt_1x:
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f4, %f2, %f0
	aes_dround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f4, %f2, %f0
	aes_dround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_1x,#function
.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x

.align	32
_aes256_decrypt_2x:
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f8, %f2, %f0
	aes_dround23	%f22, %f8, %f2, %f2
	aes_dround01	%f20, %f10, %f6, %f4
	aes_dround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f8, %f2, %f0
	aes_dround23_l	%f22, %f8, %f2, %f2
	aes_dround01_l	%f20, %f10, %f6, %f4
	aes_dround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_2x,#function
.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x

.align	32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f4
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01_l	%f60, %f4, %f2, %f0
	retl
	aes_dround23_l	%f62, %f4, %f2, %f2
.type	_aes192_decrypt_1x,#function
.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x

.align	32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f8
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01	%f56, %f4, %f6, %f10
	aes_dround23	%f58, %f4, %f6, %f6
	aes_dround01_l	%f60, %f8, %f2, %f0
	aes_dround23_l	%f62, %f8, %f2, %f2
	aes_dround01_l	%f60, %f10, %f6, %f4
	retl
	aes_dround23_l	%f62, %f10, %f6, %f6
.type	_aes192_decrypt_2x,#function
.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}

# Non-EVP build: emit openssl/aes.h-style entry points (see the caveat
# at $::evp above about the 64-bit AES_KEY alignment requirement).  The
# key-setup wrappers validate argument alignment/NULL/bits before
# tail-branching to the aes_t4_* routines; AES_cbc_encrypt dispatches on
# round count (<12, ==12, >12 for 128/192/256-bit keys).
if (!$::evp) {
$code.=<<___;
.global	AES_encrypt
AES_encrypt=aes_t4_encrypt
.global	AES_decrypt
AES_decrypt=aes_t4_decrypt
.global	AES_set_encrypt_key
.align	32
AES_set_encrypt_key:
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_encrypt_key
	nop
1:	retl
	nop
.type	AES_set_encrypt_key,#function
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

.global	AES_set_decrypt_key
.align	32
AES_set_decrypt_key:
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_decrypt_key
	nop
1:	retl
	nop
.type	AES_set_decrypt_key,#function
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
___

my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));

$code.=<<___;
.globl	AES_cbc_encrypt
.align	32
AES_cbc_encrypt:
	ld		[$key + 240], %g1
	nop
	brz		$enc, .Lcbc_decrypt
	cmp		%g1, 12

	bl,pt		%icc, aes128_t4_cbc_encrypt
	nop
	be,pn		%icc, aes192_t4_cbc_encrypt
	nop
	ba		aes256_t4_cbc_encrypt
	nop

.Lcbc_decrypt:
	bl,pt		%icc, aes128_t4_cbc_decrypt
	nop
	be,pn		%icc, aes192_t4_cbc_decrypt
	nop
	ba		aes256_t4_cbc_decrypt
	nop
.type	AES_cbc_encrypt,#function
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
$code.=<<___;
.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

# Post-process $code (perlasm instruction translation) and print it.
&emit_assembler();

close STDOUT;