#!/usr/bin/env perl

# Specific modes implementations for SPARC Architecture 2011. There
# is T4 dependency though, an ASI value that is not specified in the
# Architecture Manual. But as SPARC universe is rather monocultural,
# we imply that processor capable of executing crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep eyes open when new processors emerge...
#
# As for above mentioned ASI. It's so called "block initializing
# store" which cancels "read" in "read-update-write" on cache lines.
# This is "cooperative" optimization, as it reduces overall pressure
# on memory interface. Benefits can't be observed/quantified with
# usual benchmarks, on the contrary you can notice that single-thread
# performance for parallelizable modes is ~1.5% worse for largest
# block sizes [though few percent better for not so long ones]. All
# this based on suggestions from David Miller.

# Configure ABI-dependent globals ($::bias, $::frame, $::size_t_cc,
# $::abibits) from the compiler flags passed on the command line.
# A -m64 or -xarch=v9 flag selects the 64-bit (v9 with stack bias)
# conventions; anything else gets the 32-bit values.
sub asm_init {		# to be called with @ARGV as argument
    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}

# unified interface: incoming argument registers (%i0..%i4)
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables (%l0..%l5)
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

# Emit a CBC-encrypt entry point ${alg}${bits}_t4_cbc_encrypt into
# $::code. $::evp selects between the EVP calling convention (ivec
# loaded/stored as four aligned 32-bit words) and the legacy path,
# which tolerates an unaligned ivec via alignaddr/faligndata.
# Three code paths are generated: an aligned 1x loop, a partial-store
# fixup for unaligned output, and a block-store (ASI 0xe2) fast path
# used when output is aligned, $len>=128 and $inp!=$out.
sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_enc_abort
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}

# Emit a CBC-decrypt entry point ${alg}${bits}_t4_cbc_decrypt into
# $::code. Mirrors the encrypt generator but keeps the running ivec in
# %f12/%f14, adds a 2x-unrolled loop (decryption blocks are
# independent), and enters the block-store path only for $len>=256.
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_dec_abort
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

# Emit a CTR32 entry point ${alg}${bits}_t4_ctr32_encrypt into $::code.
# The 32-bit counter lives in %l7 (cleared-upper-word after each
# increment); the upper 96 bits of the counter block are pre-XORed with
# rk[0] and kept in %g4/%g5/%f14. The first cipher round is issued
# inline (aes_eround*/camellia_f, per $alg) before tail-calling into
# the shared round loop at _${alg}${bits}_encrypt_{1x+8,2x+16}.
sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}

# Emit an XTS entry point ${alg}${bits}_t4_xts_${dir}crypt ($dir is
# "en" or "de") into $::code. The initial tweak is computed by
# encrypting ivec with $key2 via ${alg}_t4_encrypt into the 16-byte
# scratch area below %fp; the running tweak is kept in %g3:%g2 and
# multiplied by x (GF(2^128), poly 0x87) between blocks. A trailing
# partial block is handled by ciphertext stealing at the *steal labels.
# NOTE(review): this sub shadows $inp/$out/$len/$ivec with its own
# %i0..%i5 mapping ($key1/$key2 instead of $key) and aliases $rem to
# $ivec — $ivec is dead after the tweak is computed.
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15, $rem
	and		$len, -16, $len
___
$code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16, %l7
	sub		$len, %l7, $len
___
$code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	subcc		$len, 2, $len
	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7 + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7 + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7 + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7 + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# Encode a 3-operand VIS instruction as a raw .word so the module
# assembles without VIS support enabled in the assembler. Unknown
# mnemonics and unparsable operands are returned as-is ($ref) and left
# for the assembler to handle. %f32..%f62 (even only) are re-encoded
# into the 5-bit "upper double" register form ($1|$1>>5)&31; odd
# numbers >=32 are not encodable, so the textual form is returned.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "faligndata" => 0x048,
               "bshuffle"   => 0x04c,
               "fnot2"      => 0x066,
               "fxor"       => 0x06c,
               "fsrc2"      => 0x078 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

# Encode a 3-operand VIS3 integer instruction (addxc, umulxhi, bmask,
# alignaddr[l], ...) as a raw .word. Integer registers are mapped to
# their 5-bit encodings via the %bias table (%g0-7 -> 0..7, %o -> 8..,
# %l -> 16.., %i -> 24..); anything that doesn't parse is returned as
# plain text for the assembler.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc"      => 0x011,
               "addxccc"    => 0x013,
               "umulxhi"    => 0x016,
               "alignaddr"  => 0x018,
               "bmask"      => 0x019,
               "alignaddrl" => 0x01a );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return sprintf ".word\t0x%08x !%s",
                        0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

# Encode 4-operand AES round instructions. Note the defined() test:
# aes_eround01 legitimately has opf 0. $rs3 may also be given as a bare
# even register number, handled by the first conditional.
sub unaes_round {       # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "aes_eround01"   => 0,
               "aes_eround23"   => 1,
               "aes_dround01"   => 2,
               "aes_dround23"   => 3,
               "aes_eround01_l" => 4,
               "aes_eround23_l" => 5,
               "aes_dround01_l" => 6,
               "aes_dround23_l" => 7,
               "aes_kexpand1"   => 8 );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

# Encode 3-operand AES key-expansion instructions (kexpand0/kexpand2).
sub unaes_kexpand {     # 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "aes_kexpand0" => 0x130,
               "aes_kexpand2" => 0x131 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

# Encode the 4-operand camellia_f instruction. There is only one such
# mnemonic, hence the unconditional "if (1)" mirroring the structure of
# its sibling encoders; opf is hard-wired to 0xc in the .word formula.
sub uncamellia_f {      # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
        $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

# Encode 3-operand Camellia FL/FLI instructions.
sub uncamellia3 {       # 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = ( "camellia_fl"  => 0x13c,
                "camellia_fli" => 0x13d );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

# Encode 2-operand FP<->integer move instructions. The %bias table
# covers both integer ("g"/"o"/"l"/"i") and FP ("f" => 0) operands,
# since these moves mix the two register files.
sub unmovxtox {         # 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = ( "movdtox"  => 0x110,
                "movstouw" => 0x111,
                "movstosw" => 0x113,
                "movxtod"  => 0x118,
                "movwtos"  => 0x119 );

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
        foreach ($rs,$rd) {
            return $ref if (!/%([fgoli])([0-9]{1,2})/);
            $_=$bias{$1}+$2;
            if ($2>=32) {
                return $ref if ($2&1);
                # re-encode for upper double register addressing
                $_=($2|$2>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
                        $ref;
    } else {
        return $ref;
    }
}

# Encode DES instructions, which come in 4-argument (des_round),
# 3-argument (des_kexpand, whose middle operand may be a bare
# immediate) and 2-argument (des_ip/des_iip) forms.
sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = ( "des_round"   => 0b1001,
               "des_ip"      => 0b100110100,
               "des_iip"     => 0b100110101,
               "des_kexpand" => 0b100110110 );

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {         # 4-arg
        if ($mnemonic eq "des_round") {
            foreach (@args[0..3]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                            2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
                            $ref;
        } elsif ($mnemonic eq "des_kexpand") {      # 3-arg
            foreach (@args[0..2]) {
                return $ref if (!/(%f)?([0-9]{1,2})/);
                $_=$2;
                if ($2>=32) {
                    return $ref if ($2&1);
                    # re-encode for upper double register addressing
                    $_=($2|$2>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                            2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
                            $ref;
        } else {                                    # 2-arg
            foreach (@args[0..1]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    # Fixed: guard used $2, but the match above has only
                    # one capture group, so $2 was always undef/false and
                    # odd upper registers were mis-encoded instead of
                    # being passed through. Use $1 like the sibling subs.
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                            2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
                            $ref;
        }
    } else {
        return $ref;
    }
}

# Post-process $::code line by line: evaluate `...` interpolations,
# normalize 2-operand fp ops to the 3-operand form, then rewrite each
# VIS/crypto mnemonic through the corresponding un* encoder. The "or"
# chain stops at the first substitution that fires on the line.
sub emit_assembler {
    foreach (split("\n",$::code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

        s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &unaes_round($1,$2,$3,$4,$5)
         /geo or
        s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unaes_kexpand($1,$2,$3,$4)
         /geo or
        s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &uncamellia_f($1,$2,$3,$4,$5)
         /geo or
        s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &uncamellia3($1,$2,$3,$4)
         /geo or
        s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
                &undes($1,$2,$3,$4,$5)
         /geo or
        s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
                &unmovxtox($1,$2,$3)
         /geo or
        s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
                &unmovxtox($1,$2,$3)
         /geo or
        s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unvis($1,$2,$3,$4)
         /geo or
        s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                &unvis3($1,$2,$3,$4)
         /geo;

        print $_,"\n";
    }
}

1;