1#! /usr/bin/env perl 2# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# Specific modes implementations for SPARC Architecture 2011. There 11# is T4 dependency though, an ASI value that is not specified in the 12# Architecture Manual. But as SPARC universe is rather monocultural, 13# we imply that processor capable of executing crypto instructions 14# can handle the ASI in question as well. This means that we ought to 15# keep eyes open when new processors emerge... 16# 17# As for above mentioned ASI. It's so called "block initializing 18# store" which cancels "read" in "read-update-write" on cache lines. 19# This is "cooperative" optimization, as it reduces overall pressure 20# on memory interface. Benefits can't be observed/quantified with 21# usual benchmarks, on the contrary you can notice that single-thread 22# performance for parallelizable modes is ~1.5% worse for largest 23# block sizes [though few percent better for not so long ones]. All 24# this based on suggestions from David Miller. 
25 26$::bias="STACK_BIAS"; 27$::frame="STACK_FRAME"; 28$::size_t_cc="SIZE_T_CC"; 29 30sub asm_init { # to be called with @ARGV as argument 31 for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); } 32 if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; } 33 else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; } 34} 35 36# unified interface 37my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5)); 38# local variables 39my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7)); 40 41sub alg_cbc_encrypt_implement { 42my ($alg,$bits) = @_; 43 44$::code.=<<___; 45.globl ${alg}${bits}_t4_cbc_encrypt 46.align 32 47${alg}${bits}_t4_cbc_encrypt: 48 save %sp, -$::frame, %sp 49 cmp $len, 0 50 be,pn $::size_t_cc, .L${bits}_cbc_enc_abort 51 srln $len, 0, $len ! needed on v8+, "nop" on v9 52 sub $inp, $out, $blk_init ! $inp!=$out 53___ 54$::code.=<<___ if (!$::evp); 55 andcc $ivec, 7, $ivoff 56 alignaddr $ivec, %g0, $ivec 57 58 ldd [$ivec + 0], %f0 ! load ivec 59 bz,pt %icc, 1f 60 ldd [$ivec + 8], %f2 61 ldd [$ivec + 16], %f4 62 faligndata %f0, %f2, %f0 63 faligndata %f2, %f4, %f2 641: 65___ 66$::code.=<<___ if ($::evp); 67 ld [$ivec + 0], %f0 68 ld [$ivec + 4], %f1 69 ld [$ivec + 8], %f2 70 ld [$ivec + 12], %f3 71___ 72$::code.=<<___; 73 prefetch [$inp], 20 74 prefetch [$inp + 63], 20 75 call _${alg}${bits}_load_enckey 76 and $inp, 7, $ileft 77 andn $inp, 7, $inp 78 sll $ileft, 3, $ileft 79 mov 64, $iright 80 mov 0xff, $omask 81 sub $iright, $ileft, $iright 82 and $out, 7, $ooff 83 cmp $len, 127 84 movrnz $ooff, 0, $blk_init ! if ( $out&7 || 85 movleu $::size_t_cc, 0, $blk_init ! $len<128 || 86 brnz,pn $blk_init, .L${bits}cbc_enc_blk ! 
$inp==$out) 87 srl $omask, $ooff, $omask 88 89 alignaddrl $out, %g0, $out 90 srlx $len, 4, $len 91 prefetch [$out], 22 92 93.L${bits}_cbc_enc_loop: 94 ldx [$inp + 0], %o0 95 brz,pt $ileft, 4f 96 ldx [$inp + 8], %o1 97 98 ldx [$inp + 16], %o2 99 sllx %o0, $ileft, %o0 100 srlx %o1, $iright, %g1 101 sllx %o1, $ileft, %o1 102 or %g1, %o0, %o0 103 srlx %o2, $iright, %o2 104 or %o2, %o1, %o1 1054: 106 xor %g4, %o0, %o0 ! ^= rk[0] 107 xor %g5, %o1, %o1 108 movxtod %o0, %f12 109 movxtod %o1, %f14 110 111 fxor %f12, %f0, %f0 ! ^= ivec 112 fxor %f14, %f2, %f2 113 prefetch [$out + 63], 22 114 prefetch [$inp + 16+63], 20 115 call _${alg}${bits}_encrypt_1x 116 add $inp, 16, $inp 117 118 brnz,pn $ooff, 2f 119 sub $len, 1, $len 120 121 std %f0, [$out + 0] 122 std %f2, [$out + 8] 123 brnz,pt $len, .L${bits}_cbc_enc_loop 124 add $out, 16, $out 125___ 126$::code.=<<___ if ($::evp); 127 st %f0, [$ivec + 0] 128 st %f1, [$ivec + 4] 129 st %f2, [$ivec + 8] 130 st %f3, [$ivec + 12] 131___ 132$::code.=<<___ if (!$::evp); 133 brnz,pn $ivoff, 3f 134 nop 135 136 std %f0, [$ivec + 0] ! write out ivec 137 std %f2, [$ivec + 8] 138___ 139$::code.=<<___; 140.L${bits}_cbc_enc_abort: 141 ret 142 restore 143 144.align 16 1452: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard 146 ! and ~3x deterioration 147 ! in inp==out case 148 faligndata %f0, %f0, %f4 ! handle unaligned output 149 faligndata %f0, %f2, %f6 150 faligndata %f2, %f2, %f8 151 152 stda %f4, [$out + $omask]0xc0 ! partial store 153 std %f6, [$out + 8] 154 add $out, 16, $out 155 orn %g0, $omask, $omask 156 stda %f8, [$out + $omask]0xc0 ! partial store 157 158 brnz,pt $len, .L${bits}_cbc_enc_loop+4 159 orn %g0, $omask, $omask 160___ 161$::code.=<<___ if ($::evp); 162 st %f0, [$ivec + 0] 163 st %f1, [$ivec + 4] 164 st %f2, [$ivec + 8] 165 st %f3, [$ivec + 12] 166___ 167$::code.=<<___ if (!$::evp); 168 brnz,pn $ivoff, 3f 169 nop 170 171 std %f0, [$ivec + 0] ! 
write out ivec 172 std %f2, [$ivec + 8] 173 ret 174 restore 175 176.align 16 1773: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec 178 mov 0xff, $omask 179 srl $omask, $ivoff, $omask 180 faligndata %f0, %f0, %f4 181 faligndata %f0, %f2, %f6 182 faligndata %f2, %f2, %f8 183 stda %f4, [$ivec + $omask]0xc0 184 std %f6, [$ivec + 8] 185 add $ivec, 16, $ivec 186 orn %g0, $omask, $omask 187 stda %f8, [$ivec + $omask]0xc0 188___ 189$::code.=<<___; 190 ret 191 restore 192 193!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 194.align 32 195.L${bits}cbc_enc_blk: 196 add $out, $len, $blk_init 197 and $blk_init, 63, $blk_init ! tail 198 sub $len, $blk_init, $len 199 add $blk_init, 15, $blk_init ! round up to 16n 200 srlx $len, 4, $len 201 srl $blk_init, 4, $blk_init 202 203.L${bits}_cbc_enc_blk_loop: 204 ldx [$inp + 0], %o0 205 brz,pt $ileft, 5f 206 ldx [$inp + 8], %o1 207 208 ldx [$inp + 16], %o2 209 sllx %o0, $ileft, %o0 210 srlx %o1, $iright, %g1 211 sllx %o1, $ileft, %o1 212 or %g1, %o0, %o0 213 srlx %o2, $iright, %o2 214 or %o2, %o1, %o1 2155: 216 xor %g4, %o0, %o0 ! ^= rk[0] 217 xor %g5, %o1, %o1 218 movxtod %o0, %f12 219 movxtod %o1, %f14 220 221 fxor %f12, %f0, %f0 ! ^= ivec 222 fxor %f14, %f2, %f2 223 prefetch [$inp + 16+63], 20 224 call _${alg}${bits}_encrypt_1x 225 add $inp, 16, $inp 226 sub $len, 1, $len 227 228 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 229 add $out, 8, $out 230 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 231 brnz,pt $len, .L${bits}_cbc_enc_blk_loop 232 add $out, 8, $out 233 234 membar #StoreLoad|#StoreStore 235 brnz,pt $blk_init, .L${bits}_cbc_enc_loop 236 mov $blk_init, $len 237___ 238$::code.=<<___ if ($::evp); 239 st %f0, [$ivec + 0] 240 st %f1, [$ivec + 4] 241 st %f2, [$ivec + 8] 242 st %f3, [$ivec + 12] 243___ 244$::code.=<<___ if (!$::evp); 245 brnz,pn $ivoff, 3b 246 nop 247 248 std %f0, [$ivec + 0] ! 
write out ivec 249 std %f2, [$ivec + 8] 250___ 251$::code.=<<___; 252 ret 253 restore 254.type ${alg}${bits}_t4_cbc_encrypt,#function 255.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt 256___ 257} 258 259sub alg_cbc_decrypt_implement { 260my ($alg,$bits) = @_; 261 262$::code.=<<___; 263.globl ${alg}${bits}_t4_cbc_decrypt 264.align 32 265${alg}${bits}_t4_cbc_decrypt: 266 save %sp, -$::frame, %sp 267 cmp $len, 0 268 be,pn $::size_t_cc, .L${bits}_cbc_dec_abort 269 srln $len, 0, $len ! needed on v8+, "nop" on v9 270 sub $inp, $out, $blk_init ! $inp!=$out 271___ 272$::code.=<<___ if (!$::evp); 273 andcc $ivec, 7, $ivoff 274 alignaddr $ivec, %g0, $ivec 275 276 ldd [$ivec + 0], %f12 ! load ivec 277 bz,pt %icc, 1f 278 ldd [$ivec + 8], %f14 279 ldd [$ivec + 16], %f0 280 faligndata %f12, %f14, %f12 281 faligndata %f14, %f0, %f14 2821: 283___ 284$::code.=<<___ if ($::evp); 285 ld [$ivec + 0], %f12 ! load ivec 286 ld [$ivec + 4], %f13 287 ld [$ivec + 8], %f14 288 ld [$ivec + 12], %f15 289___ 290$::code.=<<___; 291 prefetch [$inp], 20 292 prefetch [$inp + 63], 20 293 call _${alg}${bits}_load_deckey 294 and $inp, 7, $ileft 295 andn $inp, 7, $inp 296 sll $ileft, 3, $ileft 297 mov 64, $iright 298 mov 0xff, $omask 299 sub $iright, $ileft, $iright 300 and $out, 7, $ooff 301 cmp $len, 255 302 movrnz $ooff, 0, $blk_init ! if ( $out&7 || 303 movleu $::size_t_cc, 0, $blk_init ! $len<256 || 304 brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out) 305 srl $omask, $ooff, $omask 306 307 andcc $len, 16, %g0 ! is number of blocks even? 308 srlx $len, 4, $len 309 alignaddrl $out, %g0, $out 310 bz %icc, .L${bits}_cbc_dec_loop2x 311 prefetch [$out], 22 312.L${bits}_cbc_dec_loop: 313 ldx [$inp + 0], %o0 314 brz,pt $ileft, 4f 315 ldx [$inp + 8], %o1 316 317 ldx [$inp + 16], %o2 318 sllx %o0, $ileft, %o0 319 srlx %o1, $iright, %g1 320 sllx %o1, $ileft, %o1 321 or %g1, %o0, %o0 322 srlx %o2, $iright, %o2 323 or %o2, %o1, %o1 3244: 325 xor %g4, %o0, %o2 ! 
^= rk[0] 326 xor %g5, %o1, %o3 327 movxtod %o2, %f0 328 movxtod %o3, %f2 329 330 prefetch [$out + 63], 22 331 prefetch [$inp + 16+63], 20 332 call _${alg}${bits}_decrypt_1x 333 add $inp, 16, $inp 334 335 fxor %f12, %f0, %f0 ! ^= ivec 336 fxor %f14, %f2, %f2 337 movxtod %o0, %f12 338 movxtod %o1, %f14 339 340 brnz,pn $ooff, 2f 341 sub $len, 1, $len 342 343 std %f0, [$out + 0] 344 std %f2, [$out + 8] 345 brnz,pt $len, .L${bits}_cbc_dec_loop2x 346 add $out, 16, $out 347___ 348$::code.=<<___ if ($::evp); 349 st %f12, [$ivec + 0] 350 st %f13, [$ivec + 4] 351 st %f14, [$ivec + 8] 352 st %f15, [$ivec + 12] 353___ 354$::code.=<<___ if (!$::evp); 355 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec 356 nop 357 358 std %f12, [$ivec + 0] ! write out ivec 359 std %f14, [$ivec + 8] 360___ 361$::code.=<<___; 362.L${bits}_cbc_dec_abort: 363 ret 364 restore 365 366.align 16 3672: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard 368 ! and ~3x deterioration 369 ! in inp==out case 370 faligndata %f0, %f0, %f4 ! handle unaligned output 371 faligndata %f0, %f2, %f6 372 faligndata %f2, %f2, %f8 373 374 stda %f4, [$out + $omask]0xc0 ! partial store 375 std %f6, [$out + 8] 376 add $out, 16, $out 377 orn %g0, $omask, $omask 378 stda %f8, [$out + $omask]0xc0 ! partial store 379 380 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4 381 orn %g0, $omask, $omask 382___ 383$::code.=<<___ if ($::evp); 384 st %f12, [$ivec + 0] 385 st %f13, [$ivec + 4] 386 st %f14, [$ivec + 8] 387 st %f15, [$ivec + 12] 388___ 389$::code.=<<___ if (!$::evp); 390 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec 391 nop 392 393 std %f12, [$ivec + 0] ! write out ivec 394 std %f14, [$ivec + 8] 395___ 396$::code.=<<___; 397 ret 398 restore 399 400!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
401.align 32 402.L${bits}_cbc_dec_loop2x: 403 ldx [$inp + 0], %o0 404 ldx [$inp + 8], %o1 405 ldx [$inp + 16], %o2 406 brz,pt $ileft, 4f 407 ldx [$inp + 24], %o3 408 409 ldx [$inp + 32], %o4 410 sllx %o0, $ileft, %o0 411 srlx %o1, $iright, %g1 412 or %g1, %o0, %o0 413 sllx %o1, $ileft, %o1 414 srlx %o2, $iright, %g1 415 or %g1, %o1, %o1 416 sllx %o2, $ileft, %o2 417 srlx %o3, $iright, %g1 418 or %g1, %o2, %o2 419 sllx %o3, $ileft, %o3 420 srlx %o4, $iright, %o4 421 or %o4, %o3, %o3 4224: 423 xor %g4, %o0, %o4 ! ^= rk[0] 424 xor %g5, %o1, %o5 425 movxtod %o4, %f0 426 movxtod %o5, %f2 427 xor %g4, %o2, %o4 428 xor %g5, %o3, %o5 429 movxtod %o4, %f4 430 movxtod %o5, %f6 431 432 prefetch [$out + 63], 22 433 prefetch [$inp + 32+63], 20 434 call _${alg}${bits}_decrypt_2x 435 add $inp, 32, $inp 436 437 movxtod %o0, %f8 438 movxtod %o1, %f10 439 fxor %f12, %f0, %f0 ! ^= ivec 440 fxor %f14, %f2, %f2 441 movxtod %o2, %f12 442 movxtod %o3, %f14 443 fxor %f8, %f4, %f4 444 fxor %f10, %f6, %f6 445 446 brnz,pn $ooff, 2f 447 sub $len, 2, $len 448 449 std %f0, [$out + 0] 450 std %f2, [$out + 8] 451 std %f4, [$out + 16] 452 std %f6, [$out + 24] 453 brnz,pt $len, .L${bits}_cbc_dec_loop2x 454 add $out, 32, $out 455___ 456$::code.=<<___ if ($::evp); 457 st %f12, [$ivec + 0] 458 st %f13, [$ivec + 4] 459 st %f14, [$ivec + 8] 460 st %f15, [$ivec + 12] 461___ 462$::code.=<<___ if (!$::evp); 463 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec 464 nop 465 466 std %f12, [$ivec + 0] ! write out ivec 467 std %f14, [$ivec + 8] 468___ 469$::code.=<<___; 470 ret 471 restore 472 473.align 16 4742: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard 475 ! and ~3x deterioration 476 ! in inp==out case 477 faligndata %f0, %f0, %f8 ! handle unaligned output 478 faligndata %f0, %f2, %f0 479 faligndata %f2, %f4, %f2 480 faligndata %f4, %f6, %f4 481 faligndata %f6, %f6, %f6 482 stda %f8, [$out + $omask]0xc0 ! 
partial store 483 std %f0, [$out + 8] 484 std %f2, [$out + 16] 485 std %f4, [$out + 24] 486 add $out, 32, $out 487 orn %g0, $omask, $omask 488 stda %f6, [$out + $omask]0xc0 ! partial store 489 490 brnz,pt $len, .L${bits}_cbc_dec_loop2x+4 491 orn %g0, $omask, $omask 492___ 493$::code.=<<___ if ($::evp); 494 st %f12, [$ivec + 0] 495 st %f13, [$ivec + 4] 496 st %f14, [$ivec + 8] 497 st %f15, [$ivec + 12] 498___ 499$::code.=<<___ if (!$::evp); 500 brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec 501 nop 502 503 std %f12, [$ivec + 0] ! write out ivec 504 std %f14, [$ivec + 8] 505 ret 506 restore 507 508.align 16 509.L${bits}_cbc_dec_unaligned_ivec: 510 alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec 511 mov 0xff, $omask 512 srl $omask, $ivoff, $omask 513 faligndata %f12, %f12, %f0 514 faligndata %f12, %f14, %f2 515 faligndata %f14, %f14, %f4 516 stda %f0, [$ivec + $omask]0xc0 517 std %f2, [$ivec + 8] 518 add $ivec, 16, $ivec 519 orn %g0, $omask, $omask 520 stda %f4, [$ivec + $omask]0xc0 521___ 522$::code.=<<___; 523 ret 524 restore 525 526!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 527.align 32 528.L${bits}cbc_dec_blk: 529 add $out, $len, $blk_init 530 and $blk_init, 63, $blk_init ! tail 531 sub $len, $blk_init, $len 532 add $blk_init, 15, $blk_init ! round up to 16n 533 srlx $len, 4, $len 534 srl $blk_init, 4, $blk_init 535 sub $len, 1, $len 536 add $blk_init, 1, $blk_init 537 538.L${bits}_cbc_dec_blk_loop2x: 539 ldx [$inp + 0], %o0 540 ldx [$inp + 8], %o1 541 ldx [$inp + 16], %o2 542 brz,pt $ileft, 5f 543 ldx [$inp + 24], %o3 544 545 ldx [$inp + 32], %o4 546 sllx %o0, $ileft, %o0 547 srlx %o1, $iright, %g1 548 or %g1, %o0, %o0 549 sllx %o1, $ileft, %o1 550 srlx %o2, $iright, %g1 551 or %g1, %o1, %o1 552 sllx %o2, $ileft, %o2 553 srlx %o3, $iright, %g1 554 or %g1, %o2, %o2 555 sllx %o3, $ileft, %o3 556 srlx %o4, $iright, %o4 557 or %o4, %o3, %o3 5585: 559 xor %g4, %o0, %o4 ! 
^= rk[0] 560 xor %g5, %o1, %o5 561 movxtod %o4, %f0 562 movxtod %o5, %f2 563 xor %g4, %o2, %o4 564 xor %g5, %o3, %o5 565 movxtod %o4, %f4 566 movxtod %o5, %f6 567 568 prefetch [$inp + 32+63], 20 569 call _${alg}${bits}_decrypt_2x 570 add $inp, 32, $inp 571 subcc $len, 2, $len 572 573 movxtod %o0, %f8 574 movxtod %o1, %f10 575 fxor %f12, %f0, %f0 ! ^= ivec 576 fxor %f14, %f2, %f2 577 movxtod %o2, %f12 578 movxtod %o3, %f14 579 fxor %f8, %f4, %f4 580 fxor %f10, %f6, %f6 581 582 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 583 add $out, 8, $out 584 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 585 add $out, 8, $out 586 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 587 add $out, 8, $out 588 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 589 bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x 590 add $out, 8, $out 591 592 add $blk_init, $len, $len 593 andcc $len, 1, %g0 ! is number of blocks even? 594 membar #StoreLoad|#StoreStore 595 bnz,pt %icc, .L${bits}_cbc_dec_loop 596 srl $len, 0, $len 597 brnz,pn $len, .L${bits}_cbc_dec_loop2x 598 nop 599___ 600$::code.=<<___ if ($::evp); 601 st %f12, [$ivec + 0] ! write out ivec 602 st %f13, [$ivec + 4] 603 st %f14, [$ivec + 8] 604 st %f15, [$ivec + 12] 605___ 606$::code.=<<___ if (!$::evp); 607 brnz,pn $ivoff, 3b 608 nop 609 610 std %f12, [$ivec + 0] ! write out ivec 611 std %f14, [$ivec + 8] 612___ 613$::code.=<<___; 614 ret 615 restore 616.type ${alg}${bits}_t4_cbc_decrypt,#function 617.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt 618___ 619} 620 621sub alg_ctr32_implement { 622my ($alg,$bits) = @_; 623 624$::code.=<<___; 625.globl ${alg}${bits}_t4_ctr32_encrypt 626.align 32 627${alg}${bits}_t4_ctr32_encrypt: 628 save %sp, -$::frame, %sp 629 srln $len, 0, $len ! needed on v8+, "nop" on v9 630 631 prefetch [$inp], 20 632 prefetch [$inp + 63], 20 633 call _${alg}${bits}_load_enckey 634 sllx $len, 4, $len 635 636 ld [$ivec + 0], %l4 ! 
counter 637 ld [$ivec + 4], %l5 638 ld [$ivec + 8], %l6 639 ld [$ivec + 12], %l7 640 641 sllx %l4, 32, %o5 642 or %l5, %o5, %o5 643 sllx %l6, 32, %g1 644 xor %o5, %g4, %g4 ! ^= rk[0] 645 xor %g1, %g5, %g5 646 movxtod %g4, %f14 ! most significant 64 bits 647 648 sub $inp, $out, $blk_init ! $inp!=$out 649 and $inp, 7, $ileft 650 andn $inp, 7, $inp 651 sll $ileft, 3, $ileft 652 mov 64, $iright 653 mov 0xff, $omask 654 sub $iright, $ileft, $iright 655 and $out, 7, $ooff 656 cmp $len, 255 657 movrnz $ooff, 0, $blk_init ! if ( $out&7 || 658 movleu $::size_t_cc, 0, $blk_init ! $len<256 || 659 brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out) 660 srl $omask, $ooff, $omask 661 662 andcc $len, 16, %g0 ! is number of blocks even? 663 alignaddrl $out, %g0, $out 664 bz %icc, .L${bits}_ctr32_loop2x 665 srlx $len, 4, $len 666.L${bits}_ctr32_loop: 667 ldx [$inp + 0], %o0 668 brz,pt $ileft, 4f 669 ldx [$inp + 8], %o1 670 671 ldx [$inp + 16], %o2 672 sllx %o0, $ileft, %o0 673 srlx %o1, $iright, %g1 674 sllx %o1, $ileft, %o1 675 or %g1, %o0, %o0 676 srlx %o2, $iright, %o2 677 or %o2, %o1, %o1 6784: 679 xor %g5, %l7, %g1 ! ^= rk[0] 680 add %l7, 1, %l7 681 movxtod %g1, %f2 682 srl %l7, 0, %l7 ! clruw 683 prefetch [$out + 63], 22 684 prefetch [$inp + 16+63], 20 685___ 686$::code.=<<___ if ($alg eq "aes"); 687 aes_eround01 %f16, %f14, %f2, %f4 688 aes_eround23 %f18, %f14, %f2, %f2 689___ 690$::code.=<<___ if ($alg eq "cmll"); 691 camellia_f %f16, %f2, %f14, %f2 692 camellia_f %f18, %f14, %f2, %f0 693___ 694$::code.=<<___; 695 call _${alg}${bits}_encrypt_1x+8 696 add $inp, 16, $inp 697 698 movxtod %o0, %f10 699 movxtod %o1, %f12 700 fxor %f10, %f0, %f0 ! ^= inp 701 fxor %f12, %f2, %f2 702 703 brnz,pn $ooff, 2f 704 sub $len, 1, $len 705 706 std %f0, [$out + 0] 707 std %f2, [$out + 8] 708 brnz,pt $len, .L${bits}_ctr32_loop2x 709 add $out, 16, $out 710 711 ret 712 restore 713 714.align 16 7152: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard 716 ! and ~3x deterioration 717 ! 
in inp==out case 718 faligndata %f0, %f0, %f4 ! handle unaligned output 719 faligndata %f0, %f2, %f6 720 faligndata %f2, %f2, %f8 721 stda %f4, [$out + $omask]0xc0 ! partial store 722 std %f6, [$out + 8] 723 add $out, 16, $out 724 orn %g0, $omask, $omask 725 stda %f8, [$out + $omask]0xc0 ! partial store 726 727 brnz,pt $len, .L${bits}_ctr32_loop2x+4 728 orn %g0, $omask, $omask 729 730 ret 731 restore 732 733!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 734.align 32 735.L${bits}_ctr32_loop2x: 736 ldx [$inp + 0], %o0 737 ldx [$inp + 8], %o1 738 ldx [$inp + 16], %o2 739 brz,pt $ileft, 4f 740 ldx [$inp + 24], %o3 741 742 ldx [$inp + 32], %o4 743 sllx %o0, $ileft, %o0 744 srlx %o1, $iright, %g1 745 or %g1, %o0, %o0 746 sllx %o1, $ileft, %o1 747 srlx %o2, $iright, %g1 748 or %g1, %o1, %o1 749 sllx %o2, $ileft, %o2 750 srlx %o3, $iright, %g1 751 or %g1, %o2, %o2 752 sllx %o3, $ileft, %o3 753 srlx %o4, $iright, %o4 754 or %o4, %o3, %o3 7554: 756 xor %g5, %l7, %g1 ! ^= rk[0] 757 add %l7, 1, %l7 758 movxtod %g1, %f2 759 srl %l7, 0, %l7 ! clruw 760 xor %g5, %l7, %g1 761 add %l7, 1, %l7 762 movxtod %g1, %f6 763 srl %l7, 0, %l7 ! clruw 764 prefetch [$out + 63], 22 765 prefetch [$inp + 32+63], 20 766___ 767$::code.=<<___ if ($alg eq "aes"); 768 aes_eround01 %f16, %f14, %f2, %f8 769 aes_eround23 %f18, %f14, %f2, %f2 770 aes_eround01 %f16, %f14, %f6, %f10 771 aes_eround23 %f18, %f14, %f6, %f6 772___ 773$::code.=<<___ if ($alg eq "cmll"); 774 camellia_f %f16, %f2, %f14, %f2 775 camellia_f %f16, %f6, %f14, %f6 776 camellia_f %f18, %f14, %f2, %f0 777 camellia_f %f18, %f14, %f6, %f4 778___ 779$::code.=<<___; 780 call _${alg}${bits}_encrypt_2x+16 781 add $inp, 32, $inp 782 783 movxtod %o0, %f8 784 movxtod %o1, %f10 785 movxtod %o2, %f12 786 fxor %f8, %f0, %f0 ! 
^= inp 787 movxtod %o3, %f8 788 fxor %f10, %f2, %f2 789 fxor %f12, %f4, %f4 790 fxor %f8, %f6, %f6 791 792 brnz,pn $ooff, 2f 793 sub $len, 2, $len 794 795 std %f0, [$out + 0] 796 std %f2, [$out + 8] 797 std %f4, [$out + 16] 798 std %f6, [$out + 24] 799 brnz,pt $len, .L${bits}_ctr32_loop2x 800 add $out, 32, $out 801 802 ret 803 restore 804 805.align 16 8062: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard 807 ! and ~3x deterioration 808 ! in inp==out case 809 faligndata %f0, %f0, %f8 ! handle unaligned output 810 faligndata %f0, %f2, %f0 811 faligndata %f2, %f4, %f2 812 faligndata %f4, %f6, %f4 813 faligndata %f6, %f6, %f6 814 815 stda %f8, [$out + $omask]0xc0 ! partial store 816 std %f0, [$out + 8] 817 std %f2, [$out + 16] 818 std %f4, [$out + 24] 819 add $out, 32, $out 820 orn %g0, $omask, $omask 821 stda %f6, [$out + $omask]0xc0 ! partial store 822 823 brnz,pt $len, .L${bits}_ctr32_loop2x+4 824 orn %g0, $omask, $omask 825 826 ret 827 restore 828 829!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 830.align 32 831.L${bits}_ctr32_blk: 832 add $out, $len, $blk_init 833 and $blk_init, 63, $blk_init ! tail 834 sub $len, $blk_init, $len 835 add $blk_init, 15, $blk_init ! round up to 16n 836 srlx $len, 4, $len 837 srl $blk_init, 4, $blk_init 838 sub $len, 1, $len 839 add $blk_init, 1, $blk_init 840 841.L${bits}_ctr32_blk_loop2x: 842 ldx [$inp + 0], %o0 843 ldx [$inp + 8], %o1 844 ldx [$inp + 16], %o2 845 brz,pt $ileft, 5f 846 ldx [$inp + 24], %o3 847 848 ldx [$inp + 32], %o4 849 sllx %o0, $ileft, %o0 850 srlx %o1, $iright, %g1 851 or %g1, %o0, %o0 852 sllx %o1, $ileft, %o1 853 srlx %o2, $iright, %g1 854 or %g1, %o1, %o1 855 sllx %o2, $ileft, %o2 856 srlx %o3, $iright, %g1 857 or %g1, %o2, %o2 858 sllx %o3, $ileft, %o3 859 srlx %o4, $iright, %o4 860 or %o4, %o3, %o3 8615: 862 xor %g5, %l7, %g1 ! ^= rk[0] 863 add %l7, 1, %l7 864 movxtod %g1, %f2 865 srl %l7, 0, %l7 ! 
clruw 866 xor %g5, %l7, %g1 867 add %l7, 1, %l7 868 movxtod %g1, %f6 869 srl %l7, 0, %l7 ! clruw 870 prefetch [$inp + 32+63], 20 871___ 872$::code.=<<___ if ($alg eq "aes"); 873 aes_eround01 %f16, %f14, %f2, %f8 874 aes_eround23 %f18, %f14, %f2, %f2 875 aes_eround01 %f16, %f14, %f6, %f10 876 aes_eround23 %f18, %f14, %f6, %f6 877___ 878$::code.=<<___ if ($alg eq "cmll"); 879 camellia_f %f16, %f2, %f14, %f2 880 camellia_f %f16, %f6, %f14, %f6 881 camellia_f %f18, %f14, %f2, %f0 882 camellia_f %f18, %f14, %f6, %f4 883___ 884$::code.=<<___; 885 call _${alg}${bits}_encrypt_2x+16 886 add $inp, 32, $inp 887 subcc $len, 2, $len 888 889 movxtod %o0, %f8 890 movxtod %o1, %f10 891 movxtod %o2, %f12 892 fxor %f8, %f0, %f0 ! ^= inp 893 movxtod %o3, %f8 894 fxor %f10, %f2, %f2 895 fxor %f12, %f4, %f4 896 fxor %f8, %f6, %f6 897 898 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 899 add $out, 8, $out 900 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 901 add $out, 8, $out 902 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 903 add $out, 8, $out 904 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 905 bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x 906 add $out, 8, $out 907 908 add $blk_init, $len, $len 909 andcc $len, 1, %g0 ! is number of blocks even? 910 membar #StoreLoad|#StoreStore 911 bnz,pt %icc, .L${bits}_ctr32_loop 912 srl $len, 0, $len 913 brnz,pn $len, .L${bits}_ctr32_loop2x 914 nop 915 916 ret 917 restore 918.type ${alg}${bits}_t4_ctr32_encrypt,#function 919.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt 920___ 921} 922 923sub alg_xts_implement { 924my ($alg,$bits,$dir) = @_; 925my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5)); 926my $rem=$ivec; 927 928$::code.=<<___; 929.globl ${alg}${bits}_t4_xts_${dir}crypt 930.align 32 931${alg}${bits}_t4_xts_${dir}crypt: 932 save %sp, -$::frame-16, %sp 933 srln $len, 0, $len ! 
needed on v8+, "nop" on v9 934 935 mov $ivec, %o0 936 add %fp, $::bias-16, %o1 937 call ${alg}_t4_encrypt 938 mov $key2, %o2 939 940 add %fp, $::bias-16, %l7 941 ldxa [%l7]0x88, %g2 942 add %fp, $::bias-8, %l7 943 ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak 944 945 sethi %hi(0x76543210), %l7 946 or %l7, %lo(0x76543210), %l7 947 bmask %l7, %g0, %g0 ! byte swap mask 948 949 prefetch [$inp], 20 950 prefetch [$inp + 63], 20 951 call _${alg}${bits}_load_${dir}ckey 952 and $len, 15, $rem 953 and $len, -16, $len 954___ 955$code.=<<___ if ($dir eq "de"); 956 mov 0, %l7 957 movrnz $rem, 16, %l7 958 sub $len, %l7, $len 959___ 960$code.=<<___; 961 962 sub $inp, $out, $blk_init ! $inp!=$out 963 and $inp, 7, $ileft 964 andn $inp, 7, $inp 965 sll $ileft, 3, $ileft 966 mov 64, $iright 967 mov 0xff, $omask 968 sub $iright, $ileft, $iright 969 and $out, 7, $ooff 970 cmp $len, 255 971 movrnz $ooff, 0, $blk_init ! if ( $out&7 || 972 movleu $::size_t_cc, 0, $blk_init ! $len<256 || 973 brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out) 974 srl $omask, $ooff, $omask 975 976 andcc $len, 16, %g0 ! is number of blocks even? 977___ 978$code.=<<___ if ($dir eq "de"); 979 brz,pn $len, .L${bits}_xts_${dir}steal 980___ 981$code.=<<___; 982 alignaddrl $out, %g0, $out 983 bz %icc, .L${bits}_xts_${dir}loop2x 984 srlx $len, 4, $len 985.L${bits}_xts_${dir}loop: 986 ldx [$inp + 0], %o0 987 brz,pt $ileft, 4f 988 ldx [$inp + 8], %o1 989 990 ldx [$inp + 16], %o2 991 sllx %o0, $ileft, %o0 992 srlx %o1, $iright, %g1 993 sllx %o1, $ileft, %o1 994 or %g1, %o0, %o0 995 srlx %o2, $iright, %o2 996 or %o2, %o1, %o1 9974: 998 movxtod %g2, %f12 999 movxtod %g3, %f14 1000 bshuffle %f12, %f12, %f12 1001 bshuffle %f14, %f14, %f14 1002 1003 xor %g4, %o0, %o0 ! ^= rk[0] 1004 xor %g5, %o1, %o1 1005 movxtod %o0, %f0 1006 movxtod %o1, %f2 1007 1008 fxor %f12, %f0, %f0 ! 
^= tweak[0] 1009 fxor %f14, %f2, %f2 1010 1011 prefetch [$out + 63], 22 1012 prefetch [$inp + 16+63], 20 1013 call _${alg}${bits}_${dir}crypt_1x 1014 add $inp, 16, $inp 1015 1016 fxor %f12, %f0, %f0 ! ^= tweak[0] 1017 fxor %f14, %f2, %f2 1018 1019 srax %g3, 63, %l7 ! next tweak value 1020 addcc %g2, %g2, %g2 1021 and %l7, 0x87, %l7 1022 addxc %g3, %g3, %g3 1023 xor %l7, %g2, %g2 1024 1025 brnz,pn $ooff, 2f 1026 sub $len, 1, $len 1027 1028 std %f0, [$out + 0] 1029 std %f2, [$out + 8] 1030 brnz,pt $len, .L${bits}_xts_${dir}loop2x 1031 add $out, 16, $out 1032 1033 brnz,pn $rem, .L${bits}_xts_${dir}steal 1034 nop 1035 1036 ret 1037 restore 1038 1039.align 16 10402: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard 1041 ! and ~3x deterioration 1042 ! in inp==out case 1043 faligndata %f0, %f0, %f4 ! handle unaligned output 1044 faligndata %f0, %f2, %f6 1045 faligndata %f2, %f2, %f8 1046 stda %f4, [$out + $omask]0xc0 ! partial store 1047 std %f6, [$out + 8] 1048 add $out, 16, $out 1049 orn %g0, $omask, $omask 1050 stda %f8, [$out + $omask]0xc0 ! partial store 1051 1052 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4 1053 orn %g0, $omask, $omask 1054 1055 brnz,pn $rem, .L${bits}_xts_${dir}steal 1056 nop 1057 1058 ret 1059 restore 1060 1061!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1062.align 32 1063.L${bits}_xts_${dir}loop2x: 1064 ldx [$inp + 0], %o0 1065 ldx [$inp + 8], %o1 1066 ldx [$inp + 16], %o2 1067 brz,pt $ileft, 4f 1068 ldx [$inp + 24], %o3 1069 1070 ldx [$inp + 32], %o4 1071 sllx %o0, $ileft, %o0 1072 srlx %o1, $iright, %g1 1073 or %g1, %o0, %o0 1074 sllx %o1, $ileft, %o1 1075 srlx %o2, $iright, %g1 1076 or %g1, %o1, %o1 1077 sllx %o2, $ileft, %o2 1078 srlx %o3, $iright, %g1 1079 or %g1, %o2, %o2 1080 sllx %o3, $ileft, %o3 1081 srlx %o4, $iright, %o4 1082 or %o4, %o3, %o3 10834: 1084 movxtod %g2, %f12 1085 movxtod %g3, %f14 1086 bshuffle %f12, %f12, %f12 1087 bshuffle %f14, %f14, %f14 1088 1089 srax %g3, 63, %l7 ! 
next tweak value 1090 addcc %g2, %g2, %g2 1091 and %l7, 0x87, %l7 1092 addxc %g3, %g3, %g3 1093 xor %l7, %g2, %g2 1094 1095 movxtod %g2, %f8 1096 movxtod %g3, %f10 1097 bshuffle %f8, %f8, %f8 1098 bshuffle %f10, %f10, %f10 1099 1100 xor %g4, %o0, %o0 ! ^= rk[0] 1101 xor %g5, %o1, %o1 1102 xor %g4, %o2, %o2 ! ^= rk[0] 1103 xor %g5, %o3, %o3 1104 movxtod %o0, %f0 1105 movxtod %o1, %f2 1106 movxtod %o2, %f4 1107 movxtod %o3, %f6 1108 1109 fxor %f12, %f0, %f0 ! ^= tweak[0] 1110 fxor %f14, %f2, %f2 1111 fxor %f8, %f4, %f4 ! ^= tweak[0] 1112 fxor %f10, %f6, %f6 1113 1114 prefetch [$out + 63], 22 1115 prefetch [$inp + 32+63], 20 1116 call _${alg}${bits}_${dir}crypt_2x 1117 add $inp, 32, $inp 1118 1119 movxtod %g2, %f8 1120 movxtod %g3, %f10 1121 1122 srax %g3, 63, %l7 ! next tweak value 1123 addcc %g2, %g2, %g2 1124 and %l7, 0x87, %l7 1125 addxc %g3, %g3, %g3 1126 xor %l7, %g2, %g2 1127 1128 bshuffle %f8, %f8, %f8 1129 bshuffle %f10, %f10, %f10 1130 1131 fxor %f12, %f0, %f0 ! ^= tweak[0] 1132 fxor %f14, %f2, %f2 1133 fxor %f8, %f4, %f4 1134 fxor %f10, %f6, %f6 1135 1136 brnz,pn $ooff, 2f 1137 sub $len, 2, $len 1138 1139 std %f0, [$out + 0] 1140 std %f2, [$out + 8] 1141 std %f4, [$out + 16] 1142 std %f6, [$out + 24] 1143 brnz,pt $len, .L${bits}_xts_${dir}loop2x 1144 add $out, 32, $out 1145 1146 fsrc2 %f4, %f0 1147 fsrc2 %f6, %f2 1148 brnz,pn $rem, .L${bits}_xts_${dir}steal 1149 nop 1150 1151 ret 1152 restore 1153 1154.align 16 11552: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard 1156 ! and ~3x deterioration 1157 ! in inp==out case 1158 faligndata %f0, %f0, %f8 ! handle unaligned output 1159 faligndata %f0, %f2, %f10 1160 faligndata %f2, %f4, %f12 1161 faligndata %f4, %f6, %f14 1162 faligndata %f6, %f6, %f0 1163 1164 stda %f8, [$out + $omask]0xc0 ! partial store 1165 std %f10, [$out + 8] 1166 std %f12, [$out + 16] 1167 std %f14, [$out + 24] 1168 add $out, 32, $out 1169 orn %g0, $omask, $omask 1170 stda %f0, [$out + $omask]0xc0 ! 
partial store 1171 1172 brnz,pt $len, .L${bits}_xts_${dir}loop2x+4 1173 orn %g0, $omask, $omask 1174 1175 fsrc2 %f4, %f0 1176 fsrc2 %f6, %f2 1177 brnz,pn $rem, .L${bits}_xts_${dir}steal 1178 nop 1179 1180 ret 1181 restore 1182 1183!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1184.align 32 1185.L${bits}_xts_${dir}blk: 1186 add $out, $len, $blk_init 1187 and $blk_init, 63, $blk_init ! tail 1188 sub $len, $blk_init, $len 1189 add $blk_init, 15, $blk_init ! round up to 16n 1190 srlx $len, 4, $len 1191 srl $blk_init, 4, $blk_init 1192 sub $len, 1, $len 1193 add $blk_init, 1, $blk_init 1194 1195.L${bits}_xts_${dir}blk2x: 1196 ldx [$inp + 0], %o0 1197 ldx [$inp + 8], %o1 1198 ldx [$inp + 16], %o2 1199 brz,pt $ileft, 5f 1200 ldx [$inp + 24], %o3 1201 1202 ldx [$inp + 32], %o4 1203 sllx %o0, $ileft, %o0 1204 srlx %o1, $iright, %g1 1205 or %g1, %o0, %o0 1206 sllx %o1, $ileft, %o1 1207 srlx %o2, $iright, %g1 1208 or %g1, %o1, %o1 1209 sllx %o2, $ileft, %o2 1210 srlx %o3, $iright, %g1 1211 or %g1, %o2, %o2 1212 sllx %o3, $ileft, %o3 1213 srlx %o4, $iright, %o4 1214 or %o4, %o3, %o3 12155: 1216 movxtod %g2, %f12 1217 movxtod %g3, %f14 1218 bshuffle %f12, %f12, %f12 1219 bshuffle %f14, %f14, %f14 1220 1221 srax %g3, 63, %l7 ! next tweak value 1222 addcc %g2, %g2, %g2 1223 and %l7, 0x87, %l7 1224 addxc %g3, %g3, %g3 1225 xor %l7, %g2, %g2 1226 1227 movxtod %g2, %f8 1228 movxtod %g3, %f10 1229 bshuffle %f8, %f8, %f8 1230 bshuffle %f10, %f10, %f10 1231 1232 xor %g4, %o0, %o0 ! ^= rk[0] 1233 xor %g5, %o1, %o1 1234 xor %g4, %o2, %o2 ! ^= rk[0] 1235 xor %g5, %o3, %o3 1236 movxtod %o0, %f0 1237 movxtod %o1, %f2 1238 movxtod %o2, %f4 1239 movxtod %o3, %f6 1240 1241 fxor %f12, %f0, %f0 ! ^= tweak[0] 1242 fxor %f14, %f2, %f2 1243 fxor %f8, %f4, %f4 ! 
^= tweak[0] 1244 fxor %f10, %f6, %f6 1245 1246 prefetch [$inp + 32+63], 20 1247 call _${alg}${bits}_${dir}crypt_2x 1248 add $inp, 32, $inp 1249 1250 movxtod %g2, %f8 1251 movxtod %g3, %f10 1252 1253 srax %g3, 63, %l7 ! next tweak value 1254 addcc %g2, %g2, %g2 1255 and %l7, 0x87, %l7 1256 addxc %g3, %g3, %g3 1257 xor %l7, %g2, %g2 1258 1259 bshuffle %f8, %f8, %f8 1260 bshuffle %f10, %f10, %f10 1261 1262 fxor %f12, %f0, %f0 ! ^= tweak[0] 1263 fxor %f14, %f2, %f2 1264 fxor %f8, %f4, %f4 1265 fxor %f10, %f6, %f6 1266 1267 subcc $len, 2, $len 1268 stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 1269 add $out, 8, $out 1270 stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 1271 add $out, 8, $out 1272 stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 1273 add $out, 8, $out 1274 stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific 1275 bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x 1276 add $out, 8, $out 1277 1278 add $blk_init, $len, $len 1279 andcc $len, 1, %g0 ! is number of blocks even? 1280 membar #StoreLoad|#StoreStore 1281 bnz,pt %icc, .L${bits}_xts_${dir}loop 1282 srl $len, 0, $len 1283 brnz,pn $len, .L${bits}_xts_${dir}loop2x 1284 nop 1285 1286 fsrc2 %f4, %f0 1287 fsrc2 %f6, %f2 1288 brnz,pn $rem, .L${bits}_xts_${dir}steal 1289 nop 1290 1291 ret 1292 restore 1293!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1294___ 1295$code.=<<___ if ($dir eq "en"); 1296.align 32 1297.L${bits}_xts_${dir}steal: 1298 std %f0, [%fp + $::bias-16] ! copy of output 1299 std %f2, [%fp + $::bias-8] 1300 1301 srl $ileft, 3, $ileft 1302 add %fp, $::bias-16, %l7 1303 add $inp, $ileft, $inp ! original $inp+$len&-15 1304 add $out, $ooff, $out ! original $out+$len&-15 1305 mov 0, $ileft 1306 nop ! 
align 1307 1308.L${bits}_xts_${dir}stealing: 1309 ldub [$inp + $ileft], %o0 1310 ldub [%l7 + $ileft], %o1 1311 dec $rem 1312 stb %o0, [%l7 + $ileft] 1313 stb %o1, [$out + $ileft] 1314 brnz $rem, .L${bits}_xts_${dir}stealing 1315 inc $ileft 1316 1317 mov %l7, $inp 1318 sub $out, 16, $out 1319 mov 0, $ileft 1320 sub $out, $ooff, $out 1321 ba .L${bits}_xts_${dir}loop ! one more time 1322 mov 1, $len ! $rem is 0 1323___ 1324$code.=<<___ if ($dir eq "de"); 1325.align 32 1326.L${bits}_xts_${dir}steal: 1327 ldx [$inp + 0], %o0 1328 brz,pt $ileft, 8f 1329 ldx [$inp + 8], %o1 1330 1331 ldx [$inp + 16], %o2 1332 sllx %o0, $ileft, %o0 1333 srlx %o1, $iright, %g1 1334 sllx %o1, $ileft, %o1 1335 or %g1, %o0, %o0 1336 srlx %o2, $iright, %o2 1337 or %o2, %o1, %o1 13388: 1339 srax %g3, 63, %l7 ! next tweak value 1340 addcc %g2, %g2, %o2 1341 and %l7, 0x87, %l7 1342 addxc %g3, %g3, %o3 1343 xor %l7, %o2, %o2 1344 1345 movxtod %o2, %f12 1346 movxtod %o3, %f14 1347 bshuffle %f12, %f12, %f12 1348 bshuffle %f14, %f14, %f14 1349 1350 xor %g4, %o0, %o0 ! ^= rk[0] 1351 xor %g5, %o1, %o1 1352 movxtod %o0, %f0 1353 movxtod %o1, %f2 1354 1355 fxor %f12, %f0, %f0 ! ^= tweak[0] 1356 fxor %f14, %f2, %f2 1357 1358 call _${alg}${bits}_${dir}crypt_1x 1359 add $inp, 16, $inp 1360 1361 fxor %f12, %f0, %f0 ! ^= tweak[0] 1362 fxor %f14, %f2, %f2 1363 1364 std %f0, [%fp + $::bias-16] 1365 std %f2, [%fp + $::bias-8] 1366 1367 srl $ileft, 3, $ileft 1368 add %fp, $::bias-16, %l7 1369 add $inp, $ileft, $inp ! original $inp+$len&-15 1370 add $out, $ooff, $out ! original $out+$len&-15 1371 mov 0, $ileft 1372 add $out, 16, $out 1373 nop ! align 1374 1375.L${bits}_xts_${dir}stealing: 1376 ldub [$inp + $ileft], %o0 1377 ldub [%l7 + $ileft], %o1 1378 dec $rem 1379 stb %o0, [%l7 + $ileft] 1380 stb %o1, [$out + $ileft] 1381 brnz $rem, .L${bits}_xts_${dir}stealing 1382 inc $ileft 1383 1384 mov %l7, $inp 1385 sub $out, 16, $out 1386 mov 0, $ileft 1387 sub $out, $ooff, $out 1388 ba .L${bits}_xts_${dir}loop ! 
one more time
	mov	1, $len				! $rem is 0
___
$code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.

# unvis: encode a 3-operand VIS FP instruction ("mnemonic %fA,%fB,%fD")
# as a raw ".word" directive, keeping the textual form as a trailing
# "!" comment.  Returns the input unchanged if the mnemonic is unknown
# or any operand does not parse as an %f register.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
# opf field values for the VIS instructions this module emits
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";	# fallback text / asm comment

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);	# upper-bank doubles must be even-numbered
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# unvis3: same idea for 3-operand VIS3 integer instructions; operands are
# %g/%o/%l/%i registers, mapped to their 5-bit encodings via %bias.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
# opf field values for the VIS3 instructions this module emits
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";	# fallback text / asm comment

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;		# register name -> 5-bit number
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_round {	# 4-argument instructions
my
($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
# opf field values for the T4 AES round instructions
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";	# fallback text / asm comment

    if (defined($opf=$aesopf{$mnemonic})) {
	# $rs3 may be a bare index (kexpand1) or an %f register; re-encode
	# even upper-bank registers, otherwise pass the value through as-is
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);	# upper-bank doubles must be even-numbered
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# unaes_kexpand: encode the 3-operand AES key-expansion instructions.
sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";	# fallback text / asm comment

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);	# upper-bank doubles must be even-numbered
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# uncamellia_f: encode the single 4-operand Camellia F instruction
# (fixed opf, hence the unconditional "if (1)" branch).
sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";	# fallback text / asm comment

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ?
(($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);	# upper-bank doubles must be even-numbered
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# uncamellia3: encode the 3-operand Camellia FL/FLI instructions.
sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";	# fallback text / asm comment

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);	# upper-bank doubles must be even-numbered
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# unmovxtox: encode the 2-operand VIS3 FP<->integer move instructions;
# operands may be %f or %g/%o/%l/%i registers ("f" carries zero bias).
sub unmovxtox {	# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119	);

    $ref = "$mnemonic\t$rs,$rd";	# fallback text / asm comment

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);	# upper-bank doubles must be even-numbered
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}

# undes: encode the T4 DES instructions; des_round takes 4 operands,
# des_kexpand 3 (one of them a bare round index), des_ip/des_iip 2.
sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (	"des_round"	=> 0b1001,
		"des_ip"	=>
0b100110100,
		"des_iip"	=> 0b100110101,
		"des_kexpand"	=> 0b100110110	);

    $ref = "$mnemonic\t".join(",",@_);	# fallback text / asm comment

    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
	if ($mnemonic eq "des_round") {
	    foreach (@args[0..3]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);	# upper-bank doubles must be even-numbered
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
			    $ref;
	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
	    foreach (@args[0..2]) {
		# second operand is a bare round index, hence optional "%f"
		return $ref if (!/(%f)?([0-9]{1,2})/);
		$_=$2;
		if ($2>=32) {
		    return $ref if ($2&1);	# upper-bank doubles must be even-numbered
		    # re-encode for upper double register addressing
		    $_=($2|$2>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
			    $ref;
	} else {				# 2-arg
	    foreach (@args[0..1]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    # was "$2&1": the regex above has only ONE capture group,
		    # so $2 held a stale capture from an earlier match and the
		    # odd-upper-register bailout never worked; use $1 as in
		    # the des_round branch above
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
			    $ref;
	}
    } else {
	return $ref;
    }
}

# emit_assembler: post-process $::code line by line, expanding `...`
# interpolations and rewriting T4/VIS mnemonics into raw ".word"
# directives via the un* helpers above, then print the result.
sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	# canonicalize 2-operand fmov-style aliases to 3-operand form
	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo or
s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo or
	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
		&undes($1,$2,$3,$4,$5)
	 /geo or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";	# emit the (possibly rewritten) assembly line
    }
}

1;	# module must return true