#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2016
#
# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
# required key setup and single-block procedures.
#
# April 2016
#
# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
# that parallelizable nature of CBC decrypt and CTR is not utilized
# yet. CBC encrypt on the other hand is as good as it can possibly
# get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X.
# This is ~6x faster than pure software implementation...
#
# July 2016
#
# Switch from faligndata to fshiftorx, which allows to omit alignaddr
# instructions and improve single-block and short-input performance
# with misaligned data.

# Output assembly file is the last command-line argument; fail loudly
# instead of silently generating nothing if it cannot be opened.
$output = pop;
open STDOUT, ">", $output or die "can't open $output: $!";

# Single-block aes_fx_encrypt/aes_fx_decrypt. Arguments arrive in
# %o0-%o5 per the SPARC calling convention.
{
my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));

$code.=<<___;
#include "sparc_arch.h"

#define LOCALS (STACK_BIAS+STACK_FRAME)

.text

.globl aes_fx_encrypt
.align 32
aes_fx_encrypt:
	and	$inp, 7, $tmp		! is input aligned?
	andn	$inp, 7, $inp
	ldd	[$key + 0], %f6		! round[0]
	ldd	[$key + 8], %f8
	mov	%o7, %g1
	ld	[$key + 240], $rounds

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	sll	$tmp, 3, $tmp
	ldd	[$inp + 0], %f0		! load input
	brz,pt	$tmp, .Lenc_inp_aligned
	ldd	[$inp + 8], %f2

	ldd	[%o7 + $tmp], %f14	! shift left params
	ldd	[$inp + 16], %f4
	fshiftorx %f0, %f2, %f14, %f0
	fshiftorx %f2, %f4, %f14, %f2

.Lenc_inp_aligned:
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	%f0, %f6, %f0		! ^=round[0]
	fxor	%f2, %f8, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $key
	sub	$rounds, 4, $rounds

.Loop_enc:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10
	ldd	[$key + 24], %f12
	add	$key, 32, $key

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$key + 0], %f6
	ldd	[$key + 8], %f8

	brnz,a	$rounds, .Loop_enc
	sub	$rounds, 2, $rounds

	andcc	$out, 7, $tmp		! is output aligned?
	andn	$out, 7, $out
	mov	0xff, $mask
	srl	$mask, $tmp, $mask
	add	%o7, 64, %o7
	sll	$tmp, 3, $tmp

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[%o7 + $tmp], %f14	! shift right params

	fmovd	%f0, %f4
	faesenclx %f2, %f6, %f0
	faesenclx %f4, %f8, %f2

	bnz,pn	%icc, .Lenc_out_unaligned
	mov	%g1, %o7

	std	%f0, [$out + 0]
	retl
	std	%f2, [$out + 8]

.align	16
.Lenc_out_unaligned:
	add	$out, 16, $inp
	orn	%g0, $mask, $tmp
	fshiftorx %f0, %f0, %f14, %f4
	fshiftorx %f0, %f2, %f14, %f6
	fshiftorx %f2, %f2, %f14, %f8

	stda	%f4, [$out + $mask]0xc0	! partial store
	std	%f6, [$out + 8]
	stda	%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_encrypt,#function
.size	aes_fx_encrypt,.-aes_fx_encrypt

.globl aes_fx_decrypt
.align 32
aes_fx_decrypt:
	and	$inp, 7, $tmp		! is input aligned?
	andn	$inp, 7, $inp
	ldd	[$key + 0], %f6		! round[0]
	ldd	[$key + 8], %f8
	mov	%o7, %g1
	ld	[$key + 240], $rounds

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	sll	$tmp, 3, $tmp
	ldd	[$inp + 0], %f0		! load input
	brz,pt	$tmp, .Ldec_inp_aligned
	ldd	[$inp + 8], %f2

	ldd	[%o7 + $tmp], %f14	! shift left params
	ldd	[$inp + 16], %f4
	fshiftorx %f0, %f2, %f14, %f0
	fshiftorx %f2, %f4, %f14, %f2

.Ldec_inp_aligned:
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	%f0, %f6, %f0		! ^=round[0]
	fxor	%f2, %f8, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $key
	sub	$rounds, 4, $rounds

.Loop_dec:
	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 16], %f10
	ldd	[$key + 24], %f12
	add	$key, 32, $key

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$key + 0], %f6
	ldd	[$key + 8], %f8

	brnz,a	$rounds, .Loop_dec
	sub	$rounds, 2, $rounds

	andcc	$out, 7, $tmp		! is output aligned?
	andn	$out, 7, $out
	mov	0xff, $mask
	srl	$mask, $tmp, $mask
	add	%o7, 64, %o7
	sll	$tmp, 3, $tmp

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[%o7 + $tmp], %f14	! shift right params

	fmovd	%f0, %f4
	faesdeclx %f2, %f6, %f0
	faesdeclx %f4, %f8, %f2

	bnz,pn	%icc, .Ldec_out_unaligned
	mov	%g1, %o7

	std	%f0, [$out + 0]
	retl
	std	%f2, [$out + 8]

.align	16
.Ldec_out_unaligned:
	add	$out, 16, $inp
	orn	%g0, $mask, $tmp
	fshiftorx %f0, %f0, %f14, %f4
	fshiftorx %f0, %f2, %f14, %f6
	fshiftorx %f2, %f2, %f14, %f8

	stda	%f4, [$out + $mask]0xc0	! partial store
	std	%f6, [$out + 8]
	stda	%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_decrypt,#function
.size	aes_fx_decrypt,.-aes_fx_decrypt
___
}
# Key schedule setup: aes_fx_set_encrypt_key writes the schedule
# forward, aes_fx_set_decrypt_key branches in with $inc=-1 to write
# it in reverse order (same faeskeyx expansion either way).
{
my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
$code.=<<___;
.globl aes_fx_set_decrypt_key
.align 32
aes_fx_set_decrypt_key:
	b	.Lset_encrypt_key
	mov	-1, $inc
	retl
	nop
.type	aes_fx_set_decrypt_key,#function
.size	aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key

.globl aes_fx_set_encrypt_key
.align 32
aes_fx_set_encrypt_key:
	mov	1, $inc
	nop
.Lset_encrypt_key:
	and	$inp, 7, $tmp
	andn	$inp, 7, $inp
	sll	$tmp, 3, $tmp
	mov	%o7, %g1

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	ldd	[%o7 + $tmp], %f10	! shift left params
	mov	%g1, %o7

	cmp	$bits, 192
	ldd	[$inp + 0], %f0
	bl,pt	%icc, .L128
	ldd	[$inp + 8], %f2

	be,pt	%icc, .L192
	ldd	[$inp + 16], %f4
	brz,pt	$tmp, .L256aligned
	ldd	[$inp + 24], %f6

	ldd	[$inp + 32], %f8
	fshiftorx %f0, %f2, %f10, %f0
	fshiftorx %f2, %f4, %f10, %f2
	fshiftorx %f4, %f6, %f10, %f4
	fshiftorx %f6, %f8, %f10, %f6

.L256aligned:
	mov	14, $bits
	and	$inc, `14*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f6, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
	std	%f4, [$out + 0]
	faeskeyx %f2, 0x01, %f4
	std	%f6, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f4, 0x00, %f6
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f6, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
	std	%f4,[$out + 0]
	std	%f6,[$out + 8]
	add	$out, $inc, $out
	std	%f0,[$out + 0]
	std	%f2,[$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0

.align	16
.L192:
	brz,pt	$tmp, .L192aligned
	nop

	ldd	[$inp + 24], %f6
	fshiftorx %f0, %f2, %f10, %f0
	fshiftorx %f2, %f4, %f10, %f2
	fshiftorx %f4, %f6, %f10, %f4

.L192aligned:
	mov	12, $bits
	and	$inc, `12*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<8; $i+=2) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f4, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
	std	%f4, [$out + 0]
	faeskeyx %f2, 0x00, %f4
	std	%f0, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f4, `0x10+$i+1`, %f0
	std	%f2, [$out + 0]
	faeskeyx %f0, 0x00, %f2
	std	%f4, [$out + 8]
	add	$out, $inc, $out
___
$code.=<<___	if ($i<6);
	faeskeyx %f2, 0x00, %f4
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0

.align	16
.L128:
	brz,pt	$tmp, .L128aligned
	nop

	ldd	[$inp + 16], %f4
	fshiftorx %f0, %f2, %f10, %f0
	fshiftorx %f2, %f4, %f10, %f2

.L128aligned:
	mov	10, $bits
	and	$inc, `10*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f2, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0
.type	aes_fx_set_encrypt_key,#function
.size	aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
___
}
# CBC mode. Arguments arrive in %i-registers (leaf depth requires a
# register window: note the "save" in the prologue); even-numbered
# %f16-%f62 doubles hold IV, round[0]/round[last] and staged input.
{
my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign,$oalign);

$code.=<<___;
.globl aes_fx_cbc_encrypt
.align 32
aes_fx_cbc_encrypt:
	save	%sp, -STACK_FRAME-16, %sp
	srln	$len, 4, $len
	and	$inp, 7, $ialign
	andn	$inp, 7, $inp
	brz,pn	$len, .Lcbc_no_data
	sll	$ialign, 3, $ileft

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	ld	[$key + 240], $rounds
	and	$out, 7, $oalign
	ld	[$ivp + 0], %f0		! load ivec
	andn	$out, 7, $out
	ld	[$ivp + 4], %f1
	sll	$oalign, 3, $mask
	ld	[$ivp + 8], %f2
	ld	[$ivp + 12], %f3

	sll	$rounds, 4, $rounds
	add	$rounds, $key, $end
	ldd	[$key + 0], $r0hi	! round[0]
	ldd	[$key + 8], $r0lo

	add	$inp, 16, $inp
	sub	$len, 1, $len
	ldd	[$end + 0], $rlhi	! round[last]
	ldd	[$end + 8], $rllo

	mov	16, $inc
	movrz	$len, 0, $inc
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	ldd	[%o7 + $ileft], $fshift	! shift left params
	add	%o7, 64, %o7
	ldd	[$inp - 16], $in0	! load input
	ldd	[$inp - 8], $in1
	ldda	[$inp]0x82, $intail	! non-faulting load
	brz	$dir, .Lcbc_decrypt
	add	$inp, $inc, $inp	! inp+=16

	fxor	$r0hi, %f0, %f0		! ivec^=round[0]
	fxor	$r0lo, %f2, %f2
	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1
	nop

.Loop_cbc_enc:
	fxor	$in0, %f0, %f0		! inp^ivec^round[0]
	fxor	$in1, %f2, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lcbc_enc:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_enc
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	$r0hi, $in0, $in0	! inp^=round[0]
	fxor	$r0lo, $in1, $in1

	fmovd	%f0, %f4
	faesenclx %f2, $rlhi, %f0
	faesenclx %f4, $rllo, %f2

	brnz,pn	$oalign, .Lcbc_enc_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_enc
	sub	$len, 1, $len

	st	%f0, [$ivp + 0]		! output ivec
	st	%f1, [$ivp + 4]
	st	%f2, [$ivp + 8]
	st	%f3, [$ivp + 12]

.Lcbc_no_data:
	ret
	restore

.align	32
.Lcbc_enc_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx %f0, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lcbc_enc_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_cbc_enc_unaligned_out
	nop

.align	32
.Loop_cbc_enc_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$in0, %f0, %f0		! inp^ivec^round[0]
	fxor	$in1, %f2, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lcbc_enc_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lcbc_enc_aligned_inp:
	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_enc_unaligned:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_enc_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2

	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	$r0hi, $in0, $in0	! inp^=round[0]
	fxor	$r0lo, $in1, $in1

	fmovd	%f0, %f4
	faesenclx %f2, $rlhi, %f0
	faesenclx %f4, $rllo, %f2

	fshiftorx $outhead, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_enc_unaligned_out
	sub	$len, 1, $len

.Lcbc_enc_unaligned_out_done:
	fshiftorx %f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	st	%f0, [$ivp + 0]		! output ivec
	st	%f1, [$ivp + 4]
	st	%f2, [$ivp + 8]
	st	%f3, [$ivp + 12]

	ret
	restore

.align	32
.Lcbc_decrypt:
	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1
	fmovd	%f0, $iv0
	fmovd	%f2, $iv1

.Loop_cbc_dec:
	fxor	$in0, $r0hi, %f0	! inp^round[0]
	fxor	$in1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lcbc_dec:
	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_dec
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	fxor	$iv0, $rlhi, %f6	! ivec^round[last]
	fxor	$iv1, $rllo, %f8
	fmovd	$in0, $iv0
	fmovd	$in1, $iv1

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1

	fmovd	%f0, %f4
	faesdeclx %f2, %f6, %f0
	faesdeclx %f4, %f8, %f2

	brnz,pn	$oalign, .Lcbc_dec_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_dec
	sub	$len, 1, $len

	st	$iv0, [$ivp + 0]	! output ivec
	st	$iv0#lo, [$ivp + 4]
	st	$iv1, [$ivp + 8]
	st	$iv1#lo, [$ivp + 12]

	ret
	restore

.align	32
.Lcbc_dec_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx %f0, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lcbc_dec_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_cbc_dec_unaligned_out
	nop

.align	32
.Loop_cbc_dec_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$in0, $r0hi, %f0	! inp^round[0]
	fxor	$in1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lcbc_dec_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lcbc_dec_aligned_inp:
	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_dec_unaligned:
	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_dec_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2

	fxor	$iv0, $rlhi, %f6	! ivec^round[last]
	fxor	$iv1, $rllo, %f8
	fmovd	$in0, $iv0
	fmovd	$in1, $iv1
	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fmovd	%f0, %f4
	faesdeclx %f2, %f6, %f0
	faesdeclx %f4, %f8, %f2

	fshiftorx $outhead, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_dec_unaligned_out
	sub	$len, 1, $len

.Lcbc_dec_unaligned_out_done:
	fshiftorx %f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	st	$iv0, [$ivp + 0]	! output ivec
	st	$iv0#lo, [$ivp + 4]
	st	$iv1, [$ivp + 8]
	st	$iv1#lo, [$ivp + 12]

	ret
	restore
.type	aes_fx_cbc_encrypt,#function
.size	aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
___
}
# CTR32 mode. Same register layout as CBC; %f14 ($one) holds the
# 32-bit increment applied with fpadd32. The alignment tables and
# the increment constant are emitted at .Linp_align/.Lout_align/.Lone.
{
my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign, $oalign);
my $one = "%f14";

$code.=<<___;
.globl aes_fx_ctr32_encrypt_blocks
.align 32
aes_fx_ctr32_encrypt_blocks:
	save	%sp, -STACK_FRAME-16, %sp
	srln	$len, 0, $len
	and	$inp, 7, $ialign
	andn	$inp, 7, $inp
	brz,pn	$len, .Lctr32_no_data
	sll	$ialign, 3, $ileft

.Lpic:	call	.+8
	add	%o7, .Linp_align - .Lpic, %o7

	ld	[$key + 240], $rounds
	and	$out, 7, $oalign
	ld	[$ivp + 0], $ctr0	! load counter
	andn	$out, 7, $out
	ld	[$ivp + 4], $ctr0#lo
	sll	$oalign, 3, $mask
	ld	[$ivp + 8], $ctr1
	ld	[$ivp + 12], $ctr1#lo
	ldd	[%o7 + 128], $one

	sll	$rounds, 4, $rounds
	add	$rounds, $key, $end
	ldd	[$key + 0], $r0hi	! round[0]
	ldd	[$key + 8], $r0lo

	add	$inp, 16, $inp
	sub	$len, 1, $len
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	mov	16, $inc
	movrz	$len, 0, $inc
	ldd	[$end + 0], $rlhi	! round[last]
	ldd	[$end + 8], $rllo

	ldd	[%o7 + $ileft], $fshift	! shiftleft params
	add	%o7, 64, %o7
	ldd	[$inp - 16], $in0	! load input
	ldd	[$inp - 8], $in1
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1

.Loop_ctr32:
	fxor	$ctr0, $r0hi, %f0	! counter^round[0]
	fxor	$ctr1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lctr32_enc:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lctr32_enc
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	fxor	$in0, $rlhi, %f6	! inp^round[last]
	fxor	$in1, $rllo, %f8

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1
	fpadd32	$ctr1, $one, $ctr1	! increment counter

	fmovd	%f0, %f4
	faesenclx %f2, %f6, %f0
	faesenclx %f4, %f8, %f2

	brnz,pn	$oalign, .Lctr32_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_ctr32
	sub	$len, 1, $len

.Lctr32_no_data:
	ret
	restore

.align	32
.Lctr32_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx %f0, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lctr32_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_ctr32_unaligned_out
	nop

.align	32
.Loop_ctr32_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$ctr0, $r0hi, %f0	! counter^round[0]
	fxor	$ctr1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lctr32_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lctr32_aligned_inp:
	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lctr32_enc_unaligned:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lctr32_enc_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12
	fpadd32	$ctr1, $one, $ctr1	! increment counter

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	fxor	$in0, $rlhi, %f6	! inp^round[last]
	fxor	$in1, $rllo, %f8
	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fmovd	%f0, %f4
	faesenclx %f2, %f6, %f0
	faesenclx %f4, %f8, %f2

	fshiftorx $outhead, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_ctr32_unaligned_out
	sub	$len, 1, $len

.Lctr32_unaligned_out_done:
	fshiftorx %f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	ret
	restore
.type	aes_fx_ctr32_encrypt_blocks,#function
.size	aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks

.align	32
.Linp_align:		! fshiftorx parameters for left shift toward %rs1
	.byte	0, 0, 64,  0,	0, 64,  0, -64
	.byte	0, 0, 56,  8,	0, 56,  8, -56
	.byte	0, 0, 48, 16,	0, 48, 16, -48
	.byte	0, 0, 40, 24,	0, 40, 24, -40
	.byte	0, 0, 32, 32,	0, 32, 32, -32
	.byte	0, 0, 24, 40,	0, 24, 40, -24
	.byte	0, 0, 16, 48,	0, 16, 48, -16
	.byte	0, 0,  8, 56,	0,  8, 56, -8
.Lout_align:		! fshiftorx parameters for right shift toward %rs2
	.byte	0, 0,  0, 64,	0,  0, 64,  0
	.byte	0, 0,  8, 56,	0,  8, 56, -8
	.byte	0, 0, 16, 48,	0, 16, 48, -16
	.byte	0, 0, 24, 40,	0, 24, 40, -24
	.byte	0, 0, 32, 32,	0, 32, 32, -32
	.byte	0, 0, 40, 24,	0, 40, 24, -40
	.byte	0, 0, 48, 16,	0, 48, 16, -48
	.byte	0, 0, 56,  8,	0, 56,  8, -56
.Lone:
	.word	0, 1
.asciz	"AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# Hand-encode a 3-operand VIS instruction (FPop format, op3=0x36 via the
# 0x81b00000 template) as a raw .word, so no VIS-capable assembler is
# needed. Returns the textual instruction unchanged when the mnemonic is
# unknown or an operand is not an %f register.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fpadd32"	=> 0x052,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    # defined() rather than plain truth test, for consistency with
    # unfx/unfx3src and robustness should an opcode value ever be 0.
    if (defined($opf=$visopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# Hand-encode a 3-operand VIS address instruction (alignaddr family)
# operating on integer registers; %g/%o/%l/%i names are mapped to their
# 0..31 encodings via %bias.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    # defined() for consistency with unfx/unfx3src (see unvis).
    if (defined($opf=$visopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# Hand-encode a Fujitsu SPARC64 X AES-FX instruction (op3=0x36). $rs2
# may be either an %f register or an immediate (faeskeyx round index);
# octal/hex immediates are normalized with oct().
sub unfx {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"faesencx"	=> 0x90,
		"faesdecx"	=> 0x91,
		"faesenclx"	=> 0x92,
		"faesdeclx"	=> 0x93,
		"faeskeyx"	=> 0x94	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
	$rs2 = oct($rs2) if ($rs2 =~ /^0/);

	foreach ($rs1,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# Hand-encode the 4-operand fshiftorx instruction (op3=0x37, three
# source registers), used for misaligned load/store fix-up.
sub unfx3src {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"fshiftorx"	=> 0x0b );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# Post-process the generated code: evaluate `...` perlasm expressions,
# expand %fN#lo to the odd half of a double register, and substitute
# hand-encoded .word forms for AES-FX/VIS mnemonics before printing.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unfx($1,$2,$3,$4)
     /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unfx3src($1,$2,$3,$4,$5)
     /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
     /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
     /ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";